visuddhinanda 5 лет назад
Родитель
Сommit
9d78fa81a5
2 измененных файлов с 184 добавлено и 83 удалено
  1. 101 55
      app/dict/split.php
  2. 83 28
      app/dict/turbo_split.php

+ 101 - 55
app/dict/split.php

@@ -82,58 +82,36 @@ foreach ($arrWords as $currword) {
 	
     foreach ($arrword as $oneword) {
 		$result = array(); //全局变量,递归程序的输出容器
-		//$noSandhi = removeSandhi($oneword);
-
-        mySplit2($oneword, 0, false, 0, 0.2, 0.9, true, false);
-		if(count($result) < 2){
-			mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, true, true);
-		}
-        if (isset($_POST["debug"])) {
-            echo "正切:" . count($result) . "<br>\n";
-		}
-		if(count($result) < 2){
-			mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, false, true);
-		}
-        if (isset($_POST["debug"])) {
-            echo "反切:" . count($result) . "<br>\n";
-        }
-        /*
-        if (count($result) < 5) {
-        #sandhi advance
-        mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false, true);
-        if (isset($_POST["debug"])) {
-        echo "反切:" . count($result) . "\n";
-        }
-        }
-        if (count($result) < 5) {
-        #反向
-        mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false);
-        }
-        if (count($result) < 5) {
-        #正向
-        mySplit2($oneword, 0, $_express, 0, 0.8, 0, true);
-        }
-        if (count($result) < 5) {
-        #反向
-        mySplit2($oneword, 0, $_express, 0, 0.8, 0, false);
-        }
-         */
-        arsort($result); //按信心指数排序
-
-        #输出结果 ouput to json
-        $wordlist = array();
-        $iMax = 5;
-        $iCount = 0;
-        foreach ($result as $row => $value) {
-            $iCount++;
-			$word_part = array();
-			
-            $word_part["word"] = $row;
-			$word_part["confidence"] = $value;
+		#输出结果 output to json
+		$wordlist = array();
+
+		$needDeep = false;
+		$preSandhi = preSandhi($oneword);
+		if($preSandhi!==$oneword){
+			$word_part["word"] = $preSandhi;
+			$word_part["confidence"] = 1.0;
 			$wordlist[] = $word_part;
 
-			//后处理 进一步切分没有意思的长词
-			$new = split2($row);
+			$new = split2($preSandhi);
+			if($new!==$row){
+				$word_part["word"] = $new;
+				$word_part["confidence"] = $value;
+				$wordlist[] = $word_part;	
+				#再处理一次
+				$new2 = split2($new);
+				if($new2!==$new){
+					$word_part["word"] = $new2;
+					$word_part["confidence"] = $value;
+					$wordlist[] = $word_part;					
+				}	
+				$needDeep = false;
+			}
+			else{
+				$needDeep = true;
+			}
+		}
+		else{
+			$new = split2($oneword);
 			if($new!==$row){
 				$word_part["word"] = $new;
 				$word_part["confidence"] = $value;
@@ -144,16 +122,84 @@ foreach ($arrWords as $currword) {
 					$word_part["word"] = $new2;
 					$word_part["confidence"] = $value;
 					$wordlist[] = $word_part;					
-				}				
+				}	
+				$needDeep = false;
+			}
+			$needDeep = true;
+		}
+
+		if($needDeep){
+			mySplit2($oneword, 0, false, 0, 0.2, 0.9, true, false);
+			if(count($result) < 2){
+				mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, true, true);
+			}
+			if (isset($_POST["debug"])) {
+				echo "正切:" . count($result) . "<br>\n";
+			}
+			if(count($result) < 2){
+				mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, false, true);
+			}
+			if (isset($_POST["debug"])) {
+				echo "反切:" . count($result) . "<br>\n";
+			}
+			/*
+			if (count($result) < 5) {
+			#sandhi advance
+			mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false, true);
+			if (isset($_POST["debug"])) {
+			echo "反切:" . count($result) . "\n";
+			}
 			}
+			if (count($result) < 5) {
+			#反向
+			mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false);
+			}
+			if (count($result) < 5) {
+			#正向
+			mySplit2($oneword, 0, $_express, 0, 0.8, 0, true);
+			}
+			if (count($result) < 5) {
+			#反向
+			mySplit2($oneword, 0, $_express, 0, 0.8, 0, false);
+			}
+			*/
+			arsort($result); //按信心指数排序
 
 
+			$iMax = 5;
+			$iCount = 0;
+			foreach ($result as $row => $value) {
+				$iCount++;
+				$word_part = array();
+				
+				$word_part["word"] = $row;
+				$word_part["confidence"] = $value;
+				$wordlist[] = $word_part;
 
-            if ($iCount >= $iMax) {
-                break;
-            }
+				//后处理 进一步切分没有意思的长词
+				$new = split2($row);
+				if($new!==$row){
+					$word_part["word"] = $new;
+					$word_part["confidence"] = $value;
+					$wordlist[] = $word_part;	
+					#再处理一次
+					$new2 = split2($new);
+					if($new2!==$new){
+						$word_part["word"] = $new2;
+						$word_part["confidence"] = $value;
+						$wordlist[] = $word_part;					
+					}				
+				}
+
+
+
+				if ($iCount >= $iMax) {
+					break;
+				}
+
+			}			
+		}
 
-        }
         $output[] = $wordlist;
 
         if (isset($_POST["debug"])) {

+ 83 - 28
app/dict/turbo_split.php

@@ -109,34 +109,36 @@ $sandhi[] = array("a" => "ṃ", "b" => "cet", "c" => "ñcet", "len" => 4, "adj_l
 $sandhi[] = array("a" => "ṃ", "b" => "ev", "c" => "mev", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
 
 /*
-$sandhi[] = array("a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ā", "b" => "eva", "c" => "āyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "u", "b" => "eva", "c" => "veva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-
-$sandhi[] = array("a" => "a", "b" => "eva", "c" => "evā", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "e", "b" => "eva", "c" => "evā", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "i", "b" => "eva", "c" => "yevā", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "yevā", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "iyevā", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "īyevā", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "o", "b" => "eva", "c" => "ovā", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-
-$sandhi[] = array("a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
-$sandhi[] = array("a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
- */
+$sandhi2[] = array("a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0);
+$sandhi2[] = array("a" => "ṃ", "b" => "hi", "c" => "ñhi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0);
+
+$sandhi2[] = array("a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+$sandhi2[] = array("a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+$sandhi2[] = array("a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+$sandhi2[] = array("a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+$sandhi2[] = array("a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+$sandhi2[] = array("a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999);
+
+$sandhi2[] = array("a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "u", "b" => "eva", "c" => "veva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+
+$sandhi2[] = array("a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+$sandhi2[] = array("a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999);
+
+*/
 $sandhi[] = array("a" => "a", "b" => "a", "c" => "a", "len" => 1, "adj_len" => -1, "advance" => true,"cf"=>0.99);
 $sandhi[] = array("a" => "ī", "b" => "", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => true,"cf"=>0.9);
 
@@ -562,4 +564,57 @@ function split2($word){
 
 	}
 	return implode("+",$newword);
+}
+
+/**
+ * Split trailing particles (ca, hi, iti, eva, api) off the end of a word
+ * by reversing a fixed table of ending rewrites.
+ *
+ * The word is scanned repeatedly: on each pass the first rule whose surface
+ * ending matches the end of the current stem is applied, the ending is
+ * replaced by the rule's stem-final text, and the rule's particle is
+ * prepended to the list of split-off parts. Scanning stops when no rule
+ * matches.
+ *
+ * @param string $word UTF-8 word to analyse.
+ * @return string The stem followed by the split-off particles, joined with
+ *                "+" (e.g. "buddhaṃ+ca"); the unchanged word when no rule
+ *                applies.
+ */
+function preSandhi($word){
+	// Each rule is array(a, b, c):
+	//   c = ending looked for at the end of the word,
+	//   a = text appended to the remaining stem in place of c,
+	//   b = particle emitted as a separate part.
+	// Order matters: the first matching rule wins, so longer endings
+	// (iyeva, yeva, meva, veva) are listed before the bare "eva".
+	static $rules = array(
+		array("ṃ", "ca", "ñca"),
+		array("ṃ", "hi", "ñhi"),
+
+		array("a", "iti", "āti"),
+		array("e", "iti", "eti"),
+		array("i", "iti", "īti"),
+		array("o", "iti", "oti"),
+		array("u", "iti", "ūti"),
+		array("ṃ", "iti", "nti"),
+
+		array("ī", "eva", "iyeva"),
+		array("ī", "eva", "īyeva"),
+		array("i", "eva", "yeva"),
+		array("o", "eva", "ova"),
+		array("ṃ", "eva", "meva"),
+		array("u", "eva", "veva"),
+		array("a", "eva", "eva"),
+		array("e", "eva", "eva"),
+
+		array("ā", "api", "āpi"),
+		array("a", "api", "āpi"),
+		array("e", "api", "epi"),
+		array("ī", "api", "īpi"),
+		array("i", "api", "īpi"),
+		array("o", "api", "opi"),
+		array("ū", "api", "ūpi"),
+		array("u", "api", "ūpi"),
+		array("u", "api", "upi"),
+		array("ṃ", "api", "mpi"),
+	);
+
+	$stem = $word;
+	$particles = array();
+	do {
+		$isFound = false;
+		$stemLen = mb_strlen($stem, "UTF-8");
+		foreach ($rules as $rule) {
+			list($stemFinal, $particle, $ending) = $rule;
+			// Derive the length from the ending itself so it can never
+			// disagree with a hand-maintained "len" value.
+			$len = mb_strlen($ending, "UTF-8");
+			if ($stemLen <= $len) {
+				// Never let a rule consume the whole word: a bare "eva"
+				// must stay "eva" instead of becoming "a+eva".
+				continue;
+			}
+			if (mb_substr($stem, -$len, null, "UTF-8") === $ending) {
+				$stem = mb_substr($stem, 0, $stemLen - $len, "UTF-8") . $stemFinal;
+				array_unshift($particles, $particle);
+				$isFound = true;
+				// Restart from the first rule on the shortened stem.
+				break;
+			}
+		}
+	} while ($isFound);
+
+	array_unshift($particles, $stem);
+	return implode("+", $particles);
 }