Browse Source

改公式,加入连音信心指数,格位语尾减长度

visuddhinanda 5 years ago
parent
commit
7d364578c9
2 changed files with 58 additions and 51 deletions
  1. 47 38
      app/dict/split.php
  2. 11 13
      app/dict/turbo_split.php

+ 47 - 38
app/dict/split.php

@@ -86,58 +86,67 @@ foreach ($arrWords as $currword) {
 		$wordlist = array();
 
 		$needDeep = false;
-		$preSandhi = preSandhi($oneword);
-		if($preSandhi!==$oneword){
-			$word_part["word"] = $preSandhi;
-			$word_part["confidence"] = 1.0;
-			$wordlist[] = $word_part;
-
-			$new = split2($preSandhi);
-			if($new!==$row){
-				$word_part["word"] = $new;
+		//看现有的字典里是不是有
+		$new = split2($oneword);
+		if($new!==$oneword){
+			//现有字典里查到
+			$word_part["word"] = $new;
+			$word_part["confidence"] = $value;
+			$wordlist[] = $word_part;	
+			#再处理一次
+			$new2 = split2($new);
+			if($new2!==$new){
+				$word_part["word"] = $new2;
 				$word_part["confidence"] = $value;
-				$wordlist[] = $word_part;	
-				#再处理一次
-				$new2 = split2($new);
-				if($new2!==$new){
-					$word_part["word"] = $new2;
-					$word_part["confidence"] = $value;
-					$wordlist[] = $word_part;					
-				}	
-				$needDeep = false;
-			}
-			else{
-				$needDeep = true;
-			}
+				$wordlist[] = $word_part;					
+			}	
+			$needDeep = false;
 		}
 		else{
-			$new = split2($oneword);
-			if($new!==$row){
-				$word_part["word"] = $new;
-				$word_part["confidence"] = $value;
-				$wordlist[] = $word_part;	
-				#再处理一次
-				$new2 = split2($new);
-				if($new2!==$new){
-					$word_part["word"] = $new2;
+			//没查到,查连音词
+			$preSandhi = preSandhi($oneword);
+			if($preSandhi!==$oneword){
+				$word_part["word"] = $preSandhi;
+				$word_part["confidence"] = 1.0;
+				$wordlist[] = $word_part;
+
+				//将处理后的连音词再二次拆分
+				$new = split2($preSandhi);
+				if($new!==$row){
+					$word_part["word"] = $new;
 					$word_part["confidence"] = $value;
-					$wordlist[] = $word_part;					
-				}	
-				$needDeep = false;
+					$wordlist[] = $word_part;	
+					#再处理一次
+					$new2 = split2($new);
+					if($new2!==$new){
+						$word_part["word"] = $new2;
+						$word_part["confidence"] = $value;
+						$wordlist[] = $word_part;					
+					}	
+					//如果能处理,就不进行深度拆分了
+					$needDeep = false;
+				}
+				else{
+					//连音词的第一部分没查到,进行深度拆分
+					$needDeep = true;
+				}
 			}
-			$needDeep = true;
+			else{
+				$needDeep = true;
+			}		
 		}
 
+
 		if($needDeep){
-			mySplit2($oneword, 0, false, 0, 0.2, 0.9, true, false);
+			mySplit2($oneword, 0, false, 0, 0.5, 0.95, true, false);
 			if(count($result) < 2){
-				mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, true, true);
+				//mySplit2($oneword, 0, $_express, 0, 0.3, 0.8, true, true);
 			}
 			if (isset($_POST["debug"])) {
 				echo "正切:" . count($result) . "<br>\n";
 			}
 			if(count($result) < 2){
-				mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, false, true);
+				//mySplit2($oneword, 0, $_express, 0, 0.3, 0.8, false, true);
 			}
 			if (isset($_POST["debug"])) {
 				echo "反切:" . count($result) . "<br>\n";

+ 11 - 13
app/dict/turbo_split.php

@@ -243,6 +243,9 @@ function isExsit($word, $adj_len = 0)
 		$word_count = $part["{$word}"][0];
 		$case_len = $part["{$word}"][1];
         if ($word_count > 0) {
+			if (isset($_POST["debug"])) {
+                echo "查到:{$word}:{$word_count}个\n";
+            }
             $isFound = true;
             $count = $word_count + 1;
         }
@@ -265,12 +268,13 @@ function isExsit($word, $adj_len = 0)
         if (isset($confidence["{$word}"])) {
             $cf = $confidence["{$word}"];
         } else {
-            $len = mb_strlen($word, "UTF-8") + $adj_len;
+			//$len = mb_strlen($word, "UTF-8") + $adj_len;
+			$len = mb_strlen($word, "UTF-8") - $case_len;
             $len_correct = 1.2;
             $count2 = 1.1 + pow($count, 1.18);
             $conf_num = pow(1 / $count2, pow(($len - 0.5), $len_correct));
-            //$cf = round(1 / (1 + 640 * $conf_num), 9);
-			$cf = round((1-0.02*$case_len) / (1 + 640 * $conf_num), 9);
+            $cf = round(1 / (1 + 640 * $conf_num), 9);
+			//$cf = round((1-0.02*$case_len) / (1 + 640 * $conf_num), 9);
             $confidence["{$word}"] = $cf;
             if (isset($_POST["debug"])) {
                 echo "信心指数:{$word}:{$cf}\n";
@@ -354,15 +358,12 @@ function mySplit2($strWord, $deep = 0, $express = false, $adj_len = 0, $c_thresh
             for ($i = $len; $i > 1; $i--) {
                 foreach ($sandhi as $key => $row) {
                     if ($sandhi_advance == false && $row["advance"] == true) {
-                        continue;
+                        //continue;
                     }
                     if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") == $row["c"]) {
                         $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
                         $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
-                        $confidence = isExsit($str1, $adj_len);
-                        if ($row["advance"] == true) {
-                            $confidence = $confidence * 0.99;
-                        }
+                        $confidence = isExsit($str1, $adj_len)*$row["cf"];
                         if ($confidence > $c_threshhold) {
                             $output[] = array($str1, $str2, $confidence, $row["adj_len"]);
                             if (isset($_POST["debug"])) {
@@ -381,15 +382,12 @@ function mySplit2($strWord, $deep = 0, $express = false, $adj_len = 0, $c_thresh
             for ($i = 1; $i < $len - 1; $i++) {
                 foreach ($sandhi as $key => $row) {
                     if ($sandhi_advance == false && $row["advance"] == true) {
-                        continue;
+                        //continue;
                     }
                     if (mb_substr($strWord, $i, $row["len"], "UTF-8") == $row["c"]) {
                         $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
                         $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
-                        $confidence = isExsit($str2, $adj_len);
-                        if ($row["advance"] == true) {
-                            $confidence = $confidence * 0.99;
-                        }
+                        $confidence = isExsit($str2, $adj_len)*$row["cf"];
                         if ($confidence > $c_threshhold) {
                             $output[] = array($str2, $str1, $confidence, $row["adj_len"]);
                             if (isset($_POST["debug"])) {