~a-a~ (step 1: split the diphthongs first)
step 2 : apply the sandhi rules in $sandhi to every part

algorithm:
f(word){
    1. cut one letter from the end of the word, following a sandhi rule in array($sandhi)
    2. look up the first part (what remains after the cut)
        if there is a result with confidence value > 0.8
            - get the confidence value of that part
            - process the remaining part in the same way
            - f(stack.first element)
        else
            apply another sandhi rule
            back to 1
}
this is a recursion, depth=16
*/
require_once "../dict/turbo_split.php";

global $auto_split_times;

// Check input: the word(s) to split arrive in the POSTed "word" field.
if (isset($_POST["word"])) {
    // Strip surrounding whitespace and lower-case the text (UTF-8 aware).
    $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
    if (trim($input_word) == "") {
        echo "Empty";
        exit;
    }
    // Newline-separated input allows several words to be split in one request.
    $arrWords = str_getcsv($input_word, "\n");
    // No word posted: the else branch below leaves PHP mode.
    // NOTE(review): the markup that follows `?>` is not fully visible in this chunk.
} else {
    ?>
"; }
// Pre-processing: split the diphthongs apart.
// step 1 : split at diphthong . ~aa~ -> ~a-a~
// Each hyphen-separated piece is then processed on its own.
$arrword = split_diphthong($currword);
foreach ($arrword as $oneword) {
    // Global variable: output container filled by the recursive splitter.
    $result = array();
    # Output results (output to JSON).
    $wordlist = array();
    $needDeep = false;

    // First see whether the existing dictionary already knows a split.
    $new = split2($oneword);
    if ($new !== $oneword) {
        // Found in the existing dictionary.
        $word_part["word"] = $new;
        $word_part["confidence"] = 1.0;
        $wordlist[] = $word_part;
        # Run the dictionary split once more on the result.
        $new2 = split2($new);
        if ($new2 !== $new) {
            $word_part["word"] = $new2;
            $word_part["confidence"] = 1.0;
            $wordlist[] = $word_part;
        }
        $needDeep = false;
    } else {
        // Not found: try the sandhi-word lookup.
        $preSandhi = preSandhi($oneword);
        if ($preSandhi !== $oneword) {
            $word_part["word"] = $preSandhi;
            $word_part["confidence"] = 1.0;
            $wordlist[] = $word_part;

            // Split the sandhi-processed word a second time.
            $new = split2($preSandhi);
            // Compare against the string handed to split2(), as the branch
            // above does. The previous comparison used $row, which is never
            // assigned in the visible code, so it was always true and the
            // deep-split fallback below could never be reached.
            if ($new !== $preSandhi) {
                // $value is not assigned anywhere in the visible code. Keep it
                // when something outside this chunk defines it; otherwise use
                // 1.0, the confidence given to the other dictionary hits.
                $sandhiConfidence = isset($value) ? $value : 1.0;

                $word_part["word"] = $new;
                $word_part["confidence"] = $sandhiConfidence;
                $wordlist[] = $word_part;
                # Run the dictionary split once more on the result.
                $new2 = split2($new);
                if ($new2 !== $new) {
                    $word_part["word"] = $new2;
                    $word_part["confidence"] = $sandhiConfidence;
                    $wordlist[] = $word_part;
                }
                // The word could be handled, so skip the deep split.
                $needDeep = false;
            } else {
                // The first part of the sandhi word was not found: deep split.
                $needDeep = true;
            }
        } else {
            $needDeep = true;
        }
    }

    if ($needDeep) {
        // Words longer than 35 characters are passed to mySplit2() with
        // different arguments than shorter ones.
        // NOTE(review): mySplit2() is defined outside this chunk; confirm the
        // meaning of each positional argument there.
        if (mb_strlen($oneword, "UTF-8") > 35) {
            mySplit2($oneword, 0, true, 0, 0.9, 0.95, true, false);
        } else {
            mySplit2($oneword, 0, false, 0, 0.5, 0.95, true, false);
        }
        // Nothing collected in $result: run a second pass with other arguments.
        // NOTE(review): $_express is not assigned in the visible code.
        if (count($result) < 1) {
            mySplit2($oneword, 0, $_express, 0, 0.4, 0.8, true, true);
        }
        if (isset($_POST["debug"])) {
            echo "正切:" . count($result) . "";
// Human-readable dump of $allword.
// NOTE(review): $allword is populated outside the visible code, and the block
// closed by the brace below opens outside this chunk — confirm against the full file.
print_r($allword);
echo "";
}
// Emit $allword as JSON; JSON_UNESCAPED_UNICODE keeps non-ASCII characters
// literal instead of \uXXXX escapes.
echo json_encode($allword, JSON_UNESCAPED_UNICODE);
?>