~a-a~
第一步:先切开双元音
step 2: apply the sandhi rules to every part
第二步:用$sandhi的方法切分(套用连音规则)
algorithm:
算法:
f(word){
1. cut one letter from the end of the word according to a sandhi rule in array($sandhi)
1. 从单词尾部切去一个字母
2. look up the first part
2. 查询剩余部分
if confidence value>0.8
如果有结果
- get the confidence value
获取该部分的信心指数
- process the remaining part in the same way
用同样的方法处理剩余部分
- f(stack.first element)
else
apply another sandhi rule
back to 1
}
this is a recursive algorithm, depth=16
此为递归算法,深度=16
*/
require_once "../dict/turbo_split.php";
// Check input: the script expects the text to split in the POST field "word".
if (isset($_POST["word"])) {
// Trim surrounding whitespace and lowercase the text as UTF-8.
$input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
// Nothing left after trimming: report it and stop.
if (trim($input_word) == "") {
echo "Empty";
exit;
}
// Batch splitting is supported: the input is cut at newlines, one entry per line.
// Note that str_getcsv also applies CSV quoting rules to each entry.
$arrWords = str_getcsv($input_word, "\n");
// NOTE(review): the else branch below leaves PHP mode and what follows looks
// truncated in this copy of the file - confirm against the original source.
} else {
?>
";
}
// Pre-processing (step 1): split the word at diphthongs, e.g. ~aa~ -> ~a-a~,
// then handle each hyphen-separated part on its own.
$arrword = split_diphthong($currword);
foreach ($arrword as $oneword) {
    // Global variable: output container of the recursive splitter (mySplit2).
    // It is reset for every part. Judging by how it is sorted and read below,
    // it presumably maps candidate split => confidence value.
    $result = array();
    //$noSandhi = removeSandhi($oneword);

    // Three attempts, each one only made when the previous attempts produced
    // fewer than two candidates.
    // NOTE(review): the meaning of mySplit2's positional arguments is not
    // visible here; the debug labels suggest the 7th argument selects the
    // forward (true) or reverse (false) direction - confirm in turbo_split.php.
    mySplit2($oneword, 0, false, 0, 0.2, 0.9, true, false);
    if (count($result) < 2) {
        mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, true, true);
    }
    if (isset($_POST["debug"])) {
        echo "正切:" . count($result) . "
\n";
    }
    if (count($result) < 2) {
        mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, false, true);
    }
    if (isset($_POST["debug"])) {
        echo "反切:" . count($result) . "
\n";
    }
    /*
    if (count($result) < 5) {
    #sandhi advance
    mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false, true);
    if (isset($_POST["debug"])) {
    echo "反切:" . count($result) . "\n";
    }
    }
    if (count($result) < 5) {
    #reverse
    mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false);
    }
    if (count($result) < 5) {
    #forward
    mySplit2($oneword, 0, $_express, 0, 0.8, 0, true);
    }
    if (count($result) < 5) {
    #reverse
    mySplit2($oneword, 0, $_express, 0, 0.8, 0, false);
    }
    */

    // Sort by confidence value, highest first (keys are preserved).
    arsort($result);

    // Build the JSON-ready candidate list from the best $iMax results.
    $wordlist = array();
    $iMax = 5;
    $iCount = 0;
    foreach ($result as $row => $value) {
        $iCount++;
        $wordlist[] = array("word" => $row, "confidence" => $value);

        // Post-processing: further split long words that have no meaning.
        // split2() is applied at most twice; each pass that actually changes
        // the candidate is appended with the same confidence value.
        $candidate = $row;
        for ($pass = 0; $pass < 2; $pass++) {
            $refined = split2($candidate);
            if ($refined === $candidate) {
                break;
            }
            $wordlist[] = array("word" => $refined, "confidence" => $value);
            $candidate = $refined;
        }

        if ($iCount >= $iMax) {
            break;
        }
    }
    $output[] = $wordlist;

    // Debug listing. The whole listing is skipped when debugging is off, since
    // the loop has no other effect.
    if (isset($_POST["debug"])) {
        echo "{$oneword}
";
        echo "" . count($result) . "
";
        $iCount = 0;
        foreach ($result as $row => $value) {
            // Same cut-off as before: the check runs before the increment,
            // so up to 101 rows are printed.
            if ($iCount > 100) {
                break;
            }
            $iCount++;
            echo $row . "-[" . $value . "]
";
        }
    }
    /*
    Post-processing note:
    -ssāpi=-[ssa]-api
    */
}
// Stop the timer for the current word and append its record to $allword.
// The keys are assigned in the order data, time, second so the JSON layout
// stays the same.
$t2 = microtime_float();
$one_split["data"] = $output;
$one_split["time"] = $auto_split_times;
$one_split["second"] = $t2 - $t1;
$allword[] = $one_split;

// Debug mode: print the lookup counter and the elapsed time for this word.
if (isset($_POST["debug"])) {
    echo "", "
查询【{$auto_split_times}】次", "time:", $one_split["second"], "
";
}
}
// Debug mode: dump the collected structure in human-readable form first.
if (isset($_POST["debug"])) {
    echo "", print_r($allword, true), "";
}
// The JSON document is always written, with non-ASCII characters left unescaped.
print json_encode($allword, JSON_UNESCAPED_UNICODE);
?>