iapt-platform
/
mint
spiegel van https://github.com/iapt-platform/mint.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
							<?php
//强力拆分复合词 (aggressive compound-word splitting)
/*
function: split a compound word
step 1 : split at diphthongs, e.g. ~aa~ -> ~a-a~
第一步：先切开双元音
step 2 : apply the sandhi rules (array $sandhi) to every part
第二步：用$sandhi的方法切分（套用连音规则）
algorithm:
算法：
f(word){
1. cut one letter from the end of the word according to a sandhi rule in array($sandhi)
1. 从单词尾部切去一个字母
2. look up the first (remaining) part
2. 查询剩余部分
if confidence value > 0.8
如果有结果
- get the confidence value
获取该部分的信心指数
- process the remaining part in the same way
用同样的方法处理剩余部分
- f(stack.first element)
else
apply another sandhi rule
go back to 1
}
This is a recursive algorithm; recursion depth = 16
此为递归算法，深度=16
 */
require_once "../dict/turbo_split.php";
global $auto_split_times;
// Check input.
// When a "word" field was posted, normalise it and build $arrWords (one entry
// per input line, so several words can be split in one request). Otherwise
// print the debug-only HTML form and stop.
if (isset($_POST["word"])) {
    $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
    if ($input_word === "") {
        echo "Empty";
        exit;
    }
    // Batch mode: one word per line. Split on every newline convention
    // (browsers submit textarea content with CRLF line endings) and trim each
    // word so that no stray "\r" or padding reaches the dictionary lookup.
    // A plain line split is used instead of str_getcsv() because CSV
    // quote/escape handling has no meaning for a list of words.
    $arrWords = array_map('trim', preg_split('/\r\n|\r|\n/', $input_word));
} else {
    ?>
<!--debug only-->
<form action="split.php" method="post">
Words: <br>
<textarea type="text" name="word" style="width:50em;height:20em;"></textarea><br>
<input name="debug" type="hidden" />批量查询，单词之间用换行分隔。 input word. between two words insert 'enter'
<div>
<input type="checkbox" name = "express" checked /> 快速搜索（遇到第一个连音规则成功就返回） return when get first result
</div>
<input type="submit">
</form>

<?php
return;
}

// Express mode is on only when the "express" field was posted with the value
// "on" (what a checked checkbox without a value attribute submits); in every
// other case it is off.
$_express = isset($_POST["express"]) && $_POST["express"] === "on";

//main


// Main loop: for every input word build a list of candidate splits and
// collect them, together with timing information, in $allword.
$allword = array();

// True when the request asks for human-readable debug output (HTML).
$isDebug = isset($_POST["debug"]);

// Escapes a value for safe inclusion in the debug HTML output; the words come
// straight from the request and must not be echoed raw.
$escapeHtml = function ($text) {
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

// Refines $word with split2() up to two times and returns the refinements as
// a list of ["word" => ..., "confidence" => ...] entries (empty when split2()
// returns $word unchanged). Every entry gets the given $confidence.
$collectDictSplits = function ($word, $confidence) {
    $parts = array();
    $first = split2($word);
    if ($first !== $word) {
        $parts[] = array("word" => $first, "confidence" => $confidence);
        // Run split2() once more on the refined form.
        $second = split2($first);
        if ($second !== $first) {
            $parts[] = array("word" => $second, "confidence" => $confidence);
        }
    }
    return $parts;
};

foreach ($arrWords as $currword) {
    $t1 = microtime_float();
    $output = array();
    if ($isDebug) {
        echo "Look up：" . $escapeHtml($currword) . "<br>";
    }

    // Pre-processing, step 1: split at diphthongs (~aa~ -> ~a-a~) and handle
    // each resulting part separately.
    $arrword = split_diphthong($currword);

    foreach ($arrword as $oneword) {
        // Global variable: output container of the recursive splitter
        // (filled by mySplit2(), keyed by split, value = confidence).
        $result = array();
        // Candidate splits of this part; this is what ends up in the JSON.
        $wordlist = array();
        $needDeep = false;

        // First check whether the existing dictionary already splits the word.
        $dictParts = $collectDictSplits($oneword, 1.0);
        if (count($dictParts) > 0) {
            $wordlist = array_merge($wordlist, $dictParts);
        } else {
            // Not found: try to resolve the word as a sandhi form.
            $preSandhi = preSandhi($oneword);
            if ($preSandhi !== $oneword) {
                $wordlist[] = array("word" => $preSandhi, "confidence" => 1.0);

                // Split the resolved sandhi form a second time. The result is
                // compared against $preSandhi and rated 1.0 like the other
                // dictionary hits; the previous code used $row / $value here,
                // which are only defined by the result loop further down.
                $sandhiParts = $collectDictSplits($preSandhi, 1.0);
                if (count($sandhiParts) > 0) {
                    // It could be handled, so no deep split is needed.
                    $wordlist = array_merge($wordlist, $sandhiParts);
                } else {
                    // The sandhi form could not be split further: deep split.
                    $needDeep = true;
                }
            } else {
                $needDeep = true;
            }
        }

        if ($needDeep) {
            // NOTE(review): the meaning of the mySplit2() arguments is not
            // visible in this file. The third one is where $_express is
            // passed, the fifth/sixth look like confidence thresholds and the
            // seventh appears to select the cutting direction (see the debug
            // labels below) - confirm against turbo_split.php.
            if (mb_strlen($oneword, "UTF-8") > 35) {
                mySplit2($oneword, 0, true, 0, 0.9, 0.95, true, false);
            } else {
                mySplit2($oneword, 0, false, 0, 0.5, 0.95, true, false);
            }

            // Retry with lower thresholds while too few results were found.
            if (count($result) < 1) {
                mySplit2($oneword, 0, $_express, 0, 0.4, 0.8, true, true);
            }
            if ($isDebug) {
                echo "正切：" . count($result) . "<br>\n";
            }
            if (count($result) < 2) {
                mySplit2($oneword, 0, $_express, 0, 0.4, 0.8, false, true);
            }
            if ($isDebug) {
                echo "反切：" . count($result) . "<br>\n";
            }

            // Sort by confidence, highest first, keeping the keys.
            arsort($result);

            // Keep the five best candidates.
            foreach (array_slice($result, 0, 5, true) as $row => $value) {
                $wordlist[] = array("word" => $row, "confidence" => $value);
                // Post-processing: split remaining long parts further.
                $wordlist = array_merge($wordlist, $collectDictSplits($row, $value));
            }
        }

        $output[] = $wordlist;

        if ($isDebug) {
            echo "<h2>" . $escapeHtml($oneword) . "</h2>";
            echo "<h4>" . count($result) . "</h4>";
            // List at most the first eleven raw results.
            foreach (array_slice($result, 0, 11, true) as $row => $value) {
                echo $escapeHtml($row) . "-[" . $value . "]<br>";
            }
        }

        /*
    Post-processing (not implemented)
    -ssāpi=-[ssa]-api
     */
    }
    $t2 = microtime_float();
    // NOTE(review): $auto_split_times is not reset in this file; presumably it
    // is a lookup counter maintained by turbo_split.php - confirm whether it
    // is per word or cumulative over the whole batch.
    $allword[] = array(
        "data" => $output,
        "time" => $auto_split_times,
        "second" => $t2 - $t1,
    );

    if ($isDebug) {
        echo "<div>";
        echo "<br>查询【{$auto_split_times}】次";
        echo "time:" . ($t2 - $t1);
        echo "</div>";
    }
}

// Debug mode: print a readable dump of the result first. The dump contains
// words taken from the request, so it is HTML-escaped before being echoed.
if (isset($_POST["debug"])) {
    echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
    echo htmlspecialchars(print_r($allword, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo "</pre>";
}
// Always emit the result as JSON. No Content-Type header is set in this file,
// so the response may be rendered as HTML; JSON_HEX_TAG / JSON_HEX_AMP encode
// <, > and & as \uXXXX escapes, which decode to exactly the same values but
// cannot be interpreted as markup.
echo json_encode($allword, JSON_UNESCAPED_UNICODE | JSON_HEX_TAG | JSON_HEX_AMP);

?>