Words:
"; $arrword = str_getcsv($word,"-"); $t1=microtime_float(); foreach($arrword as $oneword){ $result = array(); if(mb_strlen($oneword,"UTF-8")<30){ mySplit2($oneword,0); } else{ mySplit2($oneword,0,true); } arsort($result); echo "

{$oneword}

"; echo "

".count($result)."

"; $iCount=0; foreach($result as $row=>$value){ if($iCount>10){ break; } $iCount++; $level=$value*90; if(isset($_POST["debug"])){ echo $row."-[".$value."]
"; } else{ echo " "; echo $row."-[".round($level)."] "; echo ""; echo "
"; } } /* 后处理 -ssāpi=-[ssa]-api */ echo "-"; } echo "
查询【{$auto_split_times}】次"; $t2 = microtime_float(); echo "time:".($t2-$t1); }

/**
 * Join two values with a "+" separator.
 *
 * @param mixed $v1 Left-hand value.
 * @param mixed $v2 Right-hand value.
 * @return string   "$v1+$v2"
 */
function myfunction($v1, $v2)
{
    return $v1 . "+" . $v2;
}

/**
 * Current Unix timestamp with microsecond resolution.
 *
 * @return float Seconds since the epoch, including the fractional part.
 */
function microtime_float()
{
    return microtime(true);
}

/**
 * Look up whether a word appears in the existing dictionary and return a
 * confidence index for it.
 *
 * Reads the global $part (word => occurrence count) and memoises computed
 * scores in the global $confidence. Every call increments the global
 * $auto_split_times counter.
 *
 * NOTE(review): the memo is keyed by the word only, so a score computed with
 * one $adj_len is reused for later calls with a different $adj_len. All
 * active callers pass 0, so this is harmless today.
 *
 * @param string $word    Candidate word (possibly wrapped in "[...]").
 * @param int    $adj_len Correction added to the word length before scoring.
 * @return float|int      Confidence in (0, 1] when the word is known, -1 otherwise.
 */
function isExsit($word, $adj_len = 0)
{
    global $auto_split_times;
    global $part;
    global $confidence;

    $auto_split_times++;
    // echo "querying: {$word}";

    $key = "{$word}";
    if (!isset($part[$key])) {
        return -1;
    }
    if (isset($confidence[$key])) {
        return $confidence[$key];
    }

    $count = $part[$key] + 1;
    $len = mb_strlen($word, "UTF-8") + $adj_len;
    $len_correct = 1.2;
    $count2 = 1.1 + pow($count, 1.18);
    // Clamp at 0: a negative base with a fractional exponent yields NAN,
    // which would then be cached and compared as "not found" everywhere.
    $effective_len = max(0, $len - 1);
    $conf_num = pow(1 / $count2, pow($effective_len, $len_correct));
    $cf = round(1 / (1 + 640 * $conf_num), 9);
    $confidence[$key] = $cf;

    return $cf;
}

/**
 * Recursively split a (compound) word into dictionary parts, undoing sandhi
 * contractions at each candidate split point.
 *
 * Each complete segmentation is written to the global $result as
 * "part+part+...+last" => confidence. The global $path holds the parts chosen
 * so far, indexed by recursion depth, as array(part, confidence).
 * When $_POST["debug"] is set, per-part confidences are embedded in the keys.
 *
 * @param string $strWord Remaining text to split.
 * @param int    $deep    Current recursion depth (number of parts already in $path).
 * @param bool   $turbo   When true, stop trying further sandhi rules at a split
 *                        position as soon as one rule yields an accepted head.
 * @param int    $adj_len Length correction forwarded to isExsit() for heads.
 * @return void
 */
function mySplit2($strWord, $deep, $turbo = false, $adj_len = 0)
{
    global $path;
    global $result;

    // Sandhi rules: a head ending in "a" followed by a tail starting with "b"
    // contracts to "c" in the surface form; "len" is the length of "c".
    // The table is constant, so it is built only once rather than per call.
    static $sandhi = null;
    if ($sandhi === null) {
        $rules = array(
            array("", "", "", 0),
            array("a", "a", "ā", 1),
            array("ā", "ā", "ā", 1),
            array("a", "ā", "ā", 1),
            array("a", "e", "e", 1),
            array("a", "i", "i", 1),
            array("a", "o", "o", 1),
            array("a", "u", "o", 1),
            array("u", "a", "o", 1),
            array("u", "u", "ū", 1),
            array("a", "u", "u", 1),
            array("a", "ī", "ī", 1),
            array("a", "ū", "ū", 1),
            array("a", "i", "e", 1),
            array("i", "i", "ī", 1),
            array("i", "e", "e", 1),
            array("i", "a", "ya", 2),
            array("a", "atth", "atth", 4),
            array("taṃ", "n", "tann", 4),
            array("[ṃ]", "api", "mpi", 3),
            array("[ṃ]", "eva", "meva", 4),
            array("[o]", "iva", "ova", 3),
            array("a", "ādi", "ādi", 3),
            array("a[ānaṃ]", "a", "ānama", 5),
            array("a", "iti", "āti", 3),
            array("[ṃ]", "ca", "ñca", 3),
            array("[ṃ]", "iti", "nti", 3),
            array("[ṃ]", "a", "ma", 2),
            array("[ṃ]", "ā", "mā", 2),
            array("[ṃ]", "u", "mu", 2),
            array("[ṃ]", "h", "ñh", 2),
            array("ā", "[ṃ]", "am", 2),
            array("ī", "[ṃ]", "im", 2),
            array("ati", "tabba", "atabba", 6),
            array("ati", "tabba", "itabba", 6),
            array("iti", "a", "icca", 4),
        );
        /*
         * Disabled rules, kept for reference (a, b, c, len; adj_len 0):
         *   u[ūnaṃ], a, ūnama, 5     ī[īnaṃ], a, īnama, 5
         *   ā, iti, āti, 3     a, iti, āti, 3     e, iti, eti, 3
         *   ī, iti, īti, 3     i, iti, īti, 3     o, iti, oti, 3
         *   ū, iti, ūti, 3     u, iti, ūti, 3     ṃ, iti, nti, 3
         *   ṃ, ca, ñca, 3
         *   a, eva, eva, 3     ā, eva, āyeva, 5   e, eva, eva, 3
         *   i, eva, yeva, 4    ī, eva, iyeva, 5   ī, eva, īyeva, 5
         *   o, eva, ova, 3     u, eva, veva, 3    a, eva, evā, 3
         *   e, eva, evā, 3     i, eva, yevā, 4    ī, eva, yevā, 4
         *   ī, eva, iyevā, 4   ī, eva, īyevā, 4   o, eva, ovā, 4
         *   ā, api, āpi, 3     a, api, āpi, 3     e, api, epi, 3
         *   ī, api, īpi, 3     i, api, īpi, 3     o, api, opi, 3
         *   ū, api, ūpi, 3     u, api, ūpi, 3     u, api, upi, 3
         *   ṃ, api, mpi, 3
         * Disabled "advance" rules:
         *   a, a, a, 1 (adj_len -1, advance true)
         *   ī, "", i, 1 (adj_len 0, advance true)
         */
        $sandhi = array();
        foreach ($rules as $rule) {
            $sandhi[] = array(
                "a" => $rule[0],
                "b" => $rule[1],
                "c" => $rule[2],
                "len" => $rule[3],
                "adj_len" => 0,
                "advance" => false,
            );
        }
    }

    $debug = isset($_POST["debug"]);

    // Depth limit: stop recursing and record the remainder as unresolved.
    // Note this branch joins parts with "-" whereas the others use "+".
    if ($deep >= 16) {
        $word = "";
        $cf = 1.0;
        for ($i = 0; $i < $deep; $i++) {
            $word .= $path[$i][0];
            $word .= $debug ? "(" . $path[$i][1] . ")-" : "-";
            $cf *= $path[$i][1];
        }
        // Penalty grows with the cube of the unresolved remainder's length.
        $len = pow(mb_strlen($strWord, "UTF-8"), 3);
        $cf += (0 - $len) / ($len + 150);
        $word .= "{$strWord}(0)";
        $result[$word] = $cf;
        return;
    }

    // Candidates: array(head, tail, head confidence[, adj_len for the tail]).
    $output = array();

    // Direct hit: the whole remainder is a dictionary word, either as-is or
    // in its bracketed form.
    $score = isExsit($strWord, $adj_len);
    if ($score >= 0) {
        $output[] = array($strWord, "", $score);
    } else {
        $score = isExsit("[" . $strWord . "]");
        if ($score >= 0) {
            $output[] = array("[" . $strWord . "]", "", $score);
        }
    }

    // If the word begins with a doubled consonant, drop the first one,
    // because no Pali word begins with a doubled consonant.
    // An exact list lookup is used so that pairs straddling two entries of a
    // concatenated string (e.g. "kg", "td", "pb") are not mistaken for doubles.
    $double_initials = array("kk", "gg", "cc", "jj", "ṭṭ", "ḍḍ", "tt", "dd", "pp", "bb");
    if (mb_strlen($strWord, "UTF-8") > 2
        && in_array(mb_substr($strWord, 0, 2, "UTF-8"), $double_initials, true)
    ) {
        $strWord = mb_substr($strWord, 1, NULL, "UTF-8");
    }

    // Try every split position from the right, and at each position every
    // sandhi rule whose contracted form "c" ends exactly at that position.
    $len = mb_strlen($strWord, "UTF-8");
    if ($len > 2) {
        for ($i = $len; $i > 1; $i--) {
            foreach ($sandhi as $row) {
                $start = $i - $row["len"];
                if ($start < 0) {
                    // "c" is longer than the text left of the split point.
                    continue;
                }
                if (mb_substr($strWord, $start, $row["len"], "UTF-8") === $row["c"]) {
                    $str1 = mb_substr($strWord, 0, $start, "UTF-8") . $row["a"];
                    $str2 = $row["b"] . mb_substr($strWord, $i, NULL, "UTF-8");
                    $score = isExsit($str1, $adj_len);
                    if ($score >= 0.1) {
                        $output[] = array($str1, $str2, $score, $row["adj_len"]);
                        if ($turbo) {
                            // Leaves only the rule loop; the next split
                            // position is still examined.
                            break;
                        }
                    }
                }
            }
        }
    }

    // Text and confidence product of the parts already chosen above this
    // level. Recursive calls only write $path at deeper indexes, so this is
    // stable for the rest of the call.
    $prefix = "";
    $prefix_cf = 1.0;
    for ($i = 0; $i < $deep; $i++) {
        $prefix .= $path[$i][0] . "+";
        if ($debug) {
            $prefix .= "(" . $path[$i][1] . ")-";
        }
        $prefix_cf *= $path[$i][1];
    }

    if (count($output) > 0) {
        foreach ($output as $candidate) {
            $path[$deep][0] = $candidate[0];
            $path[$deep][1] = $candidate[2];
            if ($candidate[1] !== "") {
                // A tail remains: keep splitting it one level deeper.
                mySplit2($candidate[1], ($deep + 1), $turbo, $candidate[3]);
            } else {
                // Nothing left over: record a complete segmentation.
                $word = $prefix . $candidate[0];
                if ($debug) {
                    $word .= "({$candidate[2]})";
                }
                $result[$word] = $prefix_cf + $candidate[2] * 0.1;
            }
        }
    } else {
        // Dead end: record the remainder as unresolved, with a penalty that
        // grows with the cube of its length.
        $len = pow(mb_strlen($strWord, "UTF-8"), 3);
        $word = $prefix . "{$strWord}(0)";
        $result[$word] = $prefix_cf + (0 - $len) / ($len + 150);
    }
}
?>