|
@@ -11,6 +11,7 @@ use Illuminate\Support\Arr;
|
|
|
|
|
|
|
|
class TurboSplit
|
|
class TurboSplit
|
|
|
{
|
|
{
|
|
|
|
|
+ protected $node = [];
|
|
|
protected $path = array();
|
|
protected $path = array();
|
|
|
protected $isDebug = false;
|
|
protected $isDebug = false;
|
|
|
#当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
|
|
#当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
|
|
@@ -58,6 +59,7 @@ class TurboSplit
|
|
|
["a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
|
|
["a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
|
|
|
["a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
|
|
["a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
|
|
|
["a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
["a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
|
|
|
+ ["a" => "ṃ", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9],
|
|
|
["a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
["a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
|
["a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
["a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
|
["a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
["a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
|
|
@@ -130,7 +132,7 @@ class TurboSplit
|
|
|
for($i=0;$i<$this->MAX_DEEP;$i++ ){
|
|
for($i=0;$i<$this->MAX_DEEP;$i++ ){
|
|
|
array_push($this->path, array("", 0));
|
|
array_push($this->path, array("", 0));
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
return;
|
|
return;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -163,16 +165,20 @@ class TurboSplit
|
|
|
if (strlen($word) <= 1) {
|
|
if (strlen($word) <= 1) {
|
|
|
return array(0,0);
|
|
return array(0,0);
|
|
|
}
|
|
}
|
|
|
|
|
+ $search = $word;
|
|
|
//去掉单词首尾的 []
|
|
//去掉单词首尾的 []
|
|
|
- if(mb_substr($word,0,1) !== "["){
|
|
|
|
|
|
|
+ /*
|
|
|
|
|
+ if(mb_substr($word,0,1) !== "[")
|
|
|
|
|
+ {
|
|
|
$search = $word;
|
|
$search = $word;
|
|
|
}
|
|
}
|
|
|
else{
|
|
else{
|
|
|
$search = str_replace("[","",$word);
|
|
$search = str_replace("[","",$word);
|
|
|
- $search = str_replace("]","",$search);
|
|
|
|
|
|
|
+ $search = str_replace("]","",$search);
|
|
|
}
|
|
}
|
|
|
|
|
+ */
|
|
|
//获取单词权重
|
|
//获取单词权重
|
|
|
- $row = Cache::remember('palicanon/wordpart/weight/'.$search, 100 , function() use($search) {
|
|
|
|
|
|
|
+ $row = Cache::remember('palicanon/wordpart/weight/'.$search, 10 , function() use($search) {
|
|
|
return WordPart::where('word',$search)->value('weight');
|
|
return WordPart::where('word',$search)->value('weight');
|
|
|
});
|
|
});
|
|
|
if ($row) {
|
|
if ($row) {
|
|
@@ -195,7 +201,7 @@ class TurboSplit
|
|
|
$base_weight = 0;
|
|
$base_weight = 0;
|
|
|
$len = 0;
|
|
$len = 0;
|
|
|
foreach ($newWord as $x => $x_value) {
|
|
foreach ($newWord as $x => $x_value) {
|
|
|
- $row = Cache::remember('palicanon/wordpart/weight/'.$search, 100 , function() use($x) {
|
|
|
|
|
|
|
+ $row = Cache::remember('palicanon/wordpart/weight/'.$search, 10 , function() use($x) {
|
|
|
return WordPart::where('word',$x)->value('weight');
|
|
return WordPart::where('word',$x)->value('weight');
|
|
|
});
|
|
});
|
|
|
if ($row) {
|
|
if ($row) {
|
|
@@ -214,17 +220,16 @@ class TurboSplit
|
|
|
* 返回信心指数
|
|
* 返回信心指数
|
|
|
* look up single word in dictionary vocabulary
|
|
* look up single word in dictionary vocabulary
|
|
|
* return the confidence value
|
|
* return the confidence value
|
|
|
- *
|
|
|
|
|
- *
|
|
|
|
|
- *
|
|
|
|
|
|
|
+ *
|
|
|
|
|
+ *
|
|
|
|
|
+ *
|
|
|
*/
|
|
*/
|
|
|
public function isExsit($word, $adj_len = 0){
|
|
public function isExsit($word, $adj_len = 0){
|
|
|
$this->log("正在查询:{$word}");
|
|
$this->log("正在查询:{$word}");
|
|
|
|
|
|
|
|
$isFound = false;
|
|
$isFound = false;
|
|
|
$count = 0;
|
|
$count = 0;
|
|
|
- $cacheKey = "turbosplit/part/";
|
|
|
|
|
- $wordPart = Cache::remember($cacheKey.$word,1000,function() use($word){
|
|
|
|
|
|
|
+ $wordPart = Cache::remember("turbosplit/part/{$word}",10,function() use($word){
|
|
|
return implode(',',$this->dict_lookup($word));
|
|
return implode(',',$this->dict_lookup($word));
|
|
|
});
|
|
});
|
|
|
$arrWordPart = explode(',',$wordPart);
|
|
$arrWordPart = explode(',',$wordPart);
|
|
@@ -234,7 +239,7 @@ class TurboSplit
|
|
|
$this->log("查到:{$word}:{$word_count}个");
|
|
$this->log("查到:{$word}:{$word_count}个");
|
|
|
$isFound = true;
|
|
$isFound = true;
|
|
|
$count = $word_count + 1;
|
|
$count = $word_count + 1;
|
|
|
- }
|
|
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
//fomular of confidence value 信心值计算公式
|
|
//fomular of confidence value 信心值计算公式
|
|
|
if ($isFound) {
|
|
if ($isFound) {
|
|
@@ -251,27 +256,38 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * word
|
|
|
|
|
+ * cf
|
|
|
|
|
+ * children[]
|
|
|
|
|
+ */
|
|
|
/**
|
|
/**
|
|
|
* 核心拆分函数
|
|
* 核心拆分函数
|
|
|
* $strWord, word to be look up 要查询的词
|
|
* $strWord, word to be look up 要查询的词
|
|
|
* $deep, 当前递归深度
|
|
* $deep, 当前递归深度
|
|
|
|
|
+ * $forward 搜索方向
|
|
|
|
|
+ * true 正向
|
|
|
|
|
+ * false 反向
|
|
|
* $express=true, 快速查询
|
|
* $express=true, 快速查询
|
|
|
* $adj_len=0 长度校正系数
|
|
* $adj_len=0 长度校正系数
|
|
|
* $c_threshhold 信心指数阈值
|
|
* $c_threshhold 信心指数阈值
|
|
|
- *
|
|
|
|
|
- *
|
|
|
|
|
- *
|
|
|
|
|
|
|
+ *
|
|
|
|
|
+ *
|
|
|
|
|
+ *
|
|
|
*/
|
|
*/
|
|
|
- function split($strWord, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
|
|
|
|
|
|
|
+ function split(&$node, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
|
|
|
{
|
|
{
|
|
|
|
|
+ $strWord = $node["remain"];
|
|
|
$this->log("spliting word={$strWord} deep={$deep}");
|
|
$this->log("spliting word={$strWord} deep={$deep}");
|
|
|
$output = array();
|
|
$output = array();
|
|
|
- #当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
|
|
|
|
|
|
|
+ #currPathCf是当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
|
|
|
if($deep == 0){
|
|
if($deep == 0){
|
|
|
$this->currPathCf = 1;
|
|
$this->currPathCf = 1;
|
|
|
}
|
|
}
|
|
|
//达到最大搜索深度,返回
|
|
//达到最大搜索深度,返回
|
|
|
if ($deep >= $this->MAX_DEEP) {
|
|
if ($deep >= $this->MAX_DEEP) {
|
|
|
|
|
+ return ;
|
|
|
|
|
+ /*
|
|
|
$word = "";
|
|
$word = "";
|
|
|
$cf = 1.0;
|
|
$cf = 1.0;
|
|
|
for ($i = 0; $i < $deep; $i++) {
|
|
for ($i = 0; $i < $deep; $i++) {
|
|
@@ -295,17 +311,37 @@ class TurboSplit
|
|
|
$this->result[$reverseWord] = $cf;
|
|
$this->result[$reverseWord] = $cf;
|
|
|
return 0;
|
|
return 0;
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+*/
|
|
|
}
|
|
}
|
|
|
//直接找到
|
|
//直接找到
|
|
|
|
|
+
|
|
|
$confidence = $this->isExsit($strWord, $adj_len);
|
|
$confidence = $this->isExsit($strWord, $adj_len);
|
|
|
if ($confidence > $c_threshhold) {
|
|
if ($confidence > $c_threshhold) {
|
|
|
array_push($output, array($strWord, "", $confidence));
|
|
array_push($output, array($strWord, "", $confidence));
|
|
|
- }
|
|
|
|
|
- else {
|
|
|
|
|
- $confidence = $this->isExsit("[" . $strWord . "]");
|
|
|
|
|
|
|
+ if(isset($node['sum_cf'])){
|
|
|
|
|
+ $parent_sum_cf = $node['sum_cf'];
|
|
|
|
|
+ }else{
|
|
|
|
|
+ $parent_sum_cf = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ $sum_cf = $parent_sum_cf * $confidence;
|
|
|
|
|
+ $node['children'][] = ['word'=>$strWord,'remain'=>"",'cf'=>$confidence,"sum_cf"=>$sum_cf];
|
|
|
|
|
+ $this->log("直接找到{$strWord}-{$confidence}");
|
|
|
|
|
+ }
|
|
|
|
|
+ else if(mb_strlen($strWord,"UTF-8")<6){
|
|
|
|
|
+ //按照语尾查询
|
|
|
|
|
+ $search = "[{$strWord}]";
|
|
|
|
|
+ $confidence = $this->isExsit($search);
|
|
|
|
|
+ $this->log("查询:{$search}-信心指数{$confidence}");
|
|
|
if ($confidence > $c_threshhold) {
|
|
if ($confidence > $c_threshhold) {
|
|
|
- array_push($output, array("[" . $strWord . "]", "", $confidence));
|
|
|
|
|
|
|
+ array_push($output, array($search, "", $confidence));
|
|
|
|
|
+ if(isset($node['sum_cf'])){
|
|
|
|
|
+ $parent_sum_cf = $node['sum_cf'];
|
|
|
|
|
+ }else{
|
|
|
|
|
+ $parent_sum_cf = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ $sum_cf = $parent_sum_cf * $confidence;
|
|
|
|
|
+ $node['children'][] = ['word'=>$search,'remain'=>"",'cf'=>$confidence,"sum_cf"=>$sum_cf];
|
|
|
|
|
+ $this->log("直接找到{$strWord}-{$confidence}");
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -324,8 +360,8 @@ class TurboSplit
|
|
|
#正向切
|
|
#正向切
|
|
|
$this->log("正向切");
|
|
$this->log("正向切");
|
|
|
for ($i = $len; $i > 1; $i--) {
|
|
for ($i = $len; $i > 1; $i--) {
|
|
|
|
|
+ //应用连音规则切分单词
|
|
|
foreach ($this->sandhi as $key => $row) {
|
|
foreach ($this->sandhi as $key => $row) {
|
|
|
- //应用连音规则切分单词
|
|
|
|
|
if ($sandhi_advance == false && $row["advance"] == true) {
|
|
if ($sandhi_advance == false && $row["advance"] == true) {
|
|
|
//continue;
|
|
//continue;
|
|
|
}
|
|
}
|
|
@@ -336,10 +372,27 @@ class TurboSplit
|
|
|
if ($confidence > $c_threshhold) {
|
|
if ($confidence > $c_threshhold) {
|
|
|
//信心指数大于预设的阈值,插入
|
|
//信心指数大于预设的阈值,插入
|
|
|
array_push($output, array($str1, $str2, $confidence, $row["adj_len"]));
|
|
array_push($output, array($str1, $str2, $confidence, $row["adj_len"]));
|
|
|
|
|
+ if(isset($node['sum_cf'])){
|
|
|
|
|
+ $parent_sum_cf = $node['sum_cf'];
|
|
|
|
|
+ }else{
|
|
|
|
|
+ $parent_sum_cf = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ $sum_cf = $parent_sum_cf * $confidence;
|
|
|
|
|
+ if($sum_cf>$c_threshhold){
|
|
|
|
|
+ $node['children'][] = [
|
|
|
|
|
+ 'word'=>$str1,
|
|
|
|
|
+ 'remain'=>$str2,
|
|
|
|
|
+ 'cf'=>$confidence,
|
|
|
|
|
+ 'sum_cf'=>$sum_cf,
|
|
|
|
|
+ 'children'=>[]
|
|
|
|
|
+ ];
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
$this->log("插入结构数组:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}");
|
|
$this->log("插入结构数组:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}");
|
|
|
if ($express) {
|
|
if ($express) {
|
|
|
break;
|
|
break;
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -357,6 +410,21 @@ class TurboSplit
|
|
|
$confidence = $this->isExsit($str2, $adj_len)*$row["cf"];
|
|
$confidence = $this->isExsit($str2, $adj_len)*$row["cf"];
|
|
|
if ($confidence > $c_threshhold) {
|
|
if ($confidence > $c_threshhold) {
|
|
|
array_push($output, array($str2, $str1, $confidence, $row["adj_len"]));
|
|
array_push($output, array($str2, $str1, $confidence, $row["adj_len"]));
|
|
|
|
|
+ if(isset($node['sum_cf'])){
|
|
|
|
|
+ $parent_sum_cf = $node['sum_cf'];
|
|
|
|
|
+ }else{
|
|
|
|
|
+ $parent_sum_cf = 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ $sum_cf = $parent_sum_cf * $confidence;
|
|
|
|
|
+ if($sum_cf>$c_threshhold){
|
|
|
|
|
+ $node['children'][] = [
|
|
|
|
|
+ 'word'=>$str2,
|
|
|
|
|
+ 'remain'=>$str1,
|
|
|
|
|
+ 'cf'=>$confidence,
|
|
|
|
|
+ 'sum_cf'=>$sum_cf,
|
|
|
|
|
+ 'children'=>[],
|
|
|
|
|
+ ];
|
|
|
|
|
+ }
|
|
|
$this->log("将此次结果插入结果数组:剩余={$str2}");
|
|
$this->log("将此次结果插入结果数组:剩余={$str2}");
|
|
|
if ($express) {
|
|
if ($express) {
|
|
|
break;
|
|
break;
|
|
@@ -367,9 +435,18 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
$word = "";
|
|
$word = "";
|
|
|
$this->log("结果数组个数:".count($output));
|
|
$this->log("结果数组个数:".count($output));
|
|
|
|
|
+ //print_r($node);
|
|
|
|
|
+ //遍历children
|
|
|
|
|
+ foreach ($node['children'] as $key => $child) {
|
|
|
|
|
+ # code...
|
|
|
|
|
+ if(isset($child) && !empty($child['remain'])){
|
|
|
|
|
+ $this->split($node['children'][$key], ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ /*
|
|
|
if (count($output) > 0) {
|
|
if (count($output) > 0) {
|
|
|
foreach ($output as $part) {
|
|
foreach ($output as $part) {
|
|
|
$checked = $part[0];
|
|
$checked = $part[0];
|
|
@@ -409,6 +486,7 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
} else {
|
|
} else {
|
|
|
|
|
+ //还有剩余部分
|
|
|
#计算当前信心指数
|
|
#计算当前信心指数
|
|
|
$cf = 1.0;
|
|
$cf = 1.0;
|
|
|
for ($i = 0; $i < $deep; $i++) {
|
|
for ($i = 0; $i < $deep; $i++) {
|
|
@@ -421,7 +499,7 @@ class TurboSplit
|
|
|
}else{
|
|
}else{
|
|
|
#接着切
|
|
#接着切
|
|
|
$this->log("接着切:{$remainder}");
|
|
$this->log("接着切:{$remainder}");
|
|
|
- $this->split($remainder, ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
|
|
|
|
|
|
|
+ $this->split($remainder, ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -438,15 +516,15 @@ class TurboSplit
|
|
|
$word .= "+";
|
|
$word .= "+";
|
|
|
$cf = $cf * $this->path[$i][1];
|
|
$cf = $cf * $this->path[$i][1];
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
$len = pow(mb_strlen($strWord, "UTF-8"), 3);
|
|
$len = pow(mb_strlen($strWord, "UTF-8"), 3);
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
if ($forward) {
|
|
if ($forward) {
|
|
|
$cf =(1-$cf) * $len / ($len + 150);
|
|
$cf =(1-$cf) * $len / ($len + 150);
|
|
|
} else {
|
|
} else {
|
|
|
$cf =(1-$cf) * $len / ($len + 5);
|
|
$cf =(1-$cf) * $len / ($len + 5);
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
if ($this->isDebug) {
|
|
if ($this->isDebug) {
|
|
|
$word = $word.$strWord . "(0)";
|
|
$word = $word.$strWord . "(0)";
|
|
|
} else {
|
|
} else {
|
|
@@ -457,7 +535,7 @@ class TurboSplit
|
|
|
if ($forward == true) {
|
|
if ($forward == true) {
|
|
|
$this->result[$word] = $cf;
|
|
$this->result[$word] = $cf;
|
|
|
return 0;
|
|
return 0;
|
|
|
- }
|
|
|
|
|
|
|
+ }
|
|
|
else {
|
|
else {
|
|
|
$reverseWord = $this->word_reverse($word);
|
|
$reverseWord = $this->word_reverse($word);
|
|
|
$this->result[$reverseWord] = $cf;
|
|
$this->result[$reverseWord] = $cf;
|
|
@@ -465,9 +543,30 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+*/
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 遍历树状结构,获取结果
|
|
|
|
|
+ */
|
|
|
|
|
+ private function get_result($node,&$path){
|
|
|
|
|
+
|
|
|
|
|
+ # code...
|
|
|
|
|
+ $path[] = $node['word'];
|
|
|
|
|
+ if(isset($node['children']) && count($node['children'])>0){
|
|
|
|
|
+ foreach ($node['children'] as $key => $value) {
|
|
|
|
|
+ $this->get_result($value,$path);
|
|
|
|
|
+ }
|
|
|
|
|
+ }else{
|
|
|
|
|
+ if(empty($node['remain'])){
|
|
|
|
|
+ $factors = trim(implode("+",$path),'+');
|
|
|
|
|
+ $this->result[$factors] = $node['sum_cf'];
|
|
|
|
|
+ }else{
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ array_pop($path);
|
|
|
|
|
+ }
|
|
|
/**
|
|
/**
|
|
|
* 颠倒词序
|
|
* 颠倒词序
|
|
|
*/
|
|
*/
|
|
@@ -502,11 +601,11 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
if(mb_strlen($word,"UTF-8")>4){
|
|
if(mb_strlen($word,"UTF-8")>4){
|
|
|
# 先看有没有中文意思
|
|
# 先看有没有中文意思
|
|
|
- //Log::info("先看有没有中文意思");
|
|
|
|
|
|
|
+ //$this->log("先看有没有中文意思");
|
|
|
if(UserDict::where('word',$word)->where('mean','<>','')->where('language','<>','my')->exists()){
|
|
if(UserDict::where('word',$word)->where('mean','<>','')->where('language','<>','my')->exists()){
|
|
|
$newword[]=$word;
|
|
$newword[]=$word;
|
|
|
}else{
|
|
}else{
|
|
|
- //Log::info("如果没有查巴缅替换拆分");
|
|
|
|
|
|
|
+ //$this->log("如果没有查巴缅替换拆分");
|
|
|
#如果没有查巴缅替换拆分
|
|
#如果没有查巴缅替换拆分
|
|
|
if(UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
|
|
if(UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
|
|
|
$pmPart = explode("+",UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
|
|
$pmPart = explode("+",UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
|
|
@@ -516,12 +615,12 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
else{
|
|
else{
|
|
|
- //Log::info("如果没有查规则变形");
|
|
|
|
|
|
|
+ //$this->log("如果没有查规则变形");
|
|
|
#如果没有查规则变形
|
|
#如果没有查规则变形
|
|
|
if(UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->exists()){
|
|
if(UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->exists()){
|
|
|
$rglPart = explode("+",UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->value('factors')) ;
|
|
$rglPart = explode("+",UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->value('factors')) ;
|
|
|
#看巴缅有没有第一部分
|
|
#看巴缅有没有第一部分
|
|
|
- //Log::info("看巴缅有没有第一部分");
|
|
|
|
|
|
|
+ //$this->log("看巴缅有没有第一部分");
|
|
|
if(UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
|
|
if(UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
|
|
|
$pmPart = explode("+",UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
|
|
$pmPart = explode("+",UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
|
|
|
foreach ($pmPart as $pm) {
|
|
foreach ($pmPart as $pm) {
|
|
@@ -537,7 +636,7 @@ class TurboSplit
|
|
|
}
|
|
}
|
|
|
else{
|
|
else{
|
|
|
#还没有就认命了
|
|
#还没有就认命了
|
|
|
- //Log::info("还没有就认命了");
|
|
|
|
|
|
|
+ //$this->log("还没有就认命了");
|
|
|
$newword[]=$word;
|
|
$newword[]=$word;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -583,7 +682,7 @@ class TurboSplit
|
|
|
//预处理连音词
|
|
//预处理连音词
|
|
|
$word1 = $this->splitSandhi($word);
|
|
$word1 = $this->splitSandhi($word);
|
|
|
# 处理双元音
|
|
# 处理双元音
|
|
|
- Log::info("处理双元音");
|
|
|
|
|
|
|
+ $this->log("处理双元音");
|
|
|
$arrword = $this->splitDiphthong($word1);
|
|
$arrword = $this->splitDiphthong($word1);
|
|
|
if (count($arrword) > 1) {
|
|
if (count($arrword) > 1) {
|
|
|
array_push($output,['word'=>$word,'type'=>'.un.','grammar'=>'','parent'=>'','factors'=>implode("+", $arrword),'confidence'=>0.9999]);
|
|
array_push($output,['word'=>$word,'type'=>'.un.','grammar'=>'','parent'=>'','factors'=>implode("+", $arrword),'confidence'=>0.9999]);
|
|
@@ -591,34 +690,40 @@ class TurboSplit
|
|
|
|
|
|
|
|
foreach ($arrword as $oneword) {
|
|
foreach ($arrword as $oneword) {
|
|
|
$this->result = array(); //清空递归程序的输出容器
|
|
$this->result = array(); //清空递归程序的输出容器
|
|
|
-
|
|
|
|
|
|
|
+ $node = ['word'=>"",'remain'=>$oneword,'children'=>[]];
|
|
|
if(mb_strlen($oneword)>35){
|
|
if(mb_strlen($oneword)>35){
|
|
|
//长词使用快速切分 正向切分 不使用少见sandi规则
|
|
//长词使用快速切分 正向切分 不使用少见sandi规则
|
|
|
- $this->split($oneword, 0, true, 0.8, 0.9, 0, true, false);
|
|
|
|
|
|
|
+ $this->split($node, 0, true, 0.8, 0.9, 0, true, false);
|
|
|
$min_result = 1;
|
|
$min_result = 1;
|
|
|
}else{
|
|
}else{
|
|
|
- $this->split($oneword, 0, false, 0.8, 0.9, 0, true, false);
|
|
|
|
|
- $min_result=3;
|
|
|
|
|
|
|
+ $this->split($node, 0, false, 0.8, 0.9, 0, true, false);
|
|
|
|
|
+ $min_result=2;
|
|
|
}
|
|
}
|
|
|
- Log::info("正向切分结束 结果数量".count($this->result));
|
|
|
|
|
-
|
|
|
|
|
|
|
+ $path = [];
|
|
|
|
|
+ $this->log($node);
|
|
|
|
|
+ $this->get_result($node,$path);
|
|
|
|
|
+
|
|
|
|
|
+ $this->log("正向切分结束 结果数量".count($this->result));
|
|
|
|
|
+
|
|
|
if(count($this->result)<$min_result){
|
|
if(count($this->result)<$min_result){
|
|
|
//有效结果过少
|
|
//有效结果过少
|
|
|
- $this->split($oneword, 0, false, 0.2, 0.8, 0, true, true);
|
|
|
|
|
- Log::info("有效结果过少 再次正切".count($this->result) );
|
|
|
|
|
|
|
+ $node = ['word'=>"",'remain'=>$oneword,'children'=>[]];
|
|
|
|
|
+ $this->split($node, 0, false, 0.2, 0.8, 0, true, true);
|
|
|
|
|
+ $this->log("有效结果过少 再次正切".count($this->result) );
|
|
|
if(count($this->result)<2){
|
|
if(count($this->result)<2){
|
|
|
- $this->split($oneword, 0, false, 0.2, 0.8, 0, false, true);
|
|
|
|
|
- Log::info("有效结果过少 再次反切:结果数量" . count($this->result));
|
|
|
|
|
|
|
+ $node = ['word'=>"",'remain'=>$oneword,'children'=>[]];
|
|
|
|
|
+ $this->split($node, 0, false, 0.2, 0.8, 0, false, true);
|
|
|
|
|
+ $this->log("有效结果过少 再次反切:结果数量" . count($this->result));
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- Log::info("{$oneword}:" . count($this->result));
|
|
|
|
|
|
|
+ $this->log("{$oneword}:" . count($this->result));
|
|
|
if (count($this->result) > 0) {
|
|
if (count($this->result) > 0) {
|
|
|
- arsort($this->result); //按信心指数排序
|
|
|
|
|
|
|
+ arsort($this->result); //按信心指数升序排序
|
|
|
$iCount = 0;
|
|
$iCount = 0;
|
|
|
foreach ($this->result as $row => $value) {
|
|
foreach ($this->result as $row => $value) {
|
|
|
$factors = $row;
|
|
$factors = $row;
|
|
|
- if(strpos($row,'[') !== FALSE){
|
|
|
|
|
|
|
+ if(strpos($row,']+') !== FALSE){
|
|
|
$type = '.un.';
|
|
$type = '.un.';
|
|
|
$factors = \str_replace(['+[ṃ]+','[ṃ]+'],'ṃ+',$row);
|
|
$factors = \str_replace(['+[ṃ]+','[ṃ]+'],'ṃ+',$row);
|
|
|
}else{
|
|
}else{
|
|
@@ -630,13 +735,17 @@ class TurboSplit
|
|
|
if($iCount==0){
|
|
if($iCount==0){
|
|
|
//对于最优结果进行处理 找到base
|
|
//对于最优结果进行处理 找到base
|
|
|
$wordWithType = ['word'=>$oneword,'type'=>'','grammar'=>'','parent'=>'','factors'=>$factors,'confidence'=>$value];
|
|
$wordWithType = ['word'=>$oneword,'type'=>'','grammar'=>'','parent'=>'','factors'=>$factors,'confidence'=>$value];
|
|
|
-
|
|
|
|
|
- Log::info("查找base");
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
|
|
+ $this->log("查找base");
|
|
|
|
|
+
|
|
|
$factors = explode('+',$row);
|
|
$factors = explode('+',$row);
|
|
|
$endOfFactor = end($factors);
|
|
$endOfFactor = end($factors);
|
|
|
-
|
|
|
|
|
- Log::info("结尾词:".$endOfFactor);
|
|
|
|
|
|
|
+ if(strpos($endOfFactor,"[") !== FALSE){
|
|
|
|
|
+ if(count($factors)>=2){
|
|
|
|
|
+ $endOfFactor = $factors[count($factors)-2];
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ $this->log("结尾词:".$endOfFactor);
|
|
|
$caseman = new CaseMan();
|
|
$caseman = new CaseMan();
|
|
|
//猜测单词的base
|
|
//猜测单词的base
|
|
|
$parents = $caseman->WordToBase($oneword,1,false);
|
|
$parents = $caseman->WordToBase($oneword,1,false);
|
|
@@ -649,19 +758,21 @@ class TurboSplit
|
|
|
if(count($end_parents)>0){
|
|
if(count($end_parents)>0){
|
|
|
foreach ($end_parents as $base2=>$case2) {
|
|
foreach ($end_parents as $base2=>$case2) {
|
|
|
if(\mb_substr($base2,-2)===\mb_substr($base,-2)){
|
|
if(\mb_substr($base2,-2)===\mb_substr($base,-2)){
|
|
|
- Log::info("{$base} ok");
|
|
|
|
|
|
|
+ $this->log("{$base} ok");
|
|
|
foreach ($case as $value) {
|
|
foreach ($case as $value) {
|
|
|
# code...
|
|
# code...
|
|
|
foreach ($case2 as $value2) {
|
|
foreach ($case2 as $value2) {
|
|
|
//验证语法信息是否正确
|
|
//验证语法信息是否正确
|
|
|
- if($value['type'] == $value2['type'] && $value['grammar'] == $value2['grammar']){
|
|
|
|
|
|
|
+ if($value['type'] == $value2['type'] &&
|
|
|
|
|
+ substr($value['grammar'],0,3) === substr($value2['grammar'],0,3) &&
|
|
|
|
|
+ $value['confidence']>0.5){
|
|
|
$wordWithType['type'] = $value['type'];
|
|
$wordWithType['type'] = $value['type'];
|
|
|
$wordWithType['grammar'] = $value['grammar'];
|
|
$wordWithType['grammar'] = $value['grammar'];
|
|
|
$wordWithType['factors'] = $value['factors'];
|
|
$wordWithType['factors'] = $value['factors'];
|
|
|
$wordWithType['parent'] = $base;
|
|
$wordWithType['parent'] = $base;
|
|
|
$wordWithType['confidence'] = $value2['confidence'];
|
|
$wordWithType['confidence'] = $value2['confidence'];
|
|
|
- Log::info("word:{$wordWithType['word']} ; type:{$wordWithType['type']}; grammar:{$wordWithType['grammar']};parent:{$wordWithType['parent']}");
|
|
|
|
|
- array_push($output,$wordWithType);
|
|
|
|
|
|
|
+ $this->log("word:{$wordWithType['word']} ; type:{$wordWithType['type']}; grammar:{$wordWithType['grammar']};parent:{$wordWithType['parent']}");
|
|
|
|
|
+ array_push($output,$wordWithType);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -674,14 +785,14 @@ class TurboSplit
|
|
|
$wordWithType['factors'] = $value['factors'];
|
|
$wordWithType['factors'] = $value['factors'];
|
|
|
$wordWithType['parent'] = $base;
|
|
$wordWithType['parent'] = $base;
|
|
|
$wordWithType['confidence'] = 0.1;
|
|
$wordWithType['confidence'] = 0.1;
|
|
|
- array_push($output,$wordWithType);
|
|
|
|
|
|
|
+ array_push($output,$wordWithType);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
//后处理 进一步切分没有意思的长词
|
|
//后处理 进一步切分没有意思的长词
|
|
|
- Log::info("后处理 进一步切分没有意思的长词");
|
|
|
|
|
|
|
+ $this->log("后处理 进一步切分没有意思的长词");
|
|
|
$new = $this->split2($row);
|
|
$new = $this->split2($row);
|
|
|
if($new !== $row){
|
|
if($new !== $row){
|
|
|
$newword['factors'] = $new;
|
|
$newword['factors'] = $new;
|
|
@@ -713,6 +824,10 @@ class TurboSplit
|
|
|
return $this->result;
|
|
return $this->result;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ public function debug($debug){
|
|
|
|
|
+ $this->isDebug = $debug;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
private function log($message){
|
|
private function log($message){
|
|
|
if ($this->isDebug) {
|
|
if ($this->isDebug) {
|
|
|
Log::info($message);
|
|
Log::info($message);
|