ending as $ending) { # code... if ($ending[4] < $confidence) { continue; } switch ($type) { case '.n:base.': if ($ending[2] !== '.n.' || strpos($ending[3], $grammar) !== 0) { continue 2; } break; case '.ti:base.': if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') { continue 2; } break; case '.adj:base.': if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') { continue 2; } break; case '.v:base.': if ($ending[2] !== '.v.') { continue 2; } break; default: continue 2; break; } $endingLen = mb_strlen($ending[0], "UTF-8"); $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8"); if ($wordEnd === $ending[0]) { //匹配成功 $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1]; //尝试sandhi //TODO 加两个sandhi $hasSandhi = false; foreach ($case->union as $sandhi) { $sandhiLen = mb_strlen($sandhi[0], 'UTF-8'); $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8"); if ($sandhiEnd === $sandhi[0]) { $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1]; $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first(); if ($count) { $hasSandhi = true; $newWord[] = [ 'word' => $sandhiWord, 'ending' => $ending[1], 'type' => '.un.', 'grammar' => '', 'factors' => "{$word}+{$sandhi[2]}", 'count' => $count->count, 'bold' => $count->bold ]; //添加一个去掉ti的数据 if ($sandhi[2] === 'iti') { $newWord[] = [ 'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'), 'ending' => $ending[1], 'grammar' => $ending[3], 'factors' => "{$base}+[{$ending[1]}]", 'count' => $count->count, 'bold' => $count->bold ]; } } } } $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first(); if ($count || $hasSandhi) { $newWord[] = [ 'word' => $word, 'ending' => $ending[1], 'grammar' => $ending[3], 'factors' => "{$base}+[{$ending[1]}]", 'count' => $count ? $count->count : 0, 'bold' => $count ? $count->bold : 0 ]; } } } return $newWord; } private function endingMatch($base, $ending, $array = null) { $case = new CaseEnding(); $output = array(); $endingLen = mb_strlen($ending[0], "UTF-8"); $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8"); if ($wordEnd === $ending[0]) { //匹配成功 $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1]; if (is_array($array)) { if (!isset($array[$word])) { $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first(); } } else { $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first(); } if (isset($count) && $count) { $output[$word] = ["count" => $count->count, "bold" => $count->bold]; } else { $output[$word] = false; } //尝试sandhi //TODO 加两个sandhi foreach ($case->union as $sandhi) { $sandhiLen = strlen($sandhi[0]); $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8"); if ($sandhiEnd === $sandhi[0]) { $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1]; if (is_array($array)) { if (!isset($array[$sandhiWord])) { $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first(); } } else { $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first(); } if (isset($count) && $count) { $output[$sandhiWord] = ["count" => $count->count, "bold" => $count->bold]; } else { $output[$sandhiWord] = false; } } } } return $output; } /** * 从词干到单词的变化 * * @return array */ public function BaseToWord($base, $confidence = 0.5) { $newWord = array(); $case = new CaseEnding(); foreach ($case->ending as $ending) { # code... if ($ending[4] < $confidence) { continue; } /* $matched = $this->endingMatch($base,$ending,$newWord); foreach ($matched as $key => $new) { $newWord[$key] = $new; } */ $endingLen = mb_strlen($ending[0], "UTF-8"); $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8"); if ($wordEnd === $ending[0]) { //匹配成功 $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1]; if (!isset($newWord[$word])) { $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first(); if ($count) { $newWord[$word] = ["count" => $count->count, "bold" => $count->bold]; } else { $newWord[$word] = false; } } //尝试sandhi //TODO 加两个sandhi foreach ($case->union as $sandhi) { $sandhiLen = mb_strlen($sandhi[0], 'UTF-8'); $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8"); if ($sandhiEnd === $sandhi[0]) { $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1]; if (!isset($newWord[$sandhiWord])) { $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first(); if ($count) { $newWord[$sandhiWord] = ["count" => $count->count, "bold" => $count->bold]; } else { $newWord[$sandhiWord] = false; } } } } } } $result = []; foreach ($newWord as $key => $value) { # code... if ($value !== false) { $result[] = ['word' => $key, 'ending', "count" => $value["count"], "bold" => $value["bold"]]; } } return $result; } /** * 从单词到词干的变化 * 小蝌蚪找妈妈 * @param string $word 输入 * @param int $deep 搜索深度 * @param boolean $verify 是否验证单词存在 * @return array */ public function WordToBase($word, $deep = 1, $verify = true) { $newWords = array(); $newBase = array(); $input[$word] = true; $case = new CaseEnding(); for ($i = 0; $i < $deep; $i++) { # code... foreach ($input as $currWord => $status) { # code... if ($status) { $input[$currWord] = false; foreach ($case->ending as $ending) { # code... if ($ending[4] < 0.5) { continue; } $endingLen = mb_strlen($ending[1], "UTF-8"); $wordEnd = mb_substr($currWord, 0 - $endingLen, null, "UTF-8"); if ($wordEnd === $ending[1]) { //匹配成功 $base = mb_substr($currWord, 0, mb_strlen($currWord, "UTF-8") - $endingLen, "UTF-8") . $ending[0]; if (!isset($newBase[$base])) { $newBase[$base] = array(); } array_push($newBase[$base], [ 'word' => $currWord, 'type' => $ending[2], 'grammar' => $ending[3], 'parent' => $base, 'factors' => "{$base}+[{$ending[1]}]", 'confidence' => $ending[4], ]); } } } } foreach ($newBase as $currWord => $value) { # 把新词加入列表 if (!isset($input[$currWord])) { $input[$currWord] = true; } } } if ($verify) { $output = array(); foreach ($newBase as $base => $rows) { # code... if (($verify = $this->VerifyBase($base, $rows)) !== false) { if (count($verify) > 0) { $output[$base] = $verify; } } } if (count($output) == 0) { //如果验证失败 输出最可能的结果 $short = 10000; $shortBase = ""; foreach ($newBase as $base => $rows) { if (mb_strlen($base, "UTF-8") < $short) { $short = mb_strlen($base, "UTF-8"); $shortBase = $base; } } foreach ($newBase as $base => $rows) { if ($base == $shortBase) { $output[$base] = $rows; } } } return $output; } else { return $newBase; } } /** * 验证base在字典中是否存在 */ public function VerifyBase($base, $rows) { # $output = array(); $dictWords = UserDict::where('word', $base)->select(['type', 'grammar'])->groupBy(['type', 'grammar'])->get(); if (count($dictWords) > 0) { $newBase[$base] = 1; $case = array(); //字典中这个拼写的单词的语法信息 foreach ($dictWords as $value) { # code... $case["{$value->type}{$value->grammar}"] = 1; } foreach ($rows as $value) { //根据输入的猜测的type,grammar拼接合理的 parent 语法信息 switch ($value['type']) { case '.n.': $parentType = '.n:base.'; break; case '.ti.': $parentType = '.ti:base.'; break; case '.v.': $parentType = '.v:base.'; break; default: $parentType = ''; break; } if (!empty($value['grammar']) && $value['type'] !== ".v.") { $arrGrammar = explode('$', $value['grammar']); $parentType .= $arrGrammar[0]; } # 只保存语法信息合理的数据 if (isset($case[$parentType])) { array_push($output, $value); } } return $output; } else { return false; } } }