| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359 |
- <?php
- namespace App\Tools;
- use Illuminate\Support\Facades\Cache;
- use Illuminate\Support\Facades\Log;
- use App\Models\UserDict;
- use App\Models\WordIndex;
/**
 * Converts between word stems ("bases") and their inflected forms.
 *
 * The inflection rules come from CaseEnding:
 *  - `$case->ending` rows are read as
 *    [0] suffix on the base, [1] suffix on the inflected word,
 *    [2] type of the inflected word ('.n.', '.ti.', '.v.'),
 *    [3] grammar string, [4] confidence.
 *  - `$case->union` rows (sandhi rules) are read as
 *    [0] suffix to replace, [1] replacement, [2] label of the joined element.
 *
 * Word existence is checked against the WordIndex table, base existence
 * against the UserDict table.
 */
class CaseMan
{
    /**
     * Type of an inflected form => dictionary type of the base it derives from.
     */
    private const PARENT_TYPES = [
        '.n.' => '.n:base.',
        '.ti.' => '.ti:base.',
        '.v.' => '.v:base.',
    ];

    /**
     * Create a new class instance.
     */
    public function __construct()
    {
    }

    /**
     * Inflect a base into the word forms that exist in the word index.
     *
     * For every ending that passes the confidence and type filters and whose
     * base suffix matches $base, the inflected word is built. Each sandhi
     * variant of that word that exists in the index is reported first,
     * followed by the plain word itself. The plain word is reported when it
     * is indexed or when at least one of its sandhi variants is; in the
     * latter case its count and bold are 0.
     *
     * @param string      $base       the stem to inflect
     * @param string|null $type       '.n:base.', '.ti:base.', '.adj:base.' or '.v:base.';
     *                                any other value yields an empty result
     * @param string      $grammar    only used for '.n:base.': the ending grammar must start with it
     * @param float       $confidence endings whose confidence is below this value are skipped
     * @return array list of rows with the keys word, ending, grammar, factors, count, bold
     *               (sandhi rows additionally carry type '.un.')
     */
    public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
    {
        $newWord = [];
        $case = new CaseEnding();

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }
            if (!$this->endingFitsBaseType($ending, $type, $grammar)) {
                continue;
            }
            $word = $this->swapSuffix($base, $ending[0], $ending[1]);
            if ($word === null) {
                continue;
            }

            // Try a single sandhi step on the inflected word.
            // TODO: support two chained sandhi.
            $hasSandhi = false;
            foreach ($case->union as $sandhi) {
                $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                if ($sandhiWord === null) {
                    continue;
                }
                $hit = $this->findWord($sandhiWord);
                if (!$hit) {
                    continue;
                }
                $hasSandhi = true;
                $newWord[] = [
                    'word' => $sandhiWord,
                    'ending' => $ending[1],
                    'type' => '.un.',
                    'grammar' => '',
                    'factors' => "{$word}+{$sandhi[2]}",
                    'count' => $hit->count,
                    'bold' => $hit->bold,
                ];
                // For "iti" also add the form with the last two characters ("ti") removed.
                if ($sandhi[2] === 'iti') {
                    $newWord[] = [
                        'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
                        'ending' => $ending[1],
                        'grammar' => $ending[3],
                        'factors' => "{$base}+[{$ending[1]}]",
                        'count' => $hit->count,
                        'bold' => $hit->bold,
                    ];
                }
            }

            $hit = $this->findWord($word);
            if ($hit || $hasSandhi) {
                $newWord[] = [
                    'word' => $word,
                    'ending' => $ending[1],
                    'grammar' => $ending[3],
                    'factors' => "{$base}+[{$ending[1]}]",
                    'count' => $hit ? $hit->count : 0,
                    'bold' => $hit ? $hit->bold : 0,
                ];
            }
        }

        return $newWord;
    }

    /**
     * Apply one ending to a base and look up the resulting word and its sandhi variants.
     *
     * When $array is given it is treated as a cache of earlier results keyed by
     * word: a word already present there is not queried again and its cached
     * entry is returned unchanged, so merging the result back into the cache
     * never replaces a known entry.
     *
     * @param string     $base   the stem to inflect
     * @param array      $ending one row of CaseEnding::$ending
     * @param array|null $array  optional cache of earlier results, keyed by word
     * @return array word => ['count' => ..., 'bold' => ...] or false when the word is not indexed;
     *               empty when the ending does not apply to $base
     */
    private function endingMatch($base, $ending, $array = null)
    {
        $case = new CaseEnding();
        $output = [];

        $word = $this->swapSuffix($base, $ending[0], $ending[1]);
        if ($word === null) {
            return $output;
        }
        $output[$word] = $this->cachedOrLookup($word, $array);

        // Try a single sandhi step on the inflected word.
        // TODO: support two chained sandhi.
        foreach ($case->union as $sandhi) {
            $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
            if ($sandhiWord === null) {
                continue;
            }
            $output[$sandhiWord] = $this->cachedOrLookup($sandhiWord, $array);
        }

        return $output;
    }

    /**
     * Inflect a base with every sufficiently confident ending, regardless of type,
     * and return the resulting words (and their sandhi variants) that exist in the index.
     *
     * Each distinct word is looked up once. The reported ending is the inflected
     * suffix of the first ending rule that produced the word.
     *
     * @param string $base       the stem to inflect
     * @param float  $confidence endings whose confidence is below this value are skipped
     * @return array list of rows with the keys word, ending, count, bold
     */
    public function BaseToWord($base, $confidence = 0.5)
    {
        // word => ['ending' => ..., 'count' => ..., 'bold' => ...], or false when not indexed
        $seen = [];
        $case = new CaseEnding();

        foreach ($case->ending as $ending) {
            if ($ending[4] < $confidence) {
                continue;
            }
            $word = $this->swapSuffix($base, $ending[0], $ending[1]);
            if ($word === null) {
                continue;
            }

            // The plain word first, then each single-step sandhi variant.
            // TODO: support two chained sandhi.
            $candidates = [$word];
            foreach ($case->union as $sandhi) {
                $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                if ($sandhiWord !== null) {
                    $candidates[] = $sandhiWord;
                }
            }

            foreach ($candidates as $candidate) {
                if (isset($seen[$candidate])) {
                    continue;
                }
                $hit = $this->findWord($candidate);
                $seen[$candidate] = $hit
                    ? ['ending' => $ending[1], 'count' => $hit->count, 'bold' => $hit->bold]
                    : false;
            }
        }

        $result = [];
        foreach ($seen as $word => $hit) {
            if ($hit !== false) {
                $result[] = [
                    'word' => $word,
                    'ending' => $hit['ending'],
                    'count' => $hit['count'],
                    'bold' => $hit['bold'],
                ];
            }
        }

        return $result;
    }

    /**
     * Derive candidate bases from an inflected word.
     *
     * Every ending whose inflected suffix matches the word yields a candidate
     * base. With $deep > 1 the candidates found so far are themselves treated
     * as words in the next round.
     *
     * When $verify is true, only candidates confirmed by VerifyBase() are
     * returned; if none is confirmed, the shortest candidate base is returned
     * unverified as the best guess.
     *
     * @param string  $word       the inflected word
     * @param int     $deep       number of search rounds
     * @param boolean $verify     whether candidate bases are checked against the dictionary
     * @param float   $confidence endings whose confidence is below this value are skipped
     * @return array base => list of rows with the keys word, type, grammar, parent, factors, confidence
     */
    public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
    {
        $newBase = [];
        // word => true while it still has to be processed, false once done
        $input = [$word => true];
        $case = new CaseEnding();

        for ($i = 0; $i < $deep; $i++) {
            foreach ($input as $currWord => $pending) {
                if (!$pending) {
                    continue;
                }
                $input[$currWord] = false;
                foreach ($case->ending as $ending) {
                    if ($ending[4] < $confidence) {
                        continue;
                    }
                    // Reverse direction: strip the inflected suffix, restore the base suffix.
                    $base = $this->swapSuffix($currWord, $ending[1], $ending[0]);
                    if ($base === null) {
                        continue;
                    }
                    $newBase[$base][] = [
                        'word' => $currWord,
                        'type' => $ending[2],
                        'grammar' => $ending[3],
                        'parent' => $base,
                        'factors' => "{$base}+[{$ending[1]}]",
                        'confidence' => $ending[4],
                    ];
                }
            }
            // Queue newly found bases for the next round.
            foreach (array_keys($newBase) as $candidate) {
                if (!isset($input[$candidate])) {
                    $input[$candidate] = true;
                }
            }
        }

        if (!$verify) {
            return $newBase;
        }

        $output = [];
        foreach ($newBase as $base => $rows) {
            $confirmed = $this->VerifyBase($base, $rows);
            if ($confirmed !== false && count($confirmed) > 0) {
                $output[$base] = $confirmed;
            }
        }

        if (count($output) === 0) {
            // Verification failed for every candidate: fall back to the shortest base.
            $shortBase = null;
            $shortLen = null;
            foreach (array_keys($newBase) as $base) {
                $len = mb_strlen((string) $base, 'UTF-8');
                if ($shortLen === null || $len < $shortLen) {
                    $shortLen = $len;
                    $shortBase = $base;
                }
            }
            if ($shortBase !== null) {
                $output[$shortBase] = $newBase[$shortBase];
            }
        }

        return $output;
    }

    /**
     * Check a candidate base against the dictionary.
     *
     * The dictionary entries spelled $base are loaded and keyed by
     * "{type}{grammar}". A row is kept when the parent key built from its
     * guessed type (and, except for verbs, the first '$'-separated part of its
     * grammar) matches one of those keys.
     *
     * @param string $base the candidate base
     * @param array  $rows rows as produced by WordToBase() for this base
     * @return array|false the rows whose grammar is plausible, or false when
     *                     the dictionary has no entry spelled $base
     */
    public function VerifyBase($base, $rows)
    {
        $dictWords = UserDict::where('word', $base)
            ->select(['type', 'grammar'])
            ->groupBy(['type', 'grammar'])
            ->get();
        if (count($dictWords) === 0) {
            return false;
        }

        // Grammar signatures the dictionary knows for this spelling.
        $known = [];
        foreach ($dictWords as $entry) {
            $known["{$entry->type}{$entry->grammar}"] = true;
        }

        $output = [];
        foreach ($rows as $row) {
            // Build the parent signature implied by the guessed type and grammar.
            $parentType = self::PARENT_TYPES[$row['type']] ?? '';
            if (!empty($row['grammar']) && $row['type'] !== '.v.') {
                $parentType .= explode('$', $row['grammar'])[0];
            }
            // Keep only rows whose signature the dictionary confirms.
            if (isset($known[$parentType])) {
                $output[] = $row;
            }
        }

        return $output;
    }

    /**
     * Replace the trailing $from of $text with $to.
     *
     * Note: an empty $from only matches an empty $text, because a zero offset
     * makes mb_substr() return the whole string.
     *
     * @param string $text the string to rewrite
     * @param string $from the suffix $text must end with
     * @param string $to   the replacement suffix
     * @return string|null the rewritten string, or null when $text does not end with $from
     */
    private function swapSuffix($text, $from, $to)
    {
        $fromLen = mb_strlen($from, 'UTF-8');
        if (mb_substr($text, 0 - $fromLen, null, 'UTF-8') !== $from) {
            return null;
        }

        return mb_substr($text, 0, mb_strlen($text, 'UTF-8') - $fromLen, 'UTF-8') . $to;
    }

    /**
     * Fetch the count and bold columns of a word from the word index.
     *
     * @param string $word the exact spelling to look up
     * @return mixed the first matching row, or a falsy value when there is none
     */
    private function findWord($word)
    {
        return WordIndex::where('word', $word)->select(['count', 'bold'])->first();
    }

    /**
     * Return the cached entry for a word, or query the word index when it is not cached.
     *
     * @param string     $word  the exact spelling to look up
     * @param array|null $cache earlier results keyed by word, or null for no cache
     * @return array|false ['count' => ..., 'bold' => ...], or false when the word is not indexed
     */
    private function cachedOrLookup($word, $cache)
    {
        if (is_array($cache) && isset($cache[$word])) {
            return $cache[$word];
        }
        $hit = $this->findWord($word);

        return $hit ? ['count' => $hit->count, 'bold' => $hit->bold] : false;
    }

    /**
     * Decide whether an ending may be applied to a base of the given type.
     *
     * @param array       $ending  one row of CaseEnding::$ending
     * @param string|null $type    the base type requested by the caller
     * @param string      $grammar required grammar prefix (noun bases only)
     * @return bool
     */
    private function endingFitsBaseType($ending, $type, $grammar)
    {
        switch ($type) {
            case '.n:base.':
                // Prefix test via strncmp: unlike strpos() it accepts an empty
                // prefix on every PHP version, so the default '' matches all nouns.
                $prefix = (string) $grammar;

                return $ending[2] === '.n.'
                    && strncmp($ending[3], $prefix, strlen($prefix)) === 0;
            case '.ti:base.':
            case '.adj:base.':
                return $ending[2] === '.ti.' || $ending[2] === '.n.';
            case '.v:base.':
                return $ending[2] === '.v.';
            default:
                return false;
        }
    }
}
|