CaseMan.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  7. class CaseMan
  8. {
  9. /**
  10. * Create a new class instance.
  11. *
  12. * @return void
  13. */
  14. public function __construct()
  15. {
  16. return;
  17. }
  18. /**
  19. * 从词干到单词的变化
  20. *
  21. * @return void
  22. */
  23. public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
  24. {
  25. $newWord = array();
  26. $case = new CaseEnding();
  27. foreach ($case->ending as $ending) {
  28. # code...
  29. if ($ending[4] < $confidence) {
  30. continue;
  31. }
  32. switch ($type) {
  33. case '.n:base.':
  34. if ($ending[2] !== '.n.' || strpos($ending[3], $grammar) !== 0) {
  35. continue 2;
  36. }
  37. break;
  38. case '.ti:base.':
  39. if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') {
  40. continue 2;
  41. }
  42. break;
  43. case '.adj:base.':
  44. if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') {
  45. continue 2;
  46. }
  47. break;
  48. case '.v:base.':
  49. if ($ending[2] !== '.v.') {
  50. continue 2;
  51. }
  52. break;
  53. default:
  54. continue 2;
  55. break;
  56. }
  57. $endingLen = mb_strlen($ending[0], "UTF-8");
  58. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  59. if ($wordEnd === $ending[0]) {
  60. //匹配成功
  61. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  62. //尝试sandhi
  63. //TODO 加两个sandhi
  64. $hasSandhi = false;
  65. foreach ($case->union as $sandhi) {
  66. $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
  67. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  68. if ($sandhiEnd === $sandhi[0]) {
  69. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  70. $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
  71. if ($count) {
  72. $hasSandhi = true;
  73. $newWord[] = [
  74. 'word' => $sandhiWord,
  75. 'ending' => $ending[1],
  76. 'type' => '.un.',
  77. 'grammar' => '',
  78. 'factors' => "{$word}+{$sandhi[2]}",
  79. 'count' => $count->count,
  80. 'bold' => $count->bold
  81. ];
  82. //添加一个去掉ti的数据
  83. if ($sandhi[2] === 'iti') {
  84. $newWord[] = [
  85. 'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
  86. 'ending' => $ending[1],
  87. 'grammar' => $ending[3],
  88. 'factors' => "{$base}+[{$ending[1]}]",
  89. 'count' => $count->count,
  90. 'bold' => $count->bold
  91. ];
  92. }
  93. }
  94. }
  95. }
  96. $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
  97. if ($count || $hasSandhi) {
  98. $newWord[] = [
  99. 'word' => $word,
  100. 'ending' => $ending[1],
  101. 'grammar' => $ending[3],
  102. 'factors' => "{$base}+[{$ending[1]}]",
  103. 'count' => $count ? $count->count : 0,
  104. 'bold' => $count ? $count->bold : 0
  105. ];
  106. }
  107. }
  108. }
  109. return $newWord;
  110. }
  111. private function endingMatch($base, $ending, $array = null)
  112. {
  113. $case = new CaseEnding();
  114. $output = array();
  115. $endingLen = mb_strlen($ending[0], "UTF-8");
  116. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  117. if ($wordEnd === $ending[0]) {
  118. //匹配成功
  119. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  120. if (is_array($array)) {
  121. if (!isset($array[$word])) {
  122. $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
  123. }
  124. } else {
  125. $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
  126. }
  127. if (isset($count) && $count) {
  128. $output[$word] = ["count" => $count->count, "bold" => $count->bold];
  129. } else {
  130. $output[$word] = false;
  131. }
  132. //尝试sandhi
  133. //TODO 加两个sandhi
  134. foreach ($case->union as $sandhi) {
  135. $sandhiLen = strlen($sandhi[0]);
  136. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  137. if ($sandhiEnd === $sandhi[0]) {
  138. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  139. if (is_array($array)) {
  140. if (!isset($array[$sandhiWord])) {
  141. $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
  142. }
  143. } else {
  144. $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
  145. }
  146. if (isset($count) && $count) {
  147. $output[$sandhiWord] = ["count" => $count->count, "bold" => $count->bold];
  148. } else {
  149. $output[$sandhiWord] = false;
  150. }
  151. }
  152. }
  153. }
  154. return $output;
  155. }
  156. /**
  157. * 从词干到单词的变化
  158. *
  159. * @return array
  160. */
  161. public function BaseToWord($base, $confidence = 0.5)
  162. {
  163. $newWord = array();
  164. $case = new CaseEnding();
  165. foreach ($case->ending as $ending) {
  166. # code...
  167. if ($ending[4] < $confidence) {
  168. continue;
  169. }
  170. /*
  171. $matched = $this->endingMatch($base,$ending,$newWord);
  172. foreach ($matched as $key => $new) {
  173. $newWord[$key] = $new;
  174. }
  175. */
  176. $endingLen = mb_strlen($ending[0], "UTF-8");
  177. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  178. if ($wordEnd === $ending[0]) {
  179. //匹配成功
  180. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  181. if (!isset($newWord[$word])) {
  182. $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
  183. if ($count) {
  184. $newWord[$word] = ["count" => $count->count, "bold" => $count->bold];
  185. } else {
  186. $newWord[$word] = false;
  187. }
  188. }
  189. //尝试sandhi
  190. //TODO 加两个sandhi
  191. foreach ($case->union as $sandhi) {
  192. $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
  193. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  194. if ($sandhiEnd === $sandhi[0]) {
  195. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  196. if (!isset($newWord[$sandhiWord])) {
  197. $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
  198. if ($count) {
  199. $newWord[$sandhiWord] = ["count" => $count->count, "bold" => $count->bold];
  200. } else {
  201. $newWord[$sandhiWord] = false;
  202. }
  203. }
  204. }
  205. }
  206. }
  207. }
  208. $result = [];
  209. foreach ($newWord as $key => $value) {
  210. # code...
  211. if ($value !== false) {
  212. $result[] = ['word' => $key, 'ending', "count" => $value["count"], "bold" => $value["bold"]];
  213. }
  214. }
  215. return $result;
  216. }
  217. /**
  218. * 从单词到词干的变化
  219. * 小蝌蚪找妈妈
  220. * @param string $word 输入
  221. * @param int $deep 搜索深度
  222. * @param boolean $verify 是否验证单词存在
  223. * @return array
  224. */
  225. public function WordToBase($word, $deep = 1, $verify = true)
  226. {
  227. $newWords = array();
  228. $newBase = array();
  229. $input[$word] = true;
  230. $case = new CaseEnding();
  231. for ($i = 0; $i < $deep; $i++) {
  232. # code...
  233. foreach ($input as $currWord => $status) {
  234. # code...
  235. if ($status) {
  236. $input[$currWord] = false;
  237. foreach ($case->ending as $ending) {
  238. # code...
  239. if ($ending[4] < 0.5) {
  240. continue;
  241. }
  242. $endingLen = mb_strlen($ending[1], "UTF-8");
  243. $wordEnd = mb_substr($currWord, 0 - $endingLen, null, "UTF-8");
  244. if ($wordEnd === $ending[1]) {
  245. //匹配成功
  246. $base = mb_substr($currWord, 0, mb_strlen($currWord, "UTF-8") - $endingLen, "UTF-8") . $ending[0];
  247. if (!isset($newBase[$base])) {
  248. $newBase[$base] = array();
  249. }
  250. array_push($newBase[$base], [
  251. 'word' => $currWord,
  252. 'type' => $ending[2],
  253. 'grammar' => $ending[3],
  254. 'parent' => $base,
  255. 'factors' => "{$base}+[{$ending[1]}]",
  256. 'confidence' => $ending[4],
  257. ]);
  258. }
  259. }
  260. }
  261. }
  262. foreach ($newBase as $currWord => $value) {
  263. # 把新词加入列表
  264. if (!isset($input[$currWord])) {
  265. $input[$currWord] = true;
  266. }
  267. }
  268. }
  269. if ($verify) {
  270. $output = array();
  271. foreach ($newBase as $base => $rows) {
  272. # code...
  273. if (($verify = $this->VerifyBase($base, $rows)) !== false) {
  274. if (count($verify) > 0) {
  275. $output[$base] = $verify;
  276. }
  277. }
  278. }
  279. if (count($output) == 0) {
  280. //如果验证失败 输出最可能的结果
  281. $short = 10000;
  282. $shortBase = "";
  283. foreach ($newBase as $base => $rows) {
  284. if (mb_strlen($base, "UTF-8") < $short) {
  285. $short = mb_strlen($base, "UTF-8");
  286. $shortBase = $base;
  287. }
  288. }
  289. foreach ($newBase as $base => $rows) {
  290. if ($base == $shortBase) {
  291. $output[$base] = $rows;
  292. }
  293. }
  294. }
  295. return $output;
  296. } else {
  297. return $newBase;
  298. }
  299. }
  300. /**
  301. * 验证base在字典中是否存在
  302. */
  303. public function VerifyBase($base, $rows)
  304. {
  305. #
  306. $output = array();
  307. $dictWords = UserDict::where('word', $base)->select(['type', 'grammar'])->groupBy(['type', 'grammar'])->get();
  308. if (count($dictWords) > 0) {
  309. $newBase[$base] = 1;
  310. $case = array();
  311. //字典中这个拼写的单词的语法信息
  312. foreach ($dictWords as $value) {
  313. # code...
  314. $case["{$value->type}{$value->grammar}"] = 1;
  315. }
  316. foreach ($rows as $value) {
  317. //根据输入的猜测的type,grammar拼接合理的 parent 语法信息
  318. switch ($value['type']) {
  319. case '.n.':
  320. $parentType = '.n:base.';
  321. break;
  322. case '.ti.':
  323. $parentType = '.ti:base.';
  324. break;
  325. case '.v.':
  326. $parentType = '.v:base.';
  327. break;
  328. default:
  329. $parentType = '';
  330. break;
  331. }
  332. if (!empty($value['grammar']) && $value['type'] !== ".v.") {
  333. $arrGrammar = explode('$', $value['grammar']);
  334. $parentType .= $arrGrammar[0];
  335. }
  336. # 只保存语法信息合理的数据
  337. if (isset($case[$parentType])) {
  338. array_push($output, $value);
  339. }
  340. }
  341. return $output;
  342. } else {
  343. return false;
  344. }
  345. }
  346. }