CaseMan.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  7. class CaseMan
  8. {
  /**
   * Build a CaseMan helper.
   *
   * The constructor takes no arguments and performs no set-up work.
   */
  public function __construct()
  {
  }
  /**
   * Inflect a stem: list the word forms derivable from $base.
   *
   * Rows of CaseEnding::$ending are read as [0] stem ending to strip, [1] ending
   * to append, [2] type tag, [3] grammar string, [4] confidence. Rows of
   * CaseEnding::$union are read as [0] word ending to strip, [1] replacement,
   * [2] label of the joined element.
   *
   * A plain inflected form is reported when it exists in WordIndex, or when at
   * least one of its sandhi variants does (then with count and bold set to 0).
   *
   * @param string      $base       Stem to inflect.
   * @param string|null $type       One of '.n:base.', '.ti:base.', '.adj:base.', '.v:base.'.
   *                                Any other value (including the default null) yields an empty list.
   * @param string      $grammar    For '.n:base.' only: prefix that the ending's grammar string
   *                                must start with. An empty string accepts every noun ending.
   * @param float       $confidence Endings whose confidence is below this value are skipped.
   * @return array List of rows with keys word, ending, grammar, factors, count, bold;
   *               rows produced by a sandhi rule additionally carry type '.un.'.
   */
  public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
  {
    $newWord = [];
    $case = new CaseEnding();
    $grammarPrefix = (string) $grammar;
    $grammarPrefixLen = strlen($grammarPrefix);

    foreach ($case->ending as $ending) {
      if ($ending[4] < $confidence) {
        continue;
      }

      // Keep only the endings that are compatible with the requested stem type.
      switch ($type) {
        case '.n:base.':
          // strncmp() is used instead of strpos() because strpos() with an empty
          // needle returns false (plus a warning) before PHP 8, which rejected
          // every noun ending when $grammar was left at its default.
          if ($ending[2] !== '.n.' || strncmp($ending[3], $grammarPrefix, $grammarPrefixLen) !== 0) {
            continue 2;
          }
          break;
        case '.ti:base.':
        case '.adj:base.':
          if ($ending[2] !== '.ti.' && $ending[2] !== '.n.') {
            continue 2;
          }
          break;
        case '.v:base.':
          if ($ending[2] !== '.v.') {
            continue 2;
          }
          break;
        default:
          continue 2;
      }

      // NOTE(review): when $ending[0] is '', mb_substr($base, 0) returns the whole
      // stem, so such an ending only matches an empty $base - confirm this is intended.
      $endingLen = mb_strlen($ending[0], 'UTF-8');
      $wordEnd = mb_substr($base, 0 - $endingLen, null, 'UTF-8');
      if ($wordEnd !== $ending[0]) {
        continue;
      }

      // The stem ending matched: swap it for the inflected ending.
      $word = mb_substr($base, 0, mb_strlen($base, 'UTF-8') - $endingLen, 'UTF-8') . $ending[1];

      // Try the sandhi rules on the inflected form.
      // TODO: support two chained sandhi.
      $hasSandhi = false;
      foreach ($case->union as $sandhi) {
        $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
        $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, 'UTF-8');
        if ($sandhiEnd !== $sandhi[0]) {
          continue;
        }
        $sandhiWord = mb_substr($word, 0, mb_strlen($word, 'UTF-8') - $sandhiLen, 'UTF-8') . $sandhi[1];
        $count = WordIndex::where('word', $sandhiWord)->select(['count', 'bold'])->first();
        if (!$count) {
          continue;
        }
        $hasSandhi = true;
        $newWord[] = [
          'word' => $sandhiWord,
          'ending' => $ending[1],
          'type' => '.un.',
          'grammar' => '',
          'factors' => "{$word}+{$sandhi[2]}",
          'count' => $count->count,
          'bold' => $count->bold,
        ];
        // For an "iti" sandhi also emit the form with its last two characters removed.
        if ($sandhi[2] === 'iti') {
          $newWord[] = [
            'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
            'ending' => $ending[1],
            'grammar' => $ending[3],
            'factors' => "{$base}+[{$ending[1]}]",
            'count' => $count->count,
            'bold' => $count->bold,
          ];
        }
      }

      $count = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
      if ($count || $hasSandhi) {
        $newWord[] = [
          'word' => $word,
          'ending' => $ending[1],
          'grammar' => $ending[3],
          'factors' => "{$base}+[{$ending[1]}]",
          'count' => $count ? $count->count : 0,
          'bold' => $count ? $count->bold : 0,
        ];
      }
    }

    return $newWord;
  }
  /**
   * Apply one ending row to a stem and look up the resulting word forms.
   *
   * If $base ends with $ending[0], that ending is replaced by $ending[1]; the
   * resulting word and every sandhi variant of it (per CaseEnding::$union) are
   * looked up in WordIndex.
   *
   * @param string     $base   Stem to inflect.
   * @param array      $ending Ending row; [0] is the stem ending to strip, [1] the ending to append.
   * @param array|null $array  Optional map of already resolved words (word => result). Words
   *                           present in it are not queried again and their stored result is reused.
   * @return array Map word => ['count' => ..., 'bold' => ...], or word => false when the
   *               word is not in WordIndex. Empty when the ending does not match.
   */
  private function endingMatch($base, $ending, $array = null)
  {
    $case = new CaseEnding();
    $output = [];

    $endingLen = mb_strlen($ending[0], 'UTF-8');
    if (mb_substr($base, 0 - $endingLen, null, 'UTF-8') !== $ending[0]) {
      return $output;
    }

    // Resolve each candidate independently so that a result can never leak from
    // one word to the next (the previous shared $count variable allowed that).
    $resolve = function ($candidate) use ($array) {
      if (is_array($array) && isset($array[$candidate])) {
        return $array[$candidate];
      }
      $row = WordIndex::where('word', $candidate)->select(['count', 'bold'])->first();
      return $row ? ['count' => $row->count, 'bold' => $row->bold] : false;
    };

    // The stem ending matched: swap it for the inflected ending.
    $word = mb_substr($base, 0, mb_strlen($base, 'UTF-8') - $endingLen, 'UTF-8') . $ending[1];
    $output[$word] = $resolve($word);

    // Try the sandhi rules on the inflected form.
    // TODO: support two chained sandhi.
    foreach ($case->union as $sandhi) {
      // Character length, not byte length: the slice below is taken with mb_substr().
      $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
      if (mb_substr($word, 0 - $sandhiLen, null, 'UTF-8') !== $sandhi[0]) {
        continue;
      }
      $sandhiWord = mb_substr($word, 0, mb_strlen($word, 'UTF-8') - $sandhiLen, 'UTF-8') . $sandhi[1];
      $output[$sandhiWord] = $resolve($sandhiWord);
    }

    return $output;
  }
  /**
   * Inflect a stem with every known ending and return the forms found in WordIndex.
   *
   * For each row of CaseEnding::$ending whose confidence ([4]) is at least
   * $confidence and whose stem ending ([0]) matches the end of $base, the
   * inflected word (stem ending replaced by [1]) and its sandhi variants (per
   * CaseEnding::$union) are looked up. Each distinct word is queried only once.
   *
   * @param string $base       Stem to inflect.
   * @param float  $confidence Endings whose confidence is below this value are skipped.
   * @return array List of rows with keys word, ending, count, bold. "ending" is the
   *               inflectional ending ([1]) of the first rule that produced the word.
   */
  public function BaseToWord($base, $confidence = 0.5)
  {
    // word => ['ending' => ..., 'count' => ..., 'bold' => ...], or false when not indexed.
    $found = [];
    $case = new CaseEnding();

    foreach ($case->ending as $ending) {
      if ($ending[4] < $confidence) {
        continue;
      }
      $endingLen = mb_strlen($ending[0], 'UTF-8');
      if (mb_substr($base, 0 - $endingLen, null, 'UTF-8') !== $ending[0]) {
        continue;
      }

      // The stem ending matched: swap it for the inflected ending.
      $word = mb_substr($base, 0, mb_strlen($base, 'UTF-8') - $endingLen, 'UTF-8') . $ending[1];
      $candidates = [$word];

      // Try the sandhi rules on the inflected form.
      // TODO: support two chained sandhi.
      foreach ($case->union as $sandhi) {
        $sandhiLen = mb_strlen($sandhi[0], 'UTF-8');
        if (mb_substr($word, 0 - $sandhiLen, null, 'UTF-8') === $sandhi[0]) {
          $candidates[] = mb_substr($word, 0, mb_strlen($word, 'UTF-8') - $sandhiLen, 'UTF-8') . $sandhi[1];
        }
      }

      foreach ($candidates as $candidate) {
        // isset() is also true for a stored false, so misses are not queried again.
        if (isset($found[$candidate])) {
          continue;
        }
        $row = WordIndex::where('word', $candidate)->select(['count', 'bold'])->first();
        $found[$candidate] = $row
          ? ['ending' => $ending[1], 'count' => $row->count, 'bold' => $row->bold]
          : false;
      }
    }

    $result = [];
    foreach ($found as $key => $value) {
      if ($value !== false) {
        // 'ending' is a real key here; it used to be emitted as a bare positional string.
        $result[] = [
          'word' => $key,
          'ending' => $value['ending'],
          'count' => $value['count'],
          'bold' => $value['bold'],
        ];
      }
    }

    return $result;
  }
  /**
   * Reduce an inflected word to its candidate stems (the reverse of inflection).
   *
   * For every row of CaseEnding::$ending whose inflected ending ([1]) matches
   * the end of the word, that ending is replaced by the stem ending ([0]). With
   * $deep > 1 the stems found in one pass are reduced again in the next pass.
   *
   * @param string $word       Word to analyse.
   * @param int    $deep       Number of reduction passes.
   * @param bool   $verify     When true, keep only the stems/rows accepted by VerifyBase();
   *                           if none is accepted, fall back to the first shortest candidate stem.
   * @param float  $confidence Endings whose confidence ([4]) is below this value are skipped.
   * @return array Map stem => list of rows with keys word, type, grammar, parent,
   *               factors, confidence.
   */
  public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
  {
    $newBase = [];
    // word => true while the word still has to be processed, false once done.
    $input = [$word => true];
    $case = new CaseEnding();

    for ($i = 0; $i < $deep; $i++) {
      foreach ($input as $currWord => $pending) {
        if (!$pending) {
          continue;
        }
        $input[$currWord] = false;

        foreach ($case->ending as $ending) {
          if ($ending[4] < $confidence) {
            continue;
          }
          $endingLen = mb_strlen($ending[1], 'UTF-8');
          if (mb_substr($currWord, 0 - $endingLen, null, 'UTF-8') !== $ending[1]) {
            continue;
          }

          // The inflected ending matched: swap it for the stem ending.
          $base = mb_substr($currWord, 0, mb_strlen($currWord, 'UTF-8') - $endingLen, 'UTF-8') . $ending[0];
          $newBase[$base][] = [
            'word' => $currWord,
            'type' => $ending[2],
            'grammar' => $ending[3],
            'parent' => $base,
            'factors' => "{$base}+[{$ending[1]}]",
            'confidence' => $ending[4],
          ];
        }
      }

      // Queue the newly found stems for the next pass.
      foreach (array_keys($newBase) as $candidate) {
        if (!isset($input[$candidate])) {
          $input[$candidate] = true;
        }
      }
    }

    if (!$verify) {
      return $newBase;
    }

    $output = [];
    foreach ($newBase as $base => $rows) {
      // A separate variable is used so the $verify parameter is not overwritten.
      $verified = $this->VerifyBase($base, $rows);
      if ($verified !== false && count($verified) > 0) {
        $output[$base] = $verified;
      }
    }

    if (count($output) === 0) {
      // Verification failed for every stem: return the most likely one,
      // taken to be the first stem with the smallest length.
      $shortBase = null;
      $shortLen = 0;
      foreach (array_keys($newBase) as $base) {
        $len = mb_strlen($base, 'UTF-8');
        if ($shortBase === null || $len < $shortLen) {
          $shortBase = $base;
          $shortLen = $len;
        }
      }
      if ($shortBase !== null) {
        $output[$shortBase] = $newBase[$shortBase];
      }
    }

    return $output;
  }
  /**
   * Check a candidate stem against the user dictionary.
   *
   * @param string $base Candidate stem, looked up in UserDict by its "word" column.
   * @param array  $rows Guessed analyses for the stem; each row provides 'type' and 'grammar'.
   * @return array|false False when the dictionary has no entry for $base; otherwise the
   *                     rows (possibly none) whose expected parent grammar matches an entry.
   */
  public function VerifyBase($base, $rows)
  {
    $dictWords = UserDict::where('word', $base)->select(['type', 'grammar'])->groupBy(['type', 'grammar'])->get();
    if (count($dictWords) < 1) {
      return false;
    }

    // Grammar signatures (type followed by grammar) the dictionary holds for this spelling.
    $known = [];
    foreach ($dictWords as $entry) {
      $known["{$entry->type}{$entry->grammar}"] = 1;
    }

    // Guessed word type => type its parent stem is expected to have.
    $parentTypes = [
      '.n.' => '.n:base.',
      '.ti.' => '.ti:base.',
      '.v.' => '.v:base.',
    ];

    $accepted = [];
    foreach ($rows as $row) {
      // Build the parent signature implied by the guessed type and grammar.
      $signature = isset($parentTypes[$row['type']]) ? $parentTypes[$row['type']] : '';
      if (!empty($row['grammar']) && $row['type'] !== '.v.') {
        // Only the part of the grammar before the first '$' is appended.
        $parts = explode('$', $row['grammar']);
        $signature .= $parts[0];
      }
      // Keep only the rows whose signature the dictionary confirms.
      if (isset($known[$signature])) {
        $accepted[] = $row;
      }
    }

    return $accepted;
  }
  343. }