CaseMan.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  /**
   * Inflection helper: derives inflected words from a stem, and candidate
   * stems from an inflected word, using the rule tables exposed by CaseEnding.
   *
   * Rule rows are read positionally throughout this class:
   *   $case->ending rows: [0] stem ending, [1] word ending, [2] type,
   *                       [3] grammar, [4] confidence.
   *   $case->union rows:  [0] suffix to replace, [1] replacement,
   *                       [2] element recorded in the "factors" string.
   */
  class CaseMan
  {
      /**
       * Create a new class instance.
       *
       * @return void
       */
      public function __construct()
      {
      }

      /**
       * Inflect a stem into words, restricted to one stem type and grammar prefix.
       *
       * Only ending rules whose confidence is at least $confidence, whose type
       * matches $type and (for every type except '.v:base.') whose grammar starts
       * with $grammar are applied. An inflected word is reported when it is found
       * in WordIndex, or when at least one sandhi variant of it is.
       *
       * @param string      $base       stem to inflect
       * @param string|null $type       '.n:base.', '.ti:base.', '.adj:base.' or '.v:base.';
       *                                any other value yields an empty result
       * @param string      $grammar    required prefix of the rule grammar ('' accepts all)
       * @param float       $confidence minimum rule confidence
       * @return array list of rows with keys word, ending, grammar, factors, count, bold
       */
      public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
      {
          // Stem type => [ending type it accepts, whether $grammar must prefix the rule grammar].
          $rulesByType = [
              '.n:base.'   => ['.n.', true],
              '.ti:base.'  => ['.ti.', true],
              '.adj:base.' => ['.ti.', true],
              '.v:base.'   => ['.v.', false],
          ];

          $newWord = [];
          if (!is_string($type) || !isset($rulesByType[$type])) {
              return $newWord;
          }
          list($endingType, $checkGrammar) = $rulesByType[$type];

          // strncmp() with a zero length reports a match on every PHP version,
          // whereas strpos() with an empty needle fails before PHP 8.
          $grammar = (string) $grammar;
          $grammarLen = strlen($grammar);

          $case = new CaseEnding();
          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence || $ending[2] !== $endingType) {
                  continue;
              }
              if ($checkGrammar && strncmp($ending[3], $grammar, $grammarLen) !== 0) {
                  continue;
              }

              $word = $this->swapSuffix($base, $ending[0], $ending[1]);
              if ($word === null) {
                  continue;
              }

              // Try sandhi variants of the inflected word.
              // TODO: apply two sandhi rules in sequence.
              $hasSandhi = false;
              foreach ($case->union as $sandhi) {
                  $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                  if ($sandhiWord === null) {
                      continue;
                  }
                  $entry = $this->lookupIndex($sandhiWord);
                  if ($entry) {
                      $hasSandhi = true;
                      $newWord[] = [
                          'word' => $sandhiWord,
                          'ending' => $ending[1],
                          'grammar' => '.un.',
                          'factors' => "{$word}+{$sandhi[2]}",
                          'count' => $entry->count,
                          'bold' => $entry->bold,
                      ];
                  }
              }

              // The plain form is kept even when only a sandhi variant was found,
              // in which case its count and bold are reported as 0.
              $entry = $this->lookupIndex($word);
              if ($entry || $hasSandhi) {
                  $newWord[] = [
                      'word' => $word,
                      'ending' => $ending[1],
                      'grammar' => $ending[3],
                      'factors' => "{$base}+[{$ending[1]}]",
                      'count' => $entry ? $entry->count : 0,
                      'bold' => $entry ? $entry->bold : 0,
                  ];
              }
          }

          return $newWord;
      }

      /**
       * Apply one ending rule to a stem and look up the result and its sandhi variants.
       *
       * Not called from any live code in this class.
       *
       * @param string     $base   stem to inflect
       * @param array      $ending one row of CaseEnding::$ending
       * @param array|null $array  already-resolved words (word => counts|false); words
       *                           present here are not queried again and their stored
       *                           value is returned unchanged
       * @return array word => ['count' => ..., 'bold' => ...] or false when not indexed;
       *               empty when the rule does not apply to $base
       */
      private function endingMatch($base, $ending, $array = null)
      {
          $case = new CaseEnding();
          $output = [];

          $word = $this->swapSuffix($base, $ending[0], $ending[1]);
          if ($word === null) {
              return $output;
          }
          $output[$word] = $this->countsFor($word, $array);

          // Try sandhi variants of the inflected word.
          // TODO: apply two sandhi rules in sequence.
          foreach ($case->union as $sandhi) {
              $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
              if ($sandhiWord !== null) {
                  $output[$sandhiWord] = $this->countsFor($sandhiWord, $array);
              }
          }

          return $output;
      }

      /**
       * Inflect a stem with every ending rule of sufficient confidence and return
       * the resulting words (and their sandhi variants) that exist in WordIndex.
       *
       * @param string $base       stem to inflect
       * @param float  $confidence minimum rule confidence
       * @return array list of rows with keys word, count, bold
       */
      public function BaseToWord($base, $confidence = 0.5)
      {
          // word => counts array, or false when the word is not indexed.
          // Doubles as a cache so each spelling is queried only once.
          $newWord = [];
          $case = new CaseEnding();

          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence) {
                  continue;
              }
              $word = $this->swapSuffix($base, $ending[0], $ending[1]);
              if ($word === null) {
                  continue;
              }

              // The plain form first, then its sandhi variants in rule order.
              // TODO: apply two sandhi rules in sequence.
              $candidates = [$word];
              foreach ($case->union as $sandhi) {
                  $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                  if ($sandhiWord !== null) {
                      $candidates[] = $sandhiWord;
                  }
              }

              foreach ($candidates as $candidate) {
                  if (!isset($newWord[$candidate])) {
                      $newWord[$candidate] = $this->countsFor($candidate, null);
                  }
              }
          }

          $result = [];
          foreach ($newWord as $key => $value) {
              if ($value !== false) {
                  // NOTE(review): the bare 'ending' below is a positional element
                  // (index 0 holding the literal string), not an 'ending' => ... pair.
                  // It looks unfinished but is kept so the row shape does not change.
                  $result[] = ['word' => $key, 'ending', "count" => $value["count"], "bold" => $value["bold"]];
              }
          }

          return $result;
      }

      /**
       * Derive candidate stems from an inflected word by reversing the ending rules.
       *
       * Each round reverses every rule with confidence of at least 0.5 on all words
       * not yet processed; stems found in one round are fed into the next, for
       * $deep rounds in total.
       *
       * @param string $word   inflected word
       * @param int    $deep   number of rounds
       * @param bool   $verify when true, keep only stems confirmed by VerifyBase();
       *                       if none is confirmed, return the shortest stem unverified
       * @return array stem => list of rows with keys word, type, grammar, parent,
       *               factors, confidence
       */
      public function WordToBase($word, $deep = 1, $verify = true)
      {
          $newBase = [];
          // word => true while it still has to be processed.
          $input = [$word => true];
          $case = new CaseEnding();

          for ($i = 0; $i < $deep; $i++) {
              foreach ($input as $currWord => $status) {
                  if (!$status) {
                      continue;
                  }
                  $input[$currWord] = false;

                  foreach ($case->ending as $ending) {
                      // Index 4 is the confidence; index 1 is the word ending itself.
                      if ($ending[4] < 0.5) {
                          continue;
                      }
                      $base = $this->swapSuffix($currWord, $ending[1], $ending[0]);
                      if ($base === null) {
                          continue;
                      }
                      $newBase[$base][] = [
                          'word' => $currWord,
                          'type' => $ending[2],
                          'grammar' => $ending[3],
                          'parent' => $base,
                          'factors' => "{$base}+[{$ending[1]}]",
                          'confidence' => $ending[4],
                      ];
                  }
              }

              // Queue the newly found stems for the next round.
              foreach ($newBase as $currWord => $value) {
                  if (!isset($input[$currWord])) {
                      $input[$currWord] = true;
                  }
              }
          }

          if (!$verify) {
              return $newBase;
          }

          $output = [];
          foreach ($newBase as $base => $rows) {
              $verified = $this->VerifyBase($base, $rows);
              if ($verified !== false && count($verified) > 0) {
                  $output[$base] = $verified;
              }
          }

          if (count($output) == 0) {
              // Verification failed for every stem: fall back to the most likely
              // result, taken to be the shortest stem (first one wins on ties).
              $shortBase = null;
              $shortLen = null;
              foreach ($newBase as $base => $rows) {
                  $len = mb_strlen($base, "UTF-8");
                  if ($shortLen === null || $len < $shortLen) {
                      $shortLen = $len;
                      $shortBase = $base;
                  }
              }
              if ($shortBase !== null) {
                  $output[$shortBase] = $newBase[$shortBase];
              }
          }

          return $output;
      }

      /**
       * Check whether a stem exists in the dictionary and filter the guessed rows
       * down to those whose grammar agrees with a dictionary entry.
       *
       * @param string $base stem spelling to look up in UserDict
       * @param array  $rows guessed rows (as built by WordToBase) for this stem
       * @return array|false matching rows (possibly empty), or false when the
       *                     dictionary has no entry with this spelling
       */
      public function VerifyBase($base, $rows)
      {
          $dictWords = UserDict::where('word', $base)
              ->select(['type', 'grammar'])
              ->groupBy(['type', 'grammar'])
              ->get();
          if (count($dictWords) == 0) {
              return false;
          }

          // Set of "type + grammar" strings the dictionary holds for this spelling.
          $case = [];
          foreach ($dictWords as $value) {
              $case["{$value->type}{$value->grammar}"] = 1;
          }

          // Word type => stem type the parent entry is expected to have.
          $parentTypes = [
              '.n.' => '.n:base.',
              '.ti.' => '.ti:base.',
              '.v.' => '.v:base.',
          ];

          $output = [];
          foreach ($rows as $value) {
              // Build the grammar signature a plausible parent would have from
              // the guessed type and grammar of this row.
              $type = $value['type'];
              $parentType = (is_string($type) && isset($parentTypes[$type])) ? $parentTypes[$type] : '';
              if (!empty($value['grammar']) && $type !== ".v.") {
                  // Only the first '$'-separated grammar element is appended.
                  $arrGrammar = explode('$', $value['grammar']);
                  $parentType .= $arrGrammar[0];
              }

              // Keep only rows whose signature the dictionary confirms.
              if (isset($case[$parentType])) {
                  $output[] = $value;
              }
          }

          return $output;
      }

      /**
       * Replace the suffix $from at the end of $text with $to.
       *
       * All lengths are measured in UTF-8 characters, never bytes.
       * NOTE(review): an empty $from only matches an empty $text, because a zero
       * offset makes mb_substr() return the whole string. Confirm whether rules
       * with an empty suffix are meant to apply to every word.
       *
       * @param string $text
       * @param string $from suffix that must be present
       * @param string $to   replacement suffix
       * @return string|null the rewritten text, or null when $text does not end in $from
       */
      private function swapSuffix($text, $from, $to)
      {
          $fromLen = mb_strlen($from, "UTF-8");
          if (mb_substr($text, 0 - $fromLen, null, "UTF-8") !== $from) {
              return null;
          }

          return mb_substr($text, 0, mb_strlen($text, "UTF-8") - $fromLen, "UTF-8") . $to;
      }

      /**
       * Fetch the count and bold columns of a word from WordIndex.
       *
       * @param string $word
       * @return mixed the first matching row, or null when the word is not indexed
       */
      private function lookupIndex($word)
      {
          return WordIndex::where('word', $word)->select(['count', 'bold'])->first();
      }

      /**
       * Resolve the counts of a word, reusing an already-known value when given.
       *
       * @param string     $word
       * @param array|null $known word => counts|false map of earlier results
       * @return array|false ['count' => ..., 'bold' => ...], or false when not indexed
       */
      private function countsFor($word, $known)
      {
          if (is_array($known) && isset($known[$word])) {
              return $known[$word];
          }
          $entry = $this->lookupIndex($word);

          return $entry ? ["count" => $entry->count, "bold" => $entry->bold] : false;
      }
  }