CaseMan.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  7. class CaseMan
  8. {
  9. /**
  10. * Create a new class instance.
  11. *
  12. * @return void
  13. */
  14. public function __construct()
  15. {
  16. return;
  17. }
  18. /**
  19. * 从词干到单词的变化
  20. *
  21. * @return void
  22. */
  23. public function Declension($base,$type=null,$grammar='',$confidence=0.5){
  24. $newWord = array();
  25. $case = new CaseEnding();
  26. foreach ($case->ending as $ending) {
  27. # code...
  28. if($ending[4]<$confidence){
  29. continue;
  30. }
  31. switch ($type) {
  32. case '.n:base.':
  33. if($ending[2] !== '.n.' || strpos($ending[3],$grammar)!==0){continue 2;}
  34. break;
  35. case '.ti:base.':
  36. if($ending[2] !== '.ti.' && $ending[2] !== '.n.' ){continue 2;}
  37. break;
  38. case '.adj:base.':
  39. if($ending[2] !== '.ti.' && $ending[2] !== '.n.' ){continue 2;}
  40. break;
  41. case '.v:base.':
  42. if($ending[2] !== '.v.'){continue 2;}
  43. break;
  44. default:
  45. continue 2;
  46. break;
  47. }
  48. $endingLen = mb_strlen($ending[0], "UTF-8");
  49. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  50. if ($wordEnd === $ending[0]) {
  51. //匹配成功
  52. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  53. //尝试sandhi
  54. //TODO 加两个sandhi
  55. $hasSandhi = false;
  56. foreach ($case->union as $sandhi) {
  57. $sandhiLen = strlen($sandhi[0]);
  58. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  59. if ($sandhiEnd === $sandhi[0]) {
  60. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  61. $count = WordIndex::where('word',$sandhiWord)->select(['count','bold'])->first();
  62. if($count){
  63. $hasSandhi = true;
  64. $newWord[] = ['word'=>$sandhiWord,
  65. 'ending'=>$ending[1],
  66. 'type'=>'.un.',
  67. 'grammar'=>'',
  68. 'factors'=>"{$word}+{$sandhi[2]}",
  69. 'count'=>$count->count,
  70. 'bold'=>$count->bold
  71. ];
  72. }
  73. }
  74. }
  75. $count = WordIndex::where('word',$word)->select(['count','bold'])->first();
  76. if($count || $hasSandhi){
  77. $newWord[] = ['word'=>$word,
  78. 'ending'=>$ending[1],
  79. 'grammar'=>$ending[3],
  80. 'factors'=>"{$base}+[{$ending[1]}]",
  81. 'count'=>$count?$count->count:0,
  82. 'bold'=>$count?$count->bold:0
  83. ];
  84. }
  85. }
  86. }
  87. return $newWord;
  88. }
  89. private function endingMatch($base,$ending,$array=null){
  90. $case = new CaseEnding();
  91. $output = array();
  92. $endingLen = mb_strlen($ending[0], "UTF-8");
  93. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  94. if ($wordEnd === $ending[0]) {
  95. //匹配成功
  96. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  97. if(is_array($array)){
  98. if(!isset($array[$word])){
  99. $count = WordIndex::where('word',$word)->select(['count','bold'])->first();
  100. }
  101. }else{
  102. $count = WordIndex::where('word',$word)->select(['count','bold'])->first();
  103. }
  104. if(isset($count) && $count){
  105. $output[$word] = ["count"=>$count->count,"bold"=>$count->bold];
  106. }else{
  107. $output[$word] = false;
  108. }
  109. //尝试sandhi
  110. //TODO 加两个sandhi
  111. foreach ($case->union as $sandhi) {
  112. $sandhiLen = strlen($sandhi[0]);
  113. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  114. if ($sandhiEnd === $sandhi[0]) {
  115. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  116. if(is_array($array)){
  117. if(!isset($array[$sandhiWord])){
  118. $count = WordIndex::where('word',$sandhiWord)->select(['count','bold'])->first();
  119. }
  120. }else{
  121. $count = WordIndex::where('word',$sandhiWord)->select(['count','bold'])->first();
  122. }
  123. if(isset($count) && $count){
  124. $output[$sandhiWord] = ["count"=>$count->count,"bold"=>$count->bold];
  125. }else{
  126. $output[$sandhiWord] = false;
  127. }
  128. }
  129. }
  130. }
  131. return $output;
  132. }
  133. /**
  134. * 从词干到单词的变化
  135. *
  136. * @return void
  137. */
  138. public function BaseToWord($base,$confidence=0.5){
  139. $newWord = array();
  140. $case = new CaseEnding();
  141. foreach ($case->ending as $ending) {
  142. # code...
  143. if($ending[4]<$confidence){
  144. continue;
  145. }
  146. /*
  147. $matched = $this->endingMatch($base,$ending,$newWord);
  148. foreach ($matched as $key => $new) {
  149. $newWord[$key] = $new;
  150. }
  151. */
  152. $endingLen = mb_strlen($ending[0], "UTF-8");
  153. $wordEnd = mb_substr($base, 0 - $endingLen, null, "UTF-8");
  154. if ($wordEnd === $ending[0]) {
  155. //匹配成功
  156. $word = mb_substr($base, 0, mb_strlen($base, "UTF-8") - $endingLen, "UTF-8") . $ending[1];
  157. if(!isset($newWord[$word])){
  158. $count = WordIndex::where('word',$word)->select(['count','bold'])->first();
  159. if($count){
  160. $newWord[$word] = ["count"=>$count->count,"bold"=>$count->bold];
  161. }else{
  162. $newWord[$word] = false;
  163. }
  164. }
  165. //尝试sandhi
  166. //TODO 加两个sandhi
  167. foreach ($case->union as $sandhi) {
  168. $sandhiLen = strlen($sandhi[0]);
  169. $sandhiEnd = mb_substr($word, 0 - $sandhiLen, null, "UTF-8");
  170. if ($sandhiEnd === $sandhi[0]) {
  171. $sandhiWord = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $sandhiLen, "UTF-8") . $sandhi[1];
  172. if(!isset($newWord[$sandhiWord])){
  173. $count = WordIndex::where('word',$sandhiWord)->select(['count','bold'])->first();
  174. if($count){
  175. $newWord[$sandhiWord] = ["count"=>$count->count,"bold"=>$count->bold];
  176. }else{
  177. $newWord[$sandhiWord] = false;
  178. }
  179. }
  180. }
  181. }
  182. }
  183. }
  184. $result = [];
  185. foreach ($newWord as $key => $value) {
  186. # code...
  187. if($value !== false){
  188. $result[] = ['word'=>$key,'ending',"count"=>$value["count"],"bold"=>$value["bold"]];
  189. }
  190. }
  191. return $result;
  192. }
  193. /**
  194. * 从单词到词干的变化
  195. * 小蝌蚪找妈妈
  196. * @return void
  197. */
  198. public function WordToBase($word,$deep=1,$verify=true){
  199. $newWords = array();
  200. $newBase = array();
  201. $input[$word] = true;
  202. $case = new CaseEnding();
  203. for ($i=0; $i < $deep; $i++) {
  204. # code...
  205. foreach ($input as $currWord => $status) {
  206. # code...
  207. if($status){
  208. $input[$currWord] = false;
  209. foreach ($case->ending as $ending) {
  210. # code...
  211. if($ending[4] < 0.5){
  212. continue;
  213. }
  214. $endingLen = mb_strlen($ending[1], "UTF-8");
  215. $wordEnd = mb_substr($currWord, 0 - $endingLen, null, "UTF-8");
  216. if ($wordEnd === $ending[1]) {
  217. //匹配成功
  218. $base = mb_substr($currWord, 0, mb_strlen($currWord, "UTF-8") - $endingLen, "UTF-8") . $ending[0];
  219. if(!isset($newBase[$base])){
  220. $newBase[$base] = array();
  221. }
  222. array_push($newBase[$base],[
  223. 'word'=>$currWord,
  224. 'type'=>$ending[2],
  225. 'grammar'=>$ending[3],
  226. 'parent'=>$base,
  227. 'factors'=>"{$base}+[{$ending[1]}]",
  228. 'confidence'=>$ending[4],
  229. ]);
  230. }
  231. }
  232. }
  233. }
  234. foreach ($newBase as $currWord => $value) {
  235. # 把新词加入列表
  236. if(!isset($input[$currWord])){
  237. $input[$currWord] = true;
  238. }
  239. }
  240. }
  241. if($verify){
  242. $output = array();
  243. foreach ($newBase as $base => $rows) {
  244. # code...
  245. if(($verify = $this->VerifyBase($base,$rows)) !== false){
  246. if(count($verify)>0){
  247. $output[$base] = $verify;
  248. }
  249. }
  250. }
  251. if(count($output)==0){
  252. //如果验证失败 输出最可能的结果
  253. $short = 10000;
  254. $shortBase = "";
  255. foreach ($newBase as $base => $rows) {
  256. if(mb_strlen($base,"UTF-8") < $short){
  257. $short = mb_strlen($base,"UTF-8");
  258. $shortBase = $base;
  259. }
  260. }
  261. foreach ($newBase as $base => $rows) {
  262. if($base == $shortBase){
  263. $output[$base] = $rows;
  264. }
  265. }
  266. }
  267. return $output;
  268. }else{
  269. return $newBase;
  270. }
  271. }
  272. /**
  273. * 验证base在字典中是否存在
  274. */
  275. public function VerifyBase($base,$rows){
  276. #
  277. $output = array();
  278. $dictWords = UserDict::where('word',$base)->select(['type','grammar'])->groupBy(['type','grammar'])->get();
  279. if(count($dictWords)>0){
  280. $newBase[$base] = 1;
  281. $case = array();
  282. //字典中这个拼写的单词的语法信息
  283. foreach ($dictWords as $value) {
  284. # code...
  285. $case["{$value->type}{$value->grammar}"] = 1;
  286. }
  287. foreach ($rows as $value) {
  288. //根据输入的猜测的type,grammar拼接合理的 parent 语法信息
  289. switch ($value['type']) {
  290. case '.n.':
  291. $parentType = '.n:base.';
  292. break;
  293. case '.ti.':
  294. $parentType = '.ti:base.';
  295. break;
  296. case '.v.':
  297. $parentType = '.v:base.';
  298. break;
  299. default:
  300. $parentType = '';
  301. break;
  302. }
  303. if(!empty($value['grammar']) && $value['type'] !== ".v."){
  304. $arrGrammar = explode('$',$value['grammar']);
  305. $parentType .= $arrGrammar[0];
  306. }
  307. # 只保存语法信息合理的数据
  308. if(isset($case[$parentType])){
  309. array_push($output,$value);
  310. }
  311. }
  312. return $output;
  313. }else{
  314. return false;
  315. }
  316. }
  317. }