CaseMan.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  /**
   * Converts between stems ("bases") and inflected word forms using the
   * suffix tables exposed by CaseEnding.
   *
   * Table layout, as read by this class:
   *  - CaseEnding::$ending rows: [0] stem-side suffix, [1] word-side suffix,
   *    [2] type ('.n.', '.ti.', '.v.'), [3] grammar string, [4] confidence.
   *  - CaseEnding::$union rows (sandhi): [0] suffix to match, [1] replacement,
   *    [2] label of the joined element (compared against 'iti').
   *
   * Generated forms are checked against WordIndex (corpus counts) and stems
   * against UserDict.
   */
  class CaseMan
  {
      /**
       * Create a new class instance.
       *
       * The class keeps no state; the constructor only exists so that
       * `new CaseMan()` keeps working for existing callers.
       */
      public function __construct()
      {
      }

      /**
       * Inflect a stem: build the word forms of $base that the ending table
       * allows for the given stem type and that exist in WordIndex.
       *
       * For every applicable ending the stem-side suffix is replaced by the
       * word-side suffix. Each sandhi rule is then tried on the resulting word.
       * A plain form is reported when it is found in WordIndex or when at
       * least one of its sandhi variants was found.
       *
       * @param string      $base       Stem to inflect.
       * @param string|null $type       '.n:base.', '.ti:base.', '.adj:base.' or '.v:base.';
       *                                any other value yields an empty result.
       * @param string      $grammar    Only used for '.n:base.': prefix the ending's grammar
       *                                string must start with. '' means "no filter".
       * @param float       $confidence Endings whose confidence is below this are ignored.
       *
       * @return array List of rows with keys 'word', 'ending', 'grammar', 'factors',
       *               'count', 'bold'; sandhi rows additionally carry 'type' => '.un.'.
       */
      public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
      {
          $forms = [];
          $case = new CaseEnding();
          $grammar = (string) $grammar;

          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence) {
                  continue;
              }
              if (!$this->endingAppliesToType($ending, $type, $grammar)) {
                  continue;
              }

              $word = $this->replaceSuffix($base, $ending[0], $ending[1]);
              if ($word === null) {
                  continue;
              }

              // Try sandhi on the freshly built form.
              // TODO: add two more sandhi.
              $hasSandhi = false;
              foreach ($case->union as $sandhi) {
                  $sandhiWord = $this->replaceSuffix($word, $sandhi[0], $sandhi[1]);
                  if ($sandhiWord === null) {
                      continue;
                  }

                  $hit = $this->lookupWord($sandhiWord);
                  if (!$hit) {
                      continue;
                  }

                  $hasSandhi = true;
                  $forms[] = [
                      'word' => $sandhiWord,
                      'ending' => $ending[1],
                      'type' => '.un.',
                      'grammar' => '',
                      'factors' => "{$word}+{$sandhi[2]}",
                      'count' => $hit->count,
                      'bold' => $hit->bold,
                  ];

                  // Also add an entry with the trailing "ti" (last two characters) removed.
                  if ($sandhi[2] === 'iti') {
                      $forms[] = [
                          'word' => mb_substr($sandhiWord, 0, -2, 'UTF-8'),
                          'ending' => $ending[1],
                          'grammar' => $ending[3],
                          'factors' => "{$base}+[{$ending[1]}]",
                          'count' => $hit->count,
                          'bold' => $hit->bold,
                      ];
                  }
              }

              $hit = $this->lookupWord($word);
              if ($hit || $hasSandhi) {
                  $forms[] = [
                      'word' => $word,
                      'ending' => $ending[1],
                      'grammar' => $ending[3],
                      'factors' => "{$base}+[{$ending[1]}]",
                      'count' => $hit ? $hit->count : 0,
                      'bold' => $hit ? $hit->bold : 0,
                  ];
              }
          }

          return $forms;
      }

      /**
       * Apply one ending row to $base and look up the resulting word and all
       * of its sandhi variants in WordIndex.
       *
       * Words that are already keys of $array (or were already produced during
       * this call) are skipped, so merging the result into $array never
       * overwrites an earlier lookup and never repeats a query.
       *
       * @param string     $base        Stem to inflect.
       * @param array      $ending      One row of CaseEnding::$ending.
       * @param array|null $array       Already known words (word => info|false), or null.
       * @param array|null $sandhiRules Rows of CaseEnding::$union; loaded from a new
       *                                CaseEnding when null.
       *
       * @return array Map word => ['count' => ..., 'bold' => ..., 'ending' => word-side suffix],
       *               or word => false when the word is not in WordIndex.
       */
      private function endingMatch($base, $ending, $array = null, $sandhiRules = null)
      {
          $output = [];

          $word = $this->replaceSuffix($base, $ending[0], $ending[1]);
          if ($word === null) {
              return $output;
          }

          if ($sandhiRules === null) {
              $sandhiRules = (new CaseEnding())->union;
          }
          $known = is_array($array) ? $array : [];

          // The plain form first, then every sandhi variant in table order.
          // TODO: add two more sandhi.
          $candidates = [$word];
          foreach ($sandhiRules as $sandhi) {
              $sandhiWord = $this->replaceSuffix($word, $sandhi[0], $sandhi[1]);
              if ($sandhiWord !== null) {
                  $candidates[] = $sandhiWord;
              }
          }

          foreach ($candidates as $candidate) {
              if (isset($known[$candidate]) || isset($output[$candidate])) {
                  continue;
              }

              // A fresh lookup per candidate: a result must never leak from
              // one word to the next.
              $hit = $this->lookupWord($candidate);
              $output[$candidate] = $hit
                  ? ['count' => $hit->count, 'bold' => $hit->bold, 'ending' => $ending[1]]
                  : false;
          }

          return $output;
      }

      /**
       * Inflect a stem without restricting the ending type: every ending with
       * sufficient confidence is tried, together with the sandhi variants of
       * each resulting word, and the forms present in WordIndex are returned.
       *
       * @param string $base       Stem to inflect.
       * @param float  $confidence Endings whose confidence is below this are ignored.
       *
       * @return array List of rows ['word' => ..., 'ending' => ..., 'count' => ..., 'bold' => ...],
       *               one per distinct word, in order of first appearance. 'ending' is the
       *               word-side suffix of the first ending that produced the word.
       */
      public function BaseToWord($base, $confidence = 0.5)
      {
          $case = new CaseEnding();
          $seen = [];

          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence) {
                  continue;
              }

              // endingMatch() only returns words not yet in $seen, so the
              // array union simply appends the new lookups.
              $seen += $this->endingMatch($base, $ending, $seen, $case->union);
          }

          $result = [];
          foreach ($seen as $word => $info) {
              if ($info === false) {
                  continue;
              }
              $result[] = [
                  // PHP turns numeric-string array keys into ints; hand back a string.
                  'word' => (string) $word,
                  'ending' => $info['ending'],
                  'count' => $info['count'],
                  'bold' => $info['bold'],
              ];
          }

          return $result;
      }

      /**
       * Derive candidate stems from an inflected word (the reverse of
       * BaseToWord): wherever the word ends in an ending's word-side suffix,
       * that suffix is replaced by the stem-side suffix.
       *
       * With $deep > 1 the stems found in one pass are themselves treated as
       * words in the next pass; every word is expanded only once.
       *
       * When $verify is true, each stem is filtered through VerifyBase(). If
       * no stem survives, the shortest candidate stem (the first one on ties)
       * is returned with its unverified rows as a best guess.
       *
       * @param string $word       Inflected word.
       * @param int    $deep       Number of expansion passes.
       * @param bool   $verify     Whether to validate the stems against UserDict.
       * @param float  $confidence Endings whose confidence is below this are ignored.
       *
       * @return array Map stem => list of rows with keys 'word', 'type', 'grammar',
       *               'parent', 'factors', 'confidence'.
       */
      public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
      {
          $case = new CaseEnding();
          $newBase = [];
          // word => true while the word still has to be expanded.
          $pending = [$word => true];

          for ($pass = 0; $pass < $deep; $pass++) {
              foreach ($pending as $currWord => $todo) {
                  if (!$todo) {
                      continue;
                  }
                  $pending[$currWord] = false;
                  // PHP turns numeric-string array keys into ints; work on a string.
                  $currWord = (string) $currWord;

                  foreach ($case->ending as $ending) {
                      if ($ending[4] < $confidence) {
                          continue;
                      }

                      $base = $this->replaceSuffix($currWord, $ending[1], $ending[0]);
                      if ($base === null) {
                          continue;
                      }

                      $newBase[$base][] = [
                          'word' => $currWord,
                          'type' => $ending[2],
                          'grammar' => $ending[3],
                          'parent' => $base,
                          'factors' => "{$base}+[{$ending[1]}]",
                          'confidence' => $ending[4],
                      ];
                  }
              }

              // Queue the newly found stems for the next pass.
              foreach ($newBase as $candidate => $rows) {
                  if (!isset($pending[$candidate])) {
                      $pending[$candidate] = true;
                  }
              }
          }

          if (!$verify) {
              return $newBase;
          }

          $output = [];
          foreach ($newBase as $base => $rows) {
              $confirmed = $this->VerifyBase($base, $rows);
              if ($confirmed !== false && count($confirmed) > 0) {
                  $output[$base] = $confirmed;
              }
          }

          if (count($output) === 0) {
              // Verification failed for every stem: output the most likely
              // result, i.e. the shortest candidate stem.
              $shortBase = null;
              $shortLen = null;
              foreach ($newBase as $base => $rows) {
                  $len = mb_strlen((string) $base, 'UTF-8');
                  if ($shortLen === null || $len < $shortLen) {
                      $shortLen = $len;
                      $shortBase = $base;
                  }
              }
              if ($shortBase !== null) {
                  $output[$shortBase] = $newBase[$shortBase];
              }
          }

          return $output;
      }

      /**
       * Verify that a stem exists in the dictionary and keep only the guessed
       * rows whose grammar is compatible with a dictionary entry.
       *
       * For each row the expected parent key is built from the row's type
       * ('.n.' => '.n:base.', '.ti.' => '.ti:base.', '.v.' => '.v:base.',
       * anything else => '') followed, for non-verbs with a non-empty grammar,
       * by the first '$'-separated segment of the grammar. The row is kept
       * when UserDict has an entry for $base whose type and grammar
       * concatenate to exactly that key.
       *
       * @param string $base Stem spelling to look up in UserDict.
       * @param array  $rows Rows as produced by WordToBase() (needs 'type' and 'grammar').
       *
       * @return array|false The compatible rows (possibly empty), or false when
       *                     $base is not in UserDict at all.
       */
      public function VerifyBase($base, $rows)
      {
          $dictWords = UserDict::where('word', $base)
              ->select(['type', 'grammar'])
              ->groupBy(['type', 'grammar'])
              ->get();

          if (count($dictWords) === 0) {
              return false;
          }

          // Grammar signatures of the dictionary entries with this spelling.
          $known = [];
          foreach ($dictWords as $entry) {
              $known["{$entry->type}{$entry->grammar}"] = true;
          }

          $parentTypes = [
              '.n.' => '.n:base.',
              '.ti.' => '.ti:base.',
              '.v.' => '.v:base.',
          ];

          $output = [];
          foreach ($rows as $row) {
              // Build the plausible parent signature from the guessed type and grammar.
              $parentType = $parentTypes[$row['type']] ?? '';
              if (!empty($row['grammar']) && $row['type'] !== '.v.') {
                  $parentType .= explode('$', $row['grammar'])[0];
              }

              // Keep only rows whose grammar information is plausible.
              if (isset($known[$parentType])) {
                  $output[] = $row;
              }
          }

          return $output;
      }

      /**
       * Tell whether an ending row may be applied to a stem of the given type.
       *
       * Noun stems take '.n.' endings whose grammar starts with $grammar (an
       * empty $grammar accepts every noun ending); '.ti:base.' and
       * '.adj:base.' stems take '.ti.' and '.n.' endings; verb stems take
       * '.v.' endings. Any other stem type accepts nothing.
       *
       * @param array       $ending  One row of CaseEnding::$ending.
       * @param string|null $type    Stem type.
       * @param string      $grammar Required grammar prefix for noun stems.
       *
       * @return bool
       */
      private function endingAppliesToType($ending, $type, $grammar)
      {
          switch ($type) {
              case '.n:base.':
                  if ($ending[2] !== '.n.') {
                      return false;
                  }
                  // Explicit prefix test: strpos() with an empty needle warns
                  // and returns false before PHP 8, which rejected every ending.
                  return $grammar === ''
                      || strncmp((string) $ending[3], $grammar, strlen($grammar)) === 0;
              case '.ti:base.':
              case '.adj:base.':
                  return $ending[2] === '.ti.' || $ending[2] === '.n.';
              case '.v:base.':
                  return $ending[2] === '.v.';
              default:
                  return false;
          }
      }

      /**
       * Replace a trailing $suffix of $subject by $replacement (UTF-8 aware).
       *
       * An empty $suffix only matches an empty $subject, because mb_substr()
       * with offset -0 returns the whole string; this is the behaviour the
       * inline comparisons always had and it is kept on purpose.
       *
       * @param string $subject
       * @param string $suffix
       * @param string $replacement
       *
       * @return string|null The rewritten string, or null when $subject does not
       *                     end in $suffix.
       */
      private function replaceSuffix($subject, $suffix, $replacement)
      {
          $suffixLen = mb_strlen($suffix, 'UTF-8');
          if (mb_substr($subject, 0 - $suffixLen, null, 'UTF-8') !== $suffix) {
              return null;
          }

          $stemLen = mb_strlen($subject, 'UTF-8') - $suffixLen;

          return mb_substr($subject, 0, $stemLen, 'UTF-8') . $replacement;
      }

      /**
       * Fetch the 'count' and 'bold' columns of a word from WordIndex.
       *
       * @param string $word
       *
       * @return WordIndex|null The first matching row, or null when the word is not indexed.
       */
      private function lookupWord($word)
      {
          return WordIndex::where('word', $word)->select(['count', 'bold'])->first();
      }
  }