CaseMan.php 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  7. class CaseMan
  8. {
  /**
   * Create a new CaseMan instance.
   *
   * The class keeps no state, so there is nothing to initialise.
   */
  public function __construct()
  {
  }
  /**
   * Inflect a stem with the endings that fit its type and grammar, keeping
   * only the resulting words that exist in the word index.
   *
   * Each entry of CaseEnding::$ending is read positionally:
   * [0] ending on the stem, [1] ending on the inflected word, [2] word type,
   * [3] grammar tag, [4] confidence.
   *
   * @param string      $base       Stem to inflect.
   * @param string|null $type       '.n:base.', '.ti:base.', '.adj:base.' or '.v:base.';
   *                                any other value yields an empty list.
   * @param string      $grammar    Prefix that the ending's grammar tag must start with.
   *                                '' accepts every tag. Not applied to '.v:base.'.
   * @param float       $confidence Endings whose confidence is below this value are skipped.
   * @return array List of rows ['word', 'ending', 'grammar', 'count', 'bold'],
   *               one per matching ending whose word is in the index.
   */
  public function Declension($base,$type=null,$grammar='',$confidence=0.5){
    // Stem type => type of the ending rows that may be applied to it.
    // Adjective stems use the same '.ti.' endings as '.ti:base.' stems.
    $endingTypeFor = [
      '.n:base.' => '.n.',
      '.ti:base.' => '.ti.',
      '.adj:base.' => '.ti.',
      '.v:base.' => '.v.',
    ];
    if (!is_string($type) || !isset($endingTypeFor[$type])) {
      return [];
    }
    $wantedType = $endingTypeFor[$type];
    $checkGrammar = ($type !== '.v:base.');
    $grammar = (string) $grammar;
    $grammarLen = strlen($grammar);
    $baseLen = mb_strlen($base, "UTF-8");

    $newWord = [];
    // word => index row (or null); each distinct word is looked up only once.
    $lookedUp = [];
    $case = new CaseEnding();
    foreach ($case->ending as $ending) {
      if ($ending[4] < $confidence || $ending[2] !== $wantedType) {
        continue;
      }
      // Prefix test on the grammar tag. strncmp() replaces strpos() because
      // strpos() with an empty needle returns false (with a warning) before
      // PHP 8, which made the default $grammar = '' reject every ending.
      if ($checkGrammar && $grammarLen > 0 && strncmp($ending[3], $grammar, $grammarLen) !== 0) {
        continue;
      }
      $endingLen = mb_strlen($ending[0], "UTF-8");
      // NOTE(review): for an empty $ending[0] the offset is 0, so mb_substr()
      // returns the whole stem and only an empty stem matches.
      if (mb_substr($base, 0 - $endingLen, null, "UTF-8") !== $ending[0]) {
        continue;
      }
      // Swap the stem ending for the inflected ending.
      $word = mb_substr($base, 0, $baseLen - $endingLen, "UTF-8") . $ending[1];
      if (!array_key_exists($word, $lookedUp)) {
        $lookedUp[$word] = WordIndex::where('word',$word)->select(['count','bold'])->first();
      }
      $indexRow = $lookedUp[$word];
      if ($indexRow) {
        $newWord[] = [
          'word'=>$word,
          'ending'=>$ending[1],
          'grammar'=>$ending[3],
          'count'=>$indexRow->count,
          'bold'=>$indexRow->bold,
        ];
      }
    }
    return $newWord;
  }
  /**
   * Inflect a stem with every ending that fits it, whatever the word type,
   * and return the distinct resulting words that exist in the word index.
   *
   * Each entry of CaseEnding::$ending is read positionally:
   * [0] ending on the stem, [1] ending on the inflected word, [4] confidence.
   *
   * @param string $base       Stem to inflect.
   * @param float  $confidence Endings whose confidence is below this value are skipped.
   * @return array List of rows ['word', 'ending', 'count', 'bold'] in order of
   *               first appearance. 'ending' is the inflected ending of the
   *               first rule that produced the word.
   */
  public function BaseToWord($base,$confidence=0.5){
    $baseLen = mb_strlen($base, "UTF-8");
    $result = [];
    // Words already looked up (hit or miss), so each word is queried only once.
    $seen = [];
    $case = new CaseEnding();
    foreach ($case->ending as $ending) {
      if ($ending[4] < $confidence) {
        continue;
      }
      $endingLen = mb_strlen($ending[0], "UTF-8");
      if (mb_substr($base, 0 - $endingLen, null, "UTF-8") !== $ending[0]) {
        continue;
      }
      // Swap the stem ending for the inflected ending.
      $word = mb_substr($base, 0, $baseLen - $endingLen, "UTF-8") . $ending[1];
      if (array_key_exists($word, $seen)) {
        continue;
      }
      $seen[$word] = true;
      $indexRow = WordIndex::where('word',$word)->select(['count','bold'])->first();
      if ($indexRow) {
        // The original emitted the bare string 'ending' under key 0 here;
        // the row now carries the real suffix under the 'ending' key.
        $result[] = [
          'word'=>$word,
          'ending'=>$ending[1],
          "count"=>$indexRow->count,
          "bold"=>$indexRow->bold,
        ];
      }
    }
    return $result;
  }
  /**
   * Work backwards from an inflected word to its candidate stems.
   *
   * Each entry of CaseEnding::$ending is read positionally:
   * [0] ending on the stem, [1] ending on the inflected word, [2] word type,
   * [3] grammar tag, [4] confidence.
   *
   * @param string $word   Inflected word to analyse.
   * @param int    $deep   Number of rounds. Every round after the first also
   *                       analyses the stems found in earlier rounds.
   * @param bool   $verify When true, keep only the rows that VerifyBase()
   *                       accepts. If nothing is accepted, return the
   *                       unverified rows of the shortest candidate stem.
   * @return array Map of stem => list of rows ['word', 'type', 'grammar',
   *               'parent', 'factors', 'confidence'].
   */
  public function WordToBase($word,$deep=1,$verify=true){
    $newBase = [];
    // word => true while it still has to be analysed, false once it has been.
    $pending = [$word => true];
    $case = new CaseEnding();
    for ($round = 0; $round < $deep; $round++) {
      foreach ($pending as $currWord => $isNew) {
        if (!$isNew) {
          continue;
        }
        $pending[$currWord] = false;
        // Numeric-looking array keys come back as integers.
        $currWord = (string) $currWord;
        $wordLen = mb_strlen($currWord, "UTF-8");
        foreach ($case->ending as $ending) {
          // Skip low-confidence endings. The confidence is column [4]; the
          // original compared column [1] (the suffix text) with 0.5.
          if ($ending[4] < 0.5) {
            continue;
          }
          $endingLen = mb_strlen($ending[1], "UTF-8");
          if (mb_substr($currWord, 0 - $endingLen, null, "UTF-8") !== $ending[1]) {
            continue;
          }
          // Swap the inflected ending for the stem ending.
          $base = mb_substr($currWord, 0, $wordLen - $endingLen, "UTF-8") . $ending[0];
          $newBase[$base][] = [
            'word'=>$currWord,
            'type'=>$ending[2],
            'grammar'=>$ending[3],
            'parent'=>$base,
            'factors'=>"{$base}+[{$ending[1]}]",
            'confidence'=>$ending[4],
          ];
        }
      }
      // Queue newly found stems for the next round.
      foreach ($newBase as $candidate => $unused) {
        if (!isset($pending[$candidate])) {
          $pending[$candidate] = true;
        }
      }
    }

    if (!$verify) {
      return $newBase;
    }

    $output = [];
    foreach ($newBase as $base => $rows) {
      $confirmed = $this->VerifyBase($base,$rows);
      if ($confirmed !== false && count($confirmed) > 0) {
        $output[$base] = $confirmed;
      }
    }
    if (count($output) === 0) {
      // Verification failed: fall back to the most likely result, taken to be
      // the shortest candidate stem (the first one on ties).
      $shortBase = null;
      $shortLen = PHP_INT_MAX;
      foreach ($newBase as $base => $rows) {
        $len = mb_strlen((string) $base, "UTF-8");
        if ($len < $shortLen) {
          $shortLen = $len;
          $shortBase = $base;
        }
      }
      if ($shortBase !== null) {
        $output[$shortBase] = $newBase[$shortBase];
      }
    }
    return $output;
  }
  /**
   * Check whether a stem exists in the dictionary and keep the guessed rows
   * whose grammar agrees with one of its dictionary entries.
   *
   * @param string $base Stem to look up in UserDict.
   * @param array  $rows Guessed rows, each with a 'type' and a 'grammar' key;
   *                     'grammar' is a '$'-separated tag.
   * @return array|false false when the dictionary has no entry spelled $base;
   *                     otherwise the (possibly empty) list of accepted rows.
   */
  public function VerifyBase($base,$rows){
    $dictWords = UserDict::where('word',$base)->select(['type','grammar'])->groupBy(['type','grammar'])->get();
    if (count($dictWords) < 1) {
      return false;
    }

    // Grammar signatures (type followed by grammar) the dictionary holds for this spelling.
    $known = [];
    foreach ($dictWords as $entry) {
      $known["{$entry->type}{$entry->grammar}"] = 1;
    }

    // Guessed word type => type its parent stem is expected to have.
    $parentTypeFor = [
      '.n.' => '.n:base.',
      '.ti.' => '.ti:base.',
      '.v.' => '.v:base.',
    ];

    $output = [];
    foreach ($rows as $row) {
      // Build the signature the parent stem should have for this guess.
      $parentType = isset($parentTypeFor[$row['type']]) ? $parentTypeFor[$row['type']] : '';
      if ($row['type'] !== ".v." && !empty($row['grammar'])) {
        // Non-verbs also carry the first grammar segment.
        $segments = explode('$',$row['grammar']);
        $parentType .= $segments[0];
      }
      // Keep only the guesses whose signature the dictionary confirms.
      if (isset($known[$parentType])) {
        $output[] = $row;
      }
    }
    return $output;
  }
  233. }