CaseMan.php 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. <?php
  2. namespace App\Tools;
  3. use Illuminate\Support\Facades\Cache;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\UserDict;
  6. use App\Models\WordIndex;
  /**
   * Converts between word stems ("bases") and inflected word forms using the
   * ending table exposed by CaseEnding.
   *
   * As used in this class, each CaseEnding::$ending row is indexed as
   * [0 => base ending, 1 => word ending, 2 => type, 3 => grammar, 4 => confidence],
   * and each CaseEnding::$union row as [0 => ending to replace, 1 => replacement].
   */
  class CaseMan
  {
      /**
       * Create a new class instance.
       */
      public function __construct()
      {
      }

      /**
       * Inflect a base into the word forms that exist in the word index.
       *
       * Only ending rows whose type matches $type are applied:
       * '.n:base.' uses '.n.' rows, '.ti:base.' and '.adj:base.' use '.ti.' rows,
       * '.v:base.' uses '.v.' rows. Any other $type yields an empty result.
       *
       * @param string      $base       Stem to inflect.
       * @param string|null $type       Base type, one of the values listed above.
       * @param string      $grammar    Prefix the row grammar must start with; an empty
       *                                string disables the filter. Ignored for '.v:base.'.
       * @param float       $confidence Minimum confidence an ending row must have.
       *
       * @return array List of ['word', 'ending', 'grammar', 'count', 'bold'] rows,
       *               one per generated form found in WordIndex.
       */
      public function Declension($base, $type = null, $grammar = '', $confidence = 0.5)
      {
          $typeMap = [
              '.n:base.' => '.n.',
              '.ti:base.' => '.ti.',
              '.adj:base.' => '.ti.',
              '.v:base.' => '.v.',
          ];
          if (!is_string($type) || !isset($typeMap[$type])) {
              return [];
          }

          $base = (string) $base;
          $grammar = (string) $grammar;
          $wantedType = $typeMap[$type];
          // An empty filter means "no filter". Checking it explicitly avoids
          // strpos() with an empty needle, which returns false before PHP 8 and
          // would reject every row.
          $filterGrammar = $type !== '.v:base.' && $grammar !== '';

          $case = new CaseEnding();
          $newWord = [];
          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence) {
                  continue;
              }
              if ($ending[2] !== $wantedType) {
                  continue;
              }
              if ($filterGrammar && strncmp($ending[3], $grammar, strlen($grammar)) !== 0) {
                  continue;
              }

              $word = $this->swapSuffix($base, $ending[0], $ending[1]);
              if ($word === null) {
                  continue;
              }

              $stats = $this->lookupWord($word);
              if ($stats !== false) {
                  $newWord[] = [
                      'word' => $word,
                      'ending' => $ending[1],
                      'grammar' => $ending[3],
                      'count' => $stats['count'],
                      'bold' => $stats['bold'],
                  ];
              }
          }

          return $newWord;
      }

      /**
       * Inflect a base with every sufficiently confident ending, additionally try
       * the sandhi (union) replacements on each generated form, and keep the
       * forms that exist in the word index.
       *
       * @param string $base       Stem to inflect.
       * @param float  $confidence Minimum confidence an ending row must have.
       *
       * @return array List of rows ['word' => ..., 0 => 'ending', 'count' => ..., 'bold' => ...].
       */
      public function BaseToWord($base, $confidence = 0.5)
      {
          $base = (string) $base;
          $case = new CaseEnding();

          // word => ['count', 'bold'] when indexed, false when looked up and missing.
          // Caching misses too keeps each spelling down to a single query.
          $seen = [];
          foreach ($case->ending as $ending) {
              if ($ending[4] < $confidence) {
                  continue;
              }

              $word = $this->swapSuffix($base, $ending[0], $ending[1]);
              if ($word === null) {
                  continue;
              }
              if (!isset($seen[$word])) {
                  $seen[$word] = $this->lookupWord($word);
              }

              // Try sandhi variants of the generated form.
              // TODO: add two more sandhi rules.
              foreach ($case->union as $sandhi) {
                  // Lengths are measured in characters (not bytes) so that rules
                  // containing multibyte letters cut the word at the right place.
                  $sandhiWord = $this->swapSuffix($word, $sandhi[0], $sandhi[1]);
                  if ($sandhiWord !== null && !isset($seen[$sandhiWord])) {
                      $seen[$sandhiWord] = $this->lookupWord($sandhiWord);
                  }
              }
          }

          $result = [];
          foreach ($seen as $word => $stats) {
              if ($stats !== false) {
                  // NOTE(review): the positional 'ending' string (index 0) looks like
                  // a leftover; it is kept so the returned row shape stays unchanged.
                  $result[] = ['word' => $word, 'ending', 'count' => $stats['count'], 'bold' => $stats['bold']];
              }
          }

          return $result;
      }

      /**
       * Derive candidate bases from an inflected word by reversing the ending table.
       *
       * Each round strips endings from every word not yet processed; bases found
       * in one round become input words for the next, up to $deep rounds.
       *
       * @param string $word       Inflected word to analyse.
       * @param int    $deep       Number of rounds to run.
       * @param bool   $verify     When true, keep only bases accepted by VerifyBase();
       *                           if none is accepted, fall back to the shortest candidate.
       * @param float  $confidence Minimum confidence an ending row must have.
       *
       * @return array Map of base => list of
       *               ['word', 'type', 'grammar', 'parent', 'factors', 'confidence'] rows.
       */
      public function WordToBase($word, $deep = 1, $verify = true, $confidence = 0.5)
      {
          $case = new CaseEnding();
          // word => true while it still has to be processed, false afterwards.
          $pending = [$word => true];
          $newBase = [];

          for ($round = 0; $round < $deep; $round++) {
              foreach ($pending as $currWord => $isNew) {
                  if (!$isNew) {
                      continue;
                  }
                  $pending[$currWord] = false;
                  $currWord = (string) $currWord;

                  foreach ($case->ending as $ending) {
                      // Filter on the confidence column (index 4); index 1 is the
                      // ending text and must not be compared with a number.
                      if ($ending[4] < $confidence) {
                          continue;
                      }

                      $base = $this->swapSuffix($currWord, $ending[1], $ending[0]);
                      if ($base === null) {
                          continue;
                      }

                      $newBase[$base][] = [
                          'word' => $currWord,
                          'type' => $ending[2],
                          'grammar' => $ending[3],
                          'parent' => $base,
                          'factors' => "{$base}+[{$ending[1]}]",
                          'confidence' => $ending[4],
                      ];
                  }
              }

              // Queue newly found bases as input for the next round.
              foreach (array_keys($newBase) as $candidate) {
                  if (!isset($pending[$candidate])) {
                      $pending[$candidate] = true;
                  }
              }
          }

          if (!$verify) {
              return $newBase;
          }

          $output = [];
          foreach ($newBase as $base => $rows) {
              $accepted = $this->VerifyBase($base, $rows);
              if ($accepted !== false && count($accepted) > 0) {
                  $output[$base] = $accepted;
              }
          }

          if (count($output) === 0) {
              // Verification rejected everything: return the most likely guess,
              // taken to be the first candidate with the shortest base.
              $shortBase = null;
              $shortLen = PHP_INT_MAX;
              foreach (array_keys($newBase) as $base) {
                  $length = mb_strlen((string) $base, 'UTF-8');
                  if ($length < $shortLen) {
                      $shortLen = $length;
                      $shortBase = $base;
                  }
              }
              if ($shortBase !== null) {
                  $output[$shortBase] = $newBase[$shortBase];
              }
          }

          return $output;
      }

      /**
       * Check a guessed base against the user dictionary.
       *
       * For every guessed row a parent signature is built from its type
       * ('.n.' => '.n:base.', '.ti.' => '.ti:base.', '.v.' => '.v:base.', otherwise '')
       * followed, for non-verb rows with a grammar, by the first '$'-separated
       * grammar part. A row is kept when the dictionary holds an entry for $base
       * whose type and grammar concatenate to that signature.
       *
       * @param string $base Base spelling to look up.
       * @param array  $rows Guessed rows, each with 'type' and 'grammar' keys.
       *
       * @return array|false The rows whose signature matches, or false when the
       *                     dictionary has no entry for $base at all.
       */
      public function VerifyBase($base, $rows)
      {
          $dictWords = UserDict::where('word', $base)
              ->select(['type', 'grammar'])
              ->groupBy(['type', 'grammar'])
              ->get();
          if (count($dictWords) === 0) {
              return false;
          }

          // Grammar signatures recorded in the dictionary for this spelling.
          $known = [];
          foreach ($dictWords as $entry) {
              $known["{$entry->type}{$entry->grammar}"] = true;
          }

          $parentTypes = [
              '.n.' => '.n:base.',
              '.ti.' => '.ti:base.',
              '.v.' => '.v:base.',
          ];

          $output = [];
          foreach ($rows as $row) {
              $parentType = $parentTypes[$row['type']] ?? '';
              if (!empty($row['grammar']) && $row['type'] !== '.v.') {
                  $parentType .= explode('$', $row['grammar'])[0];
              }

              // Keep only guesses whose grammar is plausible for this base.
              if (isset($known[$parentType])) {
                  $output[] = $row;
              }
          }

          return $output;
      }

      /**
       * Replace the trailing $from of $text with $to, measuring in UTF-8 characters.
       *
       * An empty $from only matches an empty $text, because mb_substr() with a
       * start of 0 returns the whole string.
       *
       * @return string|null The rewritten text, or null when $text does not end with $from.
       */
      private function swapSuffix(string $text, string $from, string $to): ?string
      {
          $fromLen = mb_strlen($from, 'UTF-8');
          if (mb_substr($text, 0 - $fromLen, null, 'UTF-8') !== $from) {
              return null;
          }

          return mb_substr($text, 0, mb_strlen($text, 'UTF-8') - $fromLen, 'UTF-8') . $to;
      }

      /**
       * Fetch the count and bold values of a word from the word index.
       *
       * @return array|false ['count' => ..., 'bold' => ...], or false when the word is not indexed.
       */
      private function lookupWord(string $word)
      {
          $row = WordIndex::where('word', $word)->select(['count', 'bold'])->first();
          if (!$row) {
              return false;
          }

          return ['count' => $row->count, 'bold' => $row->bold];
      }
  }