split.php 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. <?php
// Aggressively split compound words.
/*
Function: split a compound word.
Step 1: first split at every diphthong (double vowel), e.g. ~aa~ -> ~a-a~.
Step 2: split each resulting part by applying the sandhi rules in $sandhi.
Algorithm:
f(word) {
    1. Cut one letter from the end of the word, following a sandhi rule from array($sandhi).
    2. Look up the remaining first part.
       if the lookup returns a result (confidence value > 0.8)
           - get the confidence value of that part
           - process the remaining part in the same way
           - f(stack.first element)
       else
           apply another sandhi rule
           go back to 1
}
This is a recursive algorithm; depth = 16.
*/
  30. require_once "../dict/turbo_split.php";
  31. global $auto_split_times;
  32. //check input
  33. if (isset($_POST["word"])) {
  34. $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
  35. if (trim($input_word) == "") {
  36. echo "Empty";
  37. exit;
  38. }
  39. $arrWords = str_getcsv($input_word, "\n"); //支持批量拆分
  40. } else {
  41. ?>
  42. <!--debug only-->
  43. <form action="split.php" method="post">
  44. Words: <br>
  45. <textarea type="text" name="word" style="width:50em;height:20em;"></textarea><br>
  46. <input name="debug" type="hidden" />批量查询,单词之间用换行分隔。 input word. between two words insert 'enter'
  47. <div>
  48. <input type="checkbox" name = "express" checked /> 快速搜索(遇到第一个连音规则成功就返回) return when get first result
  49. </div>
  50. <input type="submit">
  51. </form>
  52. <?php
  53. return;
  54. }
  55. if (isset($_POST["express"])) {
  56. if ($_POST["express"] === "on") {
  57. $_express = true;
  58. } else {
  59. $_express = false;
  60. }
  61. } else {
  62. $_express = false;
  63. }
  64. //main
  65. $allword = array();
  66. foreach ($arrWords as $currword) {
  67. $t1 = microtime_float();
  68. $output = array();
  69. if (isset($_POST["debug"])) {
  70. echo "Look up:{$currword}<br>";
  71. }
  72. //预处理
  73. //将双元音拆开
  74. //step 1 : split at diphthong . ~aa~ -> ~a-a~
  75. //按连字符拆开处理
  76. $arrword = split_diphthong($currword);
  77. foreach ($arrword as $oneword) {
  78. $result = array(); //全局变量,递归程序的输出容器
  79. #输出结果 ouput to json
  80. $wordlist = array();
  81. $needDeep = false;
  82. //看现有的字典里是不是有
  83. $new = split2($oneword);
  84. if($new!==$oneword){
  85. //现有字典里查到
  86. $word_part["word"] = $new;
  87. $word_part["confidence"] = 1.0;
  88. $wordlist[] = $word_part;
  89. #再处理一次
  90. $new2 = split2($new);
  91. if($new2!==$new){
  92. $word_part["word"] = $new2;
  93. $word_part["confidence"] = 1.0;
  94. $wordlist[] = $word_part;
  95. }
  96. $needDeep = false;
  97. }
  98. else{
  99. //没查到,查连音词
  100. $preSandhi = preSandhi($oneword);
  101. if($preSandhi!==$oneword){
  102. $word_part["word"] = $preSandhi;
  103. $word_part["confidence"] = 1.0;
  104. $wordlist[] = $word_part;
  105. //将处理后的连音词再二次拆分
  106. $new = split2($preSandhi);
  107. if($new!==$row){
  108. $word_part["word"] = $new;
  109. $word_part["confidence"] = $value;
  110. $wordlist[] = $word_part;
  111. #再处理一次
  112. $new2 = split2($new);
  113. if($new2!==$new){
  114. $word_part["word"] = $new2;
  115. $word_part["confidence"] = $value;
  116. $wordlist[] = $word_part;
  117. }
  118. //如果能处理,就不进行深度拆分了
  119. $needDeep = false;
  120. }
  121. else{
  122. //连音词的第一部分没查到,进行深度拆分
  123. $needDeep = true;
  124. }
  125. }
  126. else{
  127. $needDeep = true;
  128. }
  129. }
  130. if($needDeep){
  131. if(mb_strlen($oneword,"UTF-8")>35){
  132. mySplit2($oneword, 0, true, 0, 0.9, 0.95, true, false);
  133. }
  134. else{
  135. mySplit2($oneword, 0, false, 0, 0.5, 0.95, true, false);
  136. }
  137. if(count($result) < 1){
  138. mySplit2($oneword, 0, $_express, 0, 0.4, 0.8, true, true);
  139. }
  140. if (isset($_POST["debug"])) {
  141. echo "正切:" . count($result) . "<br>\n";
  142. }
  143. if(count($result) < 2){
  144. mySplit2($oneword, 0, $_express, 0, 0.4, 0.8, false, true);
  145. }
  146. if (isset($_POST["debug"])) {
  147. echo "反切:" . count($result) . "<br>\n";
  148. }
  149. arsort($result); //按信心指数排序
  150. $iMax = 5;
  151. $iCount = 0;
  152. foreach ($result as $row => $value) {
  153. $iCount++;
  154. $word_part = array();
  155. $word_part["word"] = $row;
  156. $word_part["confidence"] = $value;
  157. $wordlist[] = $word_part;
  158. //后处理 进一步切分没有意思的长词
  159. $new = split2($row);
  160. if($new!==$row){
  161. $word_part["word"] = $new;
  162. $word_part["confidence"] = $value;
  163. $wordlist[] = $word_part;
  164. #再处理一次
  165. $new2 = split2($new);
  166. if($new2!==$new){
  167. $word_part["word"] = $new2;
  168. $word_part["confidence"] = $value;
  169. $wordlist[] = $word_part;
  170. }
  171. }
  172. if ($iCount >= $iMax) {
  173. break;
  174. }
  175. }
  176. }
  177. $output[] = $wordlist;
  178. if (isset($_POST["debug"])) {
  179. echo "<h2>{$oneword}</h2>";
  180. echo "<h4>" . count($result) . "</h4>";
  181. }
  182. $iCount = 0;
  183. foreach ($result as $row => $value) {
  184. if ($iCount > 10) {
  185. break;
  186. }
  187. $iCount++;
  188. $level = $value * 90;
  189. if (isset($_POST["debug"])) {
  190. echo $row . "-[" . $value . "]<br>";
  191. }
  192. }
  193. /*
  194. 后处理
  195. -ssāpi=-[ssa]-api
  196. */
  197. }
  198. $t2 = microtime_float();
  199. $one_split["data"] = $output;
  200. $one_split["time"] = $auto_split_times;
  201. $one_split["second"] = $t2 - $t1;
  202. $allword[] = $one_split;
  203. if (isset($_POST["debug"])) {
  204. echo "<div>";
  205. echo "<br>查询【{$auto_split_times}】次";
  206. echo "time:" . ($t2 - $t1);
  207. echo "</div>";
  208. }
  209. }
  210. if (isset($_POST["debug"])) {
  211. echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
  212. print_r($allword);
  213. echo "</pre>";
  214. }
  215. echo json_encode($allword, JSON_UNESCAPED_UNICODE);
  216. ?>