<?php
// split.php
// Aggressive compound-word splitter.
/*
Purpose: split a compound word into its component words.

Step 1: split the word at each diphthong, e.g. ~aa~ -> ~a-a~.
Step 2: split every resulting part by applying the sandhi (sound-junction)
        rules held in $sandhi.

Algorithm:
f(word) {
    1. Cut one letter off the end of the word, following a sandhi rule
       from array($sandhi).
    2. Look up the first (remaining) part.
       if the lookup succeeds with a confidence value > 0.8
           - record the confidence value of that part
           - process the remaining part in the same way:
             f(stack.first element)
       else
           apply another sandhi rule
           go back to 1
}
This is a recursive algorithm; recursion depth = 16.
*/
  30. require_once "../dict/turbo_split.php";
  31. //check input
  32. if (isset($_POST["word"])) {
  33. $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
  34. if (trim($input_word) == "") {
  35. echo "Empty";
  36. exit;
  37. }
  38. $arrWords = str_getcsv($input_word, "\n"); //支持批量拆分
  39. } else {
  40. ?>
  41. <!--debug only-->
  42. <form action="split.php" method="post">
  43. Words: <br>
  44. <textarea type="text" name="word" style="width:50em;height:20em;"></textarea><br>
  45. <input name="debug" type="hidden" />批量查询,单词之间用换行分隔。 input word. between two words insert 'enter'
  46. <div>
  47. <input type="checkbox" name = "express" checked /> 快速搜索(遇到第一个连音规则成功就返回) return when get first result
  48. </div>
  49. <input type="submit">
  50. </form>
  51. <?php
  52. return;
  53. }
  54. if (isset($_POST["express"])) {
  55. if ($_POST["express"] === "on") {
  56. $_express = true;
  57. } else {
  58. $_express = false;
  59. }
  60. } else {
  61. $_express = false;
  62. }
  63. //main
  64. $allword = array();
  65. foreach ($arrWords as $currword) {
  66. $t1 = microtime_float();
  67. $output = array();
  68. if (isset($_POST["debug"])) {
  69. echo "Look up:{$currword}<br>";
  70. }
  71. //预处理
  72. //将双元音拆开
  73. //step 1 : split at diphthong . ~aa~ -> ~a-a~
  74. //按连字符拆开处理
  75. $arrword = split_diphthong($currword);
  76. foreach ($arrword as $oneword) {
  77. $result = array(); //全局变量,递归程序的输出容器
  78. mySplit2($oneword, 0, false, 0, 0.5, 0.8, true, false);
  79. if(count($result) < 3){
  80. mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, true, true);
  81. }
  82. if (isset($_POST["debug"])) {
  83. echo "正切:" . count($result) . "\n";
  84. }
  85. if(count($result) < 3){
  86. mySplit2($oneword, 0, $_express, 0, 0.2, 0.8, false, true);
  87. }
  88. if (isset($_POST["debug"])) {
  89. echo "反切:" . count($result) . "\n";
  90. }
  91. /*
  92. if (count($result) < 5) {
  93. #sandhi advance
  94. mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false, true);
  95. if (isset($_POST["debug"])) {
  96. echo "反切:" . count($result) . "\n";
  97. }
  98. }
  99. if (count($result) < 5) {
  100. #反向
  101. mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false);
  102. }
  103. if (count($result) < 5) {
  104. #正向
  105. mySplit2($oneword, 0, $_express, 0, 0.8, 0, true);
  106. }
  107. if (count($result) < 5) {
  108. #反向
  109. mySplit2($oneword, 0, $_express, 0, 0.8, 0, false);
  110. }
  111. */
  112. arsort($result); //按信心指数排序
  113. #输出结果 ouput to json
  114. $wordlist = array();
  115. $iMax = 5;
  116. $iCount = 0;
  117. foreach ($result as $row => $value) {
  118. $iCount++;
  119. $word_part = array();
  120. $word_part["word"] = $row;
  121. $word_part["confidence"] = $value;
  122. $wordlist[] = $word_part;
  123. if ($iCount >= $iMax) {
  124. break;
  125. }
  126. }
  127. $output[] = $wordlist;
  128. if (isset($_POST["debug"])) {
  129. echo "<h2>{$oneword}</h2>";
  130. echo "<h4>" . count($result) . "</h4>";
  131. }
  132. $iCount = 0;
  133. foreach ($result as $row => $value) {
  134. if ($iCount > 10) {
  135. break;
  136. }
  137. $iCount++;
  138. $level = $value * 90;
  139. if (isset($_POST["debug"])) {
  140. echo $row . "-[" . $value . "]<br>";
  141. }
  142. }
  143. /*
  144. 后处理
  145. -ssāpi=-[ssa]-api
  146. */
  147. }
  148. $t2 = microtime_float();
  149. $one_split["data"] = $output;
  150. $one_split["time"] = $auto_split_times;
  151. $one_split["second"] = $t2 - $t1;
  152. $allword[] = $one_split;
  153. if (isset($_POST["debug"])) {
  154. echo "<div>";
  155. echo "<br>查询【{$auto_split_times}】次";
  156. echo "time:" . ($t2 - $t1);
  157. echo "</div>";
  158. }
  159. }
  160. if (isset($_POST["debug"])) {
  161. echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
  162. print_r($allword);
  163. echo "</pre>";
  164. }
  165. echo json_encode($allword, JSON_UNESCAPED_UNICODE);
  166. ?>