turbo_split.php 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708
  1. <?php
  // Project dependencies loaded before anything else runs.
  require_once '../public/casesuf.inc';
  //require_once '../studio/dict_find_un.inc';
  //require_once '../studio/sandhi.php';
  require_once "../path.php";
  require_once "../public/_pdo.php";
  require_once "../redis/function.php";

  // Shared state used by the functions in this file:
  //   $redis      - redis connection returned by redis_connect()
  //   $dbh        - PDO handle of the word-part database
  //   $path       - (piece, confidence) pairs chosen so far on the current search path
  //   $currPathCf - confidence of the current search path; meant to abort a path
  //                 early when it drops too low
  //   $confidence - in-memory table of already computed confidence values
  //   $result     - split candidates collected by the splitter
  //   $part       - cache of dictionary lookups
  global $redis, $dbh, $path, $currPathCf, $confidence, $result, $part;

  $redis = redis_connect();

  // Open the word-part database.
  $dns = "" . _FILE_DB_PART_;
  $dbh = new PDO($dns, "", "", array(PDO::ATTR_PERSISTENT => true));
  $dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);

  define("MAX_RESULT", 100);

  $part = array();

  // Append 17 empty (piece, confidence) slots to the search path.
  for ($pathSlot = 0; $pathSlot < 17; $pathSlot++) {
      $path[] = array("", 0);
  }
  unset($pathSlot);
  global $sandhi;
  // Sandhi rule table (word-junction rules).
  // Every rule is stored as an associative array with these keys:
  //   a       - ending put back on the left-hand piece
  //   b       - beginning put back on the right-hand piece
  //   c       - the fused text that is searched for in the compound
  //   len     - number of characters of c
  //   adj_len - length adjustment carried along with a match
  //   advance - marks "advanced" rules
  //   cf      - factor the piece confidence is multiplied by when the rule is applied
  // The order of the rules is significant: they are tried from top to bottom.
  $sandhiFields = array("a", "b", "c", "len", "adj_len", "advance", "cf");
  $sandhiTuples = array(
      array("", "", "", 0, 0, false, 1.0),
      array("a", "a", "ā", 1, 0, false, 0.99999),
      array("ā", "ā", "ā", 1, 0, false, 0.9999),
      array("a", "ā", "ā", 1, 0, false, 0.9999),
      array("ā", "a", "ā", 1, 0, false, 0.9999),
      array("a", "e", "e", 1, 0, false, 0.9999),
      array("a", "i", "i", 1, 0, false, 0.9999),
      array("a", "o", "o", 1, 0, false, 0.9999),
      array("a", "u", "o", 1, 0, false, 0.9999),
      array("u", "a", "o", 1, 0, false, 0.9999),
      array("u", "u", "ū", 1, 0, false, 0.9999),
      array("a", "u", "u", 1, 0, false, 0.9999),
      array("a", "ī", "ī", 1, 0, false, 0.9999),
      array("a", "ū", "ū", 1, 0, false, 0.9999),
      array("a", "i", "e", 1, 0, false, 0.9999),
      array("e", "a", "e", 1, 0, false, 0.9999),
      array("i", "i", "ī", 1, 0, false, 0.9999),
      array("i", "e", "e", 1, 0, false, 0.9999),
      array("i", "a", "ya", 2, 0, false, 0.9999),
      array("a", "atth", "atth", 4, 0, false, 0.9999),
      array("taṃ", "n", "tann", 4, 0, false, 0.9999),
      array("[ṃ]", "api", "mpi", 3, 0, false, 0.9999),
      array("[ṃ]", "eva", "meva", 4, 0, false, 0.9999),
      array("[o]", "iva", "ova", 3, 0, false, 0.9999),
      array("o", "a", "o", 1, 0, false, 0.9999),
      array("a", "ādi", "ādi", 3, 0, false, 0.9999),
      array("a[ānaṃ]", "a", "ānama", 5, 0, false, 0.9999),
      array("a", "iti", "āti", 3, 0, false, 0.9999),
      array("[ṃ]", "ca", "ñca", 3, 0, false, 0.9999),
      array("[ṃ]", "iti", "nti", 3, 0, false, 0.9999),
      array("[ṃ]", "a", "ma", 2, 0, false, 0.9999),
      array("ṃ", "a", "m", 1, 0, false, 0.9999),
      array("[ṃ]", "ā", "mā", 2, 0, false, 0.9999),
      array("[ṃ]", "u", "mu", 2, 0, false, 0.9999),
      array("[ṃ]", "h", "ñh", 2, 0, false, 0.9999),
      array("ā", "[ṃ]", "am", 2, 0, false, 0.9999),
      array("a", "[ṃ]", "am", 2, 0, false, 0.9999),
      array("ī", "[ṃ]", "im", 2, 0, false, 0.9999),
      array("ati", "tabba", "atabba", 6, 0, false, 0.9999),
      array("ati", "tabba", "itabba", 6, 0, false, 0.9999),
      array("iti", "a", "icca", 4, 0, false, 0.9999),
      array("uṃ", "a", "uma", 3, 0, false, 0.9999),
      array("u[ūnaṃ]", "a", "ūnama", 5, 0, false, 0.9999),
      array("ī[īnaṃ]", "a", "īnama", 5, 0, false, 0.9999),
      array("su", "a", "sva", 3, 0, false, 0.9999),
      // Further sandhi rules. They can be used, but they slow the program down.
      array("ā", "iti", "āti", 3, 0, false, 0.9999),
      array("a", "iti", "āti", 3, 0, false, 0.99999),
      array("e", "iti", "eti", 3, 0, false, 0.99999),
      array("ī", "iti", "īti", 3, 0, false, 0.9999),
      array("i", "iti", "īti", 3, 0, false, 0.99999),
      array("o", "iti", "oti", 3, 0, false, 0.99999),
      array("ū", "iti", "ūti", 3, 0, false, 0.9999),
      array("u", "iti", "ūti", 3, 0, false, 0.99999),
      array("ṃ", "iti", "nti", 3, 0, false, 0.99999),
      array("ṃ", "ca", "ñca", 3, 0, false, 1.0),
      array("ṃ", "cāti", "ñcāti", 5, 0, false, 1.0),
      array("ṃ", "cet", "ñcet", 4, 0, false, 0.9999),
      array("ṃ", "ev", "mev", 3, 0, false, 0.99999),
      // A disabled $sandhi2 table (ca / hi / iti / eva / api particle rules) used to be
      // kept here as a comment; the live $sandhi2 table is defined inside preSandhi().
      // Rules flagged as "advance".
      array("a", "a", "a", 1, -1, true, 0.99),
      array("ī", "", "i", 1, 0, true, 0.9),
  );
  foreach ($sandhiTuples as $sandhiTuple) {
      $sandhi[] = array_combine($sandhiFields, $sandhiTuple);
  }
  // Do not leave the construction helpers behind in the enclosing scope.
  unset($sandhiFields, $sandhiTuples, $sandhiTuple);
  /**
   * Split a word at every place where two vowels stand next to each other.
   *
   * The vowels considered are a, e, i, o, u, ā, ī, ū. A hyphen is inserted between
   * each adjacent pair ("~aa~" -> "~a-a~") and the word is then cut at the hyphens,
   * so hyphens already present in the input also act as cut points.
   *
   * Differences from the previous str_replace/str_getcsv implementation:
   *  - runs of three or more vowels are split at every boundary ("aaa" -> a, a, a);
   *  - the pieces are produced with explode(), so a double quote or backslash in the
   *    input is kept verbatim instead of being interpreted as CSV syntax, and an
   *    empty input yields array("") instead of array(null).
   *
   * @param string $word word to split (UTF-8)
   * @return array list of pieces in their original order
   */
  function split_diphthong($word)
  {
      // The lookahead does not consume the second vowel, so it can in turn be the
      // first vowel of the next pair.
      $hyphenated = preg_replace('/([aeiouāīū])(?=[aeiouāīū])/u', '$1-', $word);
      if ($hyphenated === null) {
          // preg_replace() fails on input that is not valid UTF-8; leave such a
          // word unsplit rather than returning nothing.
          $hyphenated = $word;
      }
      return explode("-", $hyphenated);
  }
  /**
   * Glue two values together with a "+" between them.
   * According to the original note it is meant for joining array elements into a
   * string (presumably as a reduce-style callback; no caller is visible here).
   *
   * @param mixed $v1 left value
   * @param mixed $v2 right value
   * @return string "$v1+$v2"
   */
  function myfunction($v1, $v2)
  {
      return "{$v1}+{$v2}";
  }
  /**
   * Current Unix time as a float, built from the "msec sec" string that
   * microtime() returns.
   *
   * @return float seconds plus the fractional part
   */
  function microtime_float()
  {
      $pieces = explode(" ", microtime());
      $fraction = (float) $pieces[0];
      $seconds = (float) $pieces[1];
      return ($fraction + $seconds);
  }
  /**
   * Look up the weight of a word in the redis hash "dict://part.hash".
   *
   * If the word itself is not stored, every case ending from the global $case table
   * that matches the end of the word is replaced by its base ending, the resulting
   * base forms are looked up, and the highest weight found (scaled by 0.9999) is
   * written back to the hash under the searched word and returned.
   *
   * @param string $word word to look up; square brackets are removed unless the
   *                     word starts with "["
   * @return mixed 0 for an empty word or when nothing is found, otherwise the
   *               stored value or the scaled base weight
   */
  function dict_lookup2($word){
      global $redis;
      global $case;

      if (strlen($word) <= 0) {
          return 0;
      }

      // Words starting with "[" are searched verbatim, all others without brackets.
      $search = (mb_substr($word, 0, 1) === "[")
          ? $word
          : str_replace(array("[", "]"), "", $word);

      $cf = $redis->hGet("dict://part.hash", $search);
      if ($cf != false) {
          if (isset($_POST["debug"])) {
              echo "查到:{$search}:{$cf}\n";
          }
          return $cf;
      }

      // Not stored directly: strip case endings and collect the possible base forms.
      $candidates = array();
      $caseTotal = count($case);
      for ($k = 0; $k < $caseTotal; $k++) {
          $ending = $case[$k][1];
          $endingLen = mb_strlen($ending, "UTF-8");
          if (mb_substr($search, 0 - $endingLen, null, "UTF-8") == $ending) {
              $stem = mb_substr($search, 0, mb_strlen($search, "UTF-8") - $endingLen, "UTF-8");
              $base = $stem . $case[$k][0];
              if ($base != $search) {
                  $candidates[$base] = $endingLen;
              }
          }
      }

      // Pick the most frequent base form.
      $base_weight = 0;
      $anyHit = false;
      foreach ($candidates as $base => $unusedLen) {
          $stored = $redis->hGet("dict://part.hash", $base);
          if ($stored != false) {
              $anyHit = true;
              if ($stored > $base_weight) {
                  $base_weight = $stored;
              }
          }
      }
      if ($anyHit) {
          $base_weight *= 0.9999;
          $redis->hSet("dict://part.hash", $search, $base_weight);
          if (isset($_POST["debug"])) {
              echo "查到变格:{$search}:{$base_weight}\n";
          }
      }
      return $base_weight;
  }
  /**
   * Look up the weight of a word in the `part` table of the word-part database.
   *
   * If the word itself has no row, every case ending from the global $case table
   * that matches the end of the word is replaced by its base ending and the
   * resulting base forms are looked up; the heaviest one wins.
   *
   * The function always returns a two-element array. Previously words of at most
   * one byte returned the bare integer 0, which callers such as isExsit() then
   * indexed like an array.
   *
   * @param string $word word to look up; square brackets are removed unless the
   *                     word starts with "["
   * @return array array(weight, case_len): weight is 0 when nothing was found,
   *               case_len is the character length of the stripped case ending
   *               (0 for a direct hit or a miss)
   */
  function dict_lookup($word)
  {
      global $case;
      global $dbh;

      // NOTE: strlen() counts bytes, so a single multi-byte letter passes this guard.
      if (strlen($word) <= 1) {
          return array(0, 0);
      }

      // Words starting with "[" are searched verbatim, all others without brackets.
      if (mb_substr($word, 0, 1) === "[") {
          $search = $word;
      } else {
          $search = str_replace(array("[", "]"), "", $word);
      }

      // One prepared statement serves the direct lookup and all base-form lookups.
      $stmt = $dbh->prepare("SELECT weight from part where word = ? ");
      if ($stmt === false) {
          // With ERRMODE_WARNING a failed prepare() returns false instead of throwing.
          return array(0, 0);
      }

      $stmt->execute(array($search));
      $hit = $stmt->fetch(PDO::FETCH_NUM);
      $stmt->closeCursor();
      if ($hit) {
          return array($hit[0], 0);
      }

      // Strip case endings and collect the possible base forms.
      $suffixTable = is_array($case) ? $case : array();
      $candidates = array();
      foreach ($suffixTable as $pair) {
          $ending = $pair[1];
          $endingLen = mb_strlen($ending, "UTF-8");
          if (mb_substr($search, 0 - $endingLen, null, "UTF-8") == $ending) {
              $base = mb_substr($search, 0, mb_strlen($search, "UTF-8") - $endingLen, "UTF-8") . $pair[0];
              if ($base != $search) {
                  $candidates[$base] = $endingLen;
              }
          }
      }

      // Pick the most frequent base form.
      $base_weight = 0;
      $len = 0;
      foreach ($candidates as $base => $endingLen) {
          $stmt->execute(array($base));
          $hit = $stmt->fetch(PDO::FETCH_NUM);
          $stmt->closeCursor();
          if ($hit && $hit[0] > $base_weight) {
              $base_weight = $hit[0];
              $len = $endingLen;
          }
      }
      return array($base_weight, $len);
  }
  /**
   * Check whether a single word occurs in the dictionary vocabulary and return its
   * confidence value.
   *
   * Lookups go through the global $part cache (filled from dict_lookup() on a miss)
   * and computed confidence values are memoised in the global $confidence table.
   * Every call increments the global counter $auto_split_times.
   *
   * @param string $word    word to look up
   * @param int    $adj_len length correction; accepted but not used by the formula
   * @return mixed confidence value of a known word, or -1 when the word is unknown
   */
  function isExsit($word, $adj_len = 0)
  {
      global $auto_split_times;
      global $part;
      global $confidence;

      $auto_split_times++;
      $debug = isset($_POST["debug"]);
      if ($debug) {
          echo "<div>正在查询:{$word}</div>";
      }
      //return dict_lookup2($word);

      // Fill the lookup cache on first sight of the word, then read from it.
      if (!isset($part["{$word}"])) {
          $part["{$word}"] = dict_lookup($word);
      }
      $entry = $part["{$word}"];
      $word_count = $entry[0];
      $case_len = $entry[1];

      if (!($word_count > 0)) {
          return (-1);
      }
      if ($debug) {
          echo "查到:{$word}:{$word_count}个\n";
      }

      // Reuse a confidence value that has already been computed.
      if (isset($confidence["{$word}"])) {
          return ($confidence["{$word}"]);
      }

      // Confidence formula: grows with the weight and with the word length after
      // subtracting the stripped case ending.
      $count = $word_count + 1;
      //$len = mb_strlen($word, "UTF-8") + $adj_len;
      $len = mb_strlen($word, "UTF-8") - $case_len;
      $len_correct = 1.2;
      $count2 = 1.1 + pow($count, 1.18);
      $conf_num = pow(1 / $count2, pow(($len - 0.5), $len_correct));
      $cf = round(1 / (1 + 640 * $conf_num), 9);
      //$cf = round((1-0.02*$case_len) / (1 + 640 * $conf_num), 9);
      $confidence["{$word}"] = $cf;
      if ($debug) {
          echo "信心指数:{$word}:{$cf}\n";
      }
      return ($cf);
  }
  /**
   * Core recursive splitting function.
   *
   * Tries to cut $strWord into a known piece plus a remainder by applying the rules
   * of the global $sandhi table at every cut position, stores the accepted piece in
   * the global $path at index $deep and recurses into the remainder. Finished
   * splits are written to the global $result as "piece+piece+..." => confidence.
   * The function itself returns nothing.
   *
   * Fix compared with the previous version: the check for a doubled initial
   * consonant used a substring search in "kkggccjjṭṭḍḍttddppbb", which also matched
   * the mixed pairs formed across pair boundaries (kg, gc, cj, jṭ, ṭḍ, ḍt, td, dp,
   * pb) and then wrongly dropped the first letter. Now the first two characters
   * must be identical and belong to the consonant set.
   *
   * @param string $strWord        word (or remainder) to split
   * @param int    $deep           current recursion depth; the search stops at 16
   * @param bool   $express        fast mode: at each cut position stop trying further
   *                               rules after the first accepted one
   * @param int    $adj_len        length correction handed on to isExsit()
   * @param float  $c_threshhold   minimum confidence for a single piece
   * @param float  $w_threshhold   minimum confidence for a path / a stored result
   * @param bool   $forward        true: look up the left piece and continue with the
   *                               right remainder; false: look up the right piece,
   *                               continue with the left remainder and store the
   *                               result in reversed piece order
   * @param bool   $sandhi_advance passed down unchanged; the skip of rules flagged
   *                               "advance" was disabled in the original code, so the
   *                               flag currently has no effect
   * @return void
   */
  function mySplit2($strWord, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
  {
      global $path;
      global $result;
      global $sandhi;
      // Confidence of the current search path.
      global $currPathCf;

      $debug = isset($_POST["debug"]);
      $output = array();
      if ($deep == 0) {
          $currPathCf = 1;
      }

      // Maximum search depth reached: record what has been found so far plus the
      // unsplit rest, with a penalty that grows with the length of the rest.
      if ($deep >= 16) {
          list($word, $cf) = mySplit2_path_prefix($deep, true);
          $len = pow(mb_strlen($strWord, "UTF-8"), 3);
          $cf += (0 - $len) / ($len + 150);
          $word .= "{$strWord}";
          mySplit2_store_result($word, $cf, $forward);
          return;
      }

      // The whole word is known, either as it is or in its bracketed form.
      $confidence = isExsit($strWord, $adj_len);
      if ($confidence > $c_threshhold) {
          $output[] = array($strWord, "", $confidence);
      } else {
          $confidence = isExsit("[" . $strWord . "]");
          if ($confidence > $c_threshhold) {
              $output[] = array("[" . $strWord . "]", "", $confidence);
          }
      }

      // No Pali word starts with a doubled consonant, so drop the first one.
      if (mb_strlen($strWord, "UTF-8") > 2) {
          $first = mb_substr($strWord, 0, 1, "UTF-8");
          $second = mb_substr($strWord, 1, 1, "UTF-8");
          if ($first === $second && mb_strpos("kgcjṭḍtdpb", $first, 0, "UTF-8") !== false) {
              $strWord = mb_substr($strWord, 1, null, "UTF-8");
          }
      }

      $len = mb_strlen($strWord, "UTF-8");
      if ($len > 2) {
          if ($forward) {
              // Forward cut: longest left piece first.
              for ($i = $len; $i > 1; $i--) {
                  foreach ($sandhi as $row) {
                      if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") == $row["c"]) {
                          $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
                          $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
                          $confidence = isExsit($str1, $adj_len) * $row["cf"];
                          if ($confidence > $c_threshhold) {
                              $output[] = array($str1, $str2, $confidence, $row["adj_len"]);
                              if ($debug) {
                                  echo "插入:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}\n";
                              }
                              if ($express) {
                                  break;
                              }
                          }
                      }
                  }
              }
          } else {
              // Backward cut: longest right piece first.
              for ($i = 1; $i < $len - 1; $i++) {
                  foreach ($sandhi as $row) {
                      if (mb_substr($strWord, $i, $row["len"], "UTF-8") == $row["c"]) {
                          $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
                          $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
                          $confidence = isExsit($str2, $adj_len) * $row["cf"];
                          if ($confidence > $c_threshhold) {
                              $output[] = array($str2, $str1, $confidence, $row["adj_len"]);
                              if ($debug) {
                                  echo "插入:{$str2}\n";
                              }
                              if ($express) {
                                  break;
                              }
                          }
                      }
                  }
              }
          }
      }

      if (count($output) == 0) {
          // Nothing matched: the rest cannot be looked up any further.
          list($word, $cf) = mySplit2_path_prefix($deep, false);
          $len = pow(mb_strlen($strWord, "UTF-8"), 3);
          if ($forward) {
              $cf = (1 - $cf) * $len / ($len + 150);
          } else {
              $cf = (1 - $cf) * $len / ($len + 5);
          }
          if ($debug) {
              $word = $word . $strWord . "(0)";
          } else {
              $word = $word . $strWord;
          }
          if ($cf > $w_threshhold) {
              mySplit2_store_result($word, $cf, $forward);
          }
          return;
      }

      foreach ($output as $candidate) {
          $checked = $candidate[0];
          $remainder = $candidate[1];
          $path[$deep][0] = $checked;
          $path[$deep][1] = $candidate[2];

          // Text and confidence of the pieces above the current depth.
          list($word, $cf) = mySplit2_path_prefix($deep, false);

          if (empty($remainder)) {
              // The word has been consumed completely.
              if ($debug) {
                  $word .= $checked . "({$candidate[2]})";
              } else {
                  $word .= $checked;
              }
              $cf = $cf * $candidate[2];
              if ($cf > $w_threshhold) {
                  mySplit2_store_result($word, $cf, $forward);
                  return;
              }
          } else {
              // Confidence of the path so far (the current piece is not included).
              if ($cf < $w_threshhold) {
                  if ($debug) {
                      echo "信心指数过低,提前返回 {$cf}<br>";
                  }
                  return;
              }
              // Keep cutting the remainder.
              mySplit2($remainder, ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
          }
      }
  }

  /**
   * Helper of mySplit2(): join the first $deep entries of the global $path into a
   * "piece+piece+" prefix and multiply their confidence values.
   *
   * @param int  $deep      number of path entries to use
   * @param bool $skipEmpty leave out entries whose piece is empty
   * @return array array(prefix, confidence product)
   */
  function mySplit2_path_prefix($deep, $skipEmpty = false)
  {
      global $path;

      $debug = isset($_POST["debug"]);
      $prefix = "";
      $cf = 1.0;
      for ($i = 0; $i < $deep; $i++) {
          if ($skipEmpty && empty($path[$i][0])) {
              continue;
          }
          $prefix .= $path[$i][0];
          if ($debug) {
              // Debug output shows the confidence of every piece.
              $prefix .= "(" . $path[$i][1] . ")";
          }
          $prefix .= "+";
          $cf = $cf * $path[$i][1];
      }
      return array($prefix, $cf);
  }

  /**
   * Helper of mySplit2(): store one split in the global $result, reversing the
   * piece order first when the split was produced by a backward search.
   *
   * @param string $word    pieces joined with "+"
   * @param float  $cf      confidence of the split
   * @param bool   $forward false when the pieces are in reversed order
   * @return void
   */
  function mySplit2_store_result($word, $cf, $forward)
  {
      global $result;

      if ($forward == true) {
          $result[$word] = $cf;
      } else {
          $result[word_reverse($word)] = $cf;
      }
  }
  /**
   * Reverse the order of the "+"-separated pieces of a split result.
   *
   * @param string $word pieces joined with "+", e.g. "a+b+c"
   * @return string the same pieces in reversed order, e.g. "c+b+a"
   */
  function word_reverse($word)
  {
      $pieces = explode("+", $word);
      if (count($pieces) <= 0) {
          return $word;
      }
      return implode("+", array_reverse($pieces));
  }
  /**
   * Post-processing of a split result.
   *
   * Every "+"-separated piece is stripped of a trailing "(...)" annotation. Pieces
   * longer than four characters are then refined with the redis hashes, in this
   * order:
   *   1. "dict://ref/has_mean": a piece listed there and shorter than seven
   *      characters is kept as it is;
   *   2. "dict://pm/part": the piece is replaced by the parts stored there;
   *   3. "dict://regular/part": the piece is replaced by the two parts stored there,
   *      the first of which is expanded through "dict://pm/part" when possible;
   *   4. otherwise the piece is kept unchanged.
   *
   * @param string $word split result, pieces joined with "+"
   * @return string refined split, pieces joined with "+"
   */
  function split2($word){
      global $redis;

      $pieces = array();
      foreach (explode("+", $word) as $token) {
          // Drop everything from the first "(" on, if there is one.
          $bare = strstr($token, "(", true);
          if ($bare == false) {
              $bare = $token;
          }

          // Short pieces are never refined.
          if (!(mb_strlen($bare, "UTF-8") > 4)) {
              $pieces[] = $bare;
              continue;
          }

          // 1. The piece has a meaning of its own and is short enough.
          if ($redis->hExists("dict://ref/has_mean", $bare) === TRUE && mb_strlen($bare, "UTF-8") < 7) {
              $pieces[] = $bare;
              continue;
          }

          // 2. A stored split from "dict://pm/part".
          if ($redis->hExists("dict://pm/part", $bare) === TRUE) {
              $pieces = array_merge($pieces, explode("+", $redis->hGet("dict://pm/part", $bare)));
              continue;
          }

          // 4. Nothing known about the piece: keep it.
          if ($redis->hExists("dict://regular/part", $bare) !== TRUE) {
              $pieces[] = $bare;
              continue;
          }

          // 3. A regular inflection: base part + ending.
          $regular = explode("+", $redis->hGet("dict://regular/part", $bare));
          $head = $regular[0];
          if ($redis->hExists("dict://pm/part", $head) === TRUE) {
              $pieces = array_merge($pieces, explode("+", $redis->hGet("dict://pm/part", $head)));
          } else {
              $pieces[] = $head;
          }
          $pieces[] = $regular[1];
      }
      return implode("+", $pieces);
  }
  /**
   * Peel trailing particles (ca, hi, iti, eva, api) that are fused to the end of a
   * word by sandhi.
   *
   * The rules are tried from top to bottom; the first rule whose fused ending
   * matches the end of the word is applied and the search restarts on the
   * shortened word until no rule matches any more.
   *
   * @param string $word word to examine (UTF-8)
   * @return string the restored stem followed by the peeled particles, joined with
   *                "+" (just the word itself when no rule applies)
   */
  function preSandhi($word){
      // Each rule: (ending restored on the stem, particle, fused ending, length of
      // the fused ending in characters).
      $rules = array(
          array("ṃ", "ca", "ñca", 3),
          array("ṃ", "hi", "ñhi", 3),
          array("a", "iti", "āti", 3),
          array("e", "iti", "eti", 3),
          array("i", "iti", "īti", 3),
          array("o", "iti", "oti", 3),
          array("u", "iti", "ūti", 3),
          array("ṃ", "iti", "nti", 3),
          array("ī", "eva", "iyeva", 5),
          array("ī", "eva", "īyeva", 5),
          array("u", "eva", "uyeva", 5),
          array("ṃ", "eva", "ṃyeva", 5),
          array("i", "eva", "yeva", 4),
          array("o", "eva", "ova", 3),
          array("ṃ", "eva", "meva", 4),
          array("u", "eva", "veva", 4),
          array("a", "eva", "eva", 3),
          array("e", "eva", "eva", 3),
          array("a", "api", "āpi", 3),
          array("ā", "api", "āpi", 3),
          array("e", "api", "epi", 3),
          array("i", "api", "īpi", 3),
          array("ī", "api", "īpi", 3),
          array("o", "api", "opi", 3),
          array("u", "api", "ūpi", 3),
          array("ū", "api", "ūpi", 3),
          array("u", "api", "upi", 3),
          array("ṃ", "api", "mpi", 3),
      );

      $stem = $word;
      $particles = array();
      $matched = true;
      while ($matched) {
          $matched = false;
          foreach ($rules as $rule) {
              list($restored, $particle, $fused, $fusedLen) = $rule;
              if (mb_substr($stem, 0 - $fusedLen, null, "UTF-8") == $fused) {
                  $stem = mb_substr($stem, 0, mb_strlen($stem, "UTF-8") - $fusedLen, "UTF-8") . $restored;
                  // The particle peeled last stands closest to the stem.
                  array_unshift($particles, $particle);
                  $matched = true;
                  break;
              }
          }
      }
      array_unshift($particles, $stem);
      return implode("+", $particles);
  }
  654. }