TurboSplit.php 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678
  1. <?php
  2. namespace App\Tools;
  3. require_once __DIR__.'/../../public/app/public/casesuf.inc';
  4. use Illuminate\Support\Facades\Cache;
  5. use Illuminate\Support\Facades\Log;
  6. use Illuminate\Support\Facades\DB;
  7. use App\Models\WordPart;
  8. use App\Models\UserDict;
  9. class TurboSplit
  10. {
  11. protected $path = array();
  12. protected $isDebug = false;
  13. #当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
  14. protected $currPathCf;
  15. #内存信心指数表
  16. protected $confidence = array();
  17. //结果数组
  18. protected $result = array();
  19. protected $part = array();
  20. //最大结果数量
  21. protected $MAX_RESULT = 100;
  22. protected $MAX_RESULT2 = 5;
  23. //最大递归深度
  24. protected $MAX_DEEP = 16;
  25. //连音规则表
  26. protected $sandhi = [
  27. ["a" => "", "b" => "", "c" => "", "len" => 0, "adj_len" => 0, "advance" => false,"cf"=>1.0],
  28. ["a" => "a", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  29. ["a" => "ā", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  30. ["a" => "a", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  31. ["a" => "ā", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  32. ["a" => "a", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  33. ["a" => "a", "b" => "i", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  34. ["a" => "a", "b" => "o", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  35. ["a" => "a", "b" => "u", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  36. ["a" => "u", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  37. ["a" => "u", "b" => "u", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  38. ["a" => "a", "b" => "u", "c" => "u", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  39. ["a" => "a", "b" => "ī", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  40. ["a" => "a", "b" => "ū", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  41. ["a" => "a", "b" => "i", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  42. ["a" => "e", "b" => "a", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  43. ["a" => "i", "b" => "i", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  44. ["a" => "i", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  45. ["a" => "i", "b" => "a", "c" => "ya", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  46. ["a" => "a", "b" => "atth", "c" => "atth", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  47. ["a" => "taṃ", "b" => "n", "c" => "tann", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  48. ["a" => "[ṃ]", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  49. ["a" => "[ṃ]", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  50. ["a" => "[o]", "b" => "iva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  51. ["a" => "o", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  52. ["a" => "a", "b" => "ādi", "c" => "ādi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  53. ["a" => "a[ānaṃ]", "b" => "a", "c" => "ānama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  54. ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  55. ["a" => "[ṃ]", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  56. ["a" => "[ṃ]", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  57. ["a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  58. ["a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  59. ["a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  60. ["a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  61. ["a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  62. ["a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  63. ["a" => "a", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  64. ["a" => "ī", "b" => "[ṃ]", "c" => "im", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  65. ["a" => "ati", "b" => "tabba", "c" => "atabba", "len" => 6, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  66. ["a" => "ati", "b" => "tabba", "c" => "itabba", "len" => 6, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  67. ["a" => "iti", "b" => "a", "c" => "icca", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  68. ["a" => "uṃ", "b" => "a", "c" => "uma", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  69. ["a" => "u[ūnaṃ]", "b" => "a", "c" => "ūnama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  70. ["a" => "ī[īnaṃ]", "b" => "a", "c" => "īnama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  71. ["a" => "su", "b" => "a", "c" => "sva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  72. ["a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  73. ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  74. ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  75. ["a" => "ī", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  76. ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  77. ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  78. ["a" => "ū", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  79. ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  80. ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  81. ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
  82. ["a" => "ṃ", "b" => "cāti", "c" => "ñcāti", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>1.0],
  83. ["a" => "ṃ", "b" => "cet", "c" => "ñcet", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  84. ["a" => "ṃ", "b" => "ev", "c" => "mev", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  85. ["a" => "a", "b" => "a", "c" => "a", "len" => 1, "adj_len" => -1, "advance" => true,"cf"=>0.99],
  86. ["a" => "ī", "b" => "", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => true,"cf"=>0.9],
  87. ];
  88. protected $sandhi2 = [
  89. ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
  90. ["a" => "ṃ", "b" => "hi", "c" => "ñhi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
  91. ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  92. ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  93. ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  94. ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  95. ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  96. ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
  97. ["a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  98. ["a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  99. ["a" => "u", "b" => "eva", "c" => "uyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  100. ["a" => "ṃ", "b" => "eva", "c" => "ṃyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  101. ["a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  102. ["a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  103. ["a" => "ṃ", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  104. ["a" => "u", "b" => "eva", "c" => "veva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  105. ["a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  106. ["a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  107. ["a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  108. ["a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  109. ["a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  110. ["a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  111. ["a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  112. ["a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  113. ["a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  114. ["a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  115. ["a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  116. ["a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  117. ];
  118. /**
  119. * Create a new command instance.
  120. *
  121. * @return void
  122. */
  123. public function __construct()
  124. {
  125. for($i=0;$i<$this->MAX_DEEP;$i++ ){
  126. array_push($this->path, array("", 0));
  127. }
  128. return;
  129. }
  130. /**
  131. * 从双元音处切开
  132. * @param string $word
  133. * @return void
  134. */
  135. public function splitDiphthong($word)
  136. {
  137. //diphthong table双元音表
  138. $search = array('aa', 'ae', 'ai', 'ao', 'au', 'aā', 'aī', 'aū', 'ea', 'ee', 'ei', 'eo', 'eu', 'eā', 'eī', 'eū', 'ia', 'ie', 'ii', 'io', 'iu', 'iā', 'iī', 'iū', 'oa', 'oe', 'oi', 'oo', 'ou', 'oā', 'oī', 'oū', 'ua', 'ue', 'ui', 'uo', 'uu', 'uā', 'uī', 'uū', 'āa', 'āe', 'āi', 'āo', 'āu', 'āā', 'āī', 'āū', 'īa', 'īe', 'īi', 'īo', 'īu', 'īā', 'īī', 'īū', 'ūa', 'ūe', 'ūi', 'ūo', 'ūu', 'ūā', 'ūī', 'ūū');
  139. $replace = array('a-a', 'a-e', 'a-i', 'a-o', 'a-u', 'a-ā', 'a-ī', 'a-ū', 'e-a', 'e-e', 'e-i', 'e-o', 'e-u', 'e-ā', 'e-ī', 'e-ū', 'i-a', 'i-e', 'i-i', 'i-o', 'i-u', 'i-ā', 'i-ī', 'i-ū', 'o-a', 'o-e', 'o-i', 'o-o', 'o-u', 'o-ā', 'o-ī', 'o-ū', 'u-a', 'u-e', 'u-i', 'u-o', 'u-u', 'u-ā', 'u-ī', 'u-ū', 'ā-a', 'ā-e', 'ā-i', 'ā-o', 'ā-u', 'ā-ā', 'ā-ī', 'ā-ū', 'ī-a', 'ī-e', 'ī-i', 'ī-o', 'ī-u', 'ī-ā', 'ī-ī', 'ī-ū', 'ū-a', 'ū-e', 'ū-i', 'ū-o', 'ū-u', 'ū-ā', 'ū-ī', 'ū-ū');
  140. //将双元音拆开
  141. //step 1 : split at diphthong . ~aa~ -> ~a-a~
  142. $word1 = str_replace($search, $replace, $word);
  143. //按连字符拆开处理
  144. $arrword = str_getcsv($word1, "-");
  145. return $arrword;
  146. }
  147. /**
  148. * 查询单词是否存在
  149. * @param string $word
  150. * @return array(int $wordWeight, int $endingLenght)
  151. */
  152. public function dict_lookup($word)
  153. {
  154. global $case; //语尾表
  155. if (strlen($word) <= 1) {
  156. return array(0,0);
  157. }
  158. //去掉单词首尾的 []
  159. if(mb_substr($word,0,1) !== "["){
  160. $search = $word;
  161. }
  162. else{
  163. $search = str_replace("[","",$word);
  164. $search = str_replace("]","",$search);
  165. }
  166. //获取单词权重
  167. $row = Cache::remember('palicanon/wordpart/weight/'.$search, 100 , function() use($search) {
  168. return WordPart::where('word',$search)->value('weight');
  169. });
  170. if ($row) {
  171. //找到
  172. return array($row,0);
  173. } else {
  174. //去除尾查
  175. $newWord = array();
  176. for ($row = 0; $row < count($case); $row++) {
  177. $len = mb_strlen($case[$row][1], "UTF-8");
  178. $end = mb_substr($search, 0 - $len, null, "UTF-8");
  179. if ($end == $case[$row][1]) {
  180. $base = mb_substr($search, 0, mb_strlen($search, "UTF-8") - $len, "UTF-8") . $case[$row][0];
  181. if ($base != $search) {
  182. $newWord[$base] = mb_strlen($case[$row][1],"UTF-8");
  183. }
  184. }
  185. }
  186. #找到最高频的base
  187. $base_weight = 0;
  188. $len = 0;
  189. foreach ($newWord as $x => $x_value) {
  190. $row = Cache::remember('palicanon/wordpart/weight/'.$search, 100 , function() use($x) {
  191. return WordPart::where('word',$x)->value('weight');
  192. });
  193. if ($row) {
  194. if ($row > $base_weight) {
  195. $base_weight = $row;
  196. $len=$x_value;
  197. }
  198. }
  199. }
  200. return array($base_weight,$len);
  201. }
  202. }
  203. /**
  204. * 查找某个单词是否在现有词典出现
  205. * 返回信心指数
  206. * look up single word in dictionary vocabulary
  207. * return the confidence value
  208. *
  209. *
  210. *
  211. */
  212. public function isExsit($word, $adj_len = 0){
  213. $this->log("正在查询:{$word}");
  214. $isFound = false;
  215. $count = 0;
  216. $cacheKey = "turbosplit/part/";
  217. if (isset($this->part["{$word}"])) {
  218. $word_count = $this->part["{$word}"][0];
  219. $case_len = $this->part["{$word}"][1];
  220. if ($word_count > 0) {
  221. $this->log("查到:{$word}:{$word_count}个");
  222. $isFound = true;
  223. $count = $word_count + 1;
  224. }
  225. } else {
  226. $db = $this->dict_lookup($word);
  227. $word_count = $db[0];
  228. $case_len = $db[1];
  229. //加入查询缓存
  230. $this->part["{$word}"] = $db;
  231. if ($word_count > 0) {
  232. Log::info("查到:{$word}:{$word_count}个");
  233. $isFound = true;
  234. $count = $word_count + 1;
  235. }
  236. }
  237. //fomular of confidence value 信心值计算公式
  238. if ($isFound) {
  239. if (isset($this->confidence["{$word}"])) {
  240. $cf = $this->confidence["{$word}"];
  241. } else {
  242. $len = mb_strlen($word, "UTF-8") - $case_len;
  243. $len_correct = 1.2;
  244. $count2 = 1.1 + pow($count, 1.18);
  245. $conf_num = pow(1 / $count2, pow(($len - 0.5), $len_correct));
  246. $cf = round(1 / (1 + 640 * $conf_num), 9);
  247. //$cf = round((1-0.02*$case_len) / (1 + 640 * $conf_num), 9);
  248. $this->confidence["{$word}"] = $cf;
  249. Log::info("信心指数:{$word}:{$cf}");
  250. }
  251. return ($cf);
  252. } else {
  253. return (-1);
  254. }
  255. }
  256. /**
  257. * 核心拆分函数
  258. * $strWord, word to be look up 要查询的词
  259. * $deep, 当前递归深度
  260. * $express=true, 快速查询
  261. * $adj_len=0 长度校正系数
  262. * $c_threshhold 信心指数阈值
  263. *
  264. *
  265. *
  266. */
  267. function split($strWord, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
  268. {
  269. $this->log("spliting word={$strWord} deep={$deep}");
  270. $output = array();
  271. #当前搜索路径信心指数,如果过低,马上终止这个路径的搜索
  272. if($deep == 0){
  273. $this->currPathCf = 1;
  274. }
  275. //达到最大搜索深度,返回
  276. if ($deep >= $this->MAX_DEEP) {
  277. $word = "";
  278. $cf = 1.0;
  279. for ($i = 0; $i < $deep; $i++) {
  280. if (!empty($this->path[$i][0])) {
  281. $word .= $this->path[$i][0] ;
  282. if($isDebug) {
  283. $word .= "(" . $this->path[$i][1] . ")";
  284. }
  285. $word .= "+";
  286. $cf = $cf * $this->path[$i][1];
  287. }
  288. }
  289. $len = pow(mb_strlen($strWord, "UTF-8"), 3);
  290. $cf += (0 - $len) / ($len + 150);
  291. $word .= "{$strWord}";
  292. if ($forward == true) {
  293. $this->result[$word] = $cf;
  294. return 0;
  295. } else {
  296. $reverseWord = word_reverse($word);
  297. $this->result[$reverseWord] = $cf;
  298. return 0;
  299. }
  300. }
  301. //直接找到
  302. $confidence = isExsit($strWord, $adj_len);
  303. if ($confidence > $c_threshhold) {
  304. $output[] = array($strWord, "", $confidence);
  305. }
  306. else {
  307. $confidence = isExsit("[" . $strWord . "]");
  308. if ($confidence > $c_threshhold) {
  309. $output[] = array("[" . $strWord . "]", "", $confidence);
  310. }
  311. }
  312. //如果开头有双辅音,去掉第一个辅音。因为巴利语中没有以双辅音开头的单词。
  313. $doubleword = "kkggccjjṭṭḍḍttddppbb";
  314. if (mb_strlen($strWord, "UTF-8") > 2) {
  315. $left2 = mb_substr($strWord, 0, 2, "UTF-8");
  316. if (mb_strpos($doubleword, $left2, 0, "UTF-8") !== false) {
  317. $strWord = mb_substr($strWord, 1, null, "UTF-8");
  318. }
  319. }
  320. $len = mb_strlen($strWord, "UTF-8");
  321. if ($len > 2) {
  322. if ($forward) {
  323. #正向切
  324. $this->log("正向切");
  325. for ($i = $len; $i > 1; $i--) {
  326. foreach ($this->sandhi as $key => $row) {
  327. //应用连音规则切分单词
  328. if ($sandhi_advance == false && $row["advance"] == true) {
  329. //continue;
  330. }
  331. if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") == $row["c"]) {
  332. $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
  333. $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
  334. $confidence = isExsit($str1, $adj_len)*$row["cf"];
  335. if ($confidence > $c_threshhold) {
  336. //信心指数大于预设的阈值,插入
  337. $output[] = array($str1, $str2, $confidence, $row["adj_len"]);
  338. $this->log("插入结构数组:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}");
  339. if ($express) {
  340. break;
  341. }
  342. }
  343. }
  344. }
  345. }
  346. } else {
  347. #反向切
  348. for ($i = 1; $i < $len - 1; $i++) {
  349. foreach ($this->sandhi as $key => $row) {
  350. if ($sandhi_advance == false && $row["advance"] == true) {
  351. //continue;
  352. }
  353. if (mb_substr($strWord, $i, $row["len"], "UTF-8") == $row["c"]) {
  354. $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
  355. $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
  356. $confidence = isExsit($str2, $adj_len)*$row["cf"];
  357. if ($confidence > $c_threshhold) {
  358. $output[] = array($str2, $str1, $confidence, $row["adj_len"]);
  359. $this->log("将此次结果插入结果数组:剩余={$str2}");
  360. if ($express) {
  361. break;
  362. }
  363. }
  364. }
  365. }
  366. }
  367. }
  368. }
  369. $word = "";
  370. $this->log("结果数组个数:".count($output));
  371. if (count($output) > 0) {
  372. foreach ($output as $part) {
  373. $checked = $part[0];
  374. $remainder = $part[1];
  375. $this->log("剩余部分:{$remainder}");
  376. $this->path[$deep][0] = $checked;
  377. $this->path[$deep][1] = $part[2];
  378. if (empty($remainder)) {
  379. #全切完了
  380. $this->log("全切完了");
  381. $word = "";
  382. $cf = 1.0;
  383. for ($i = 0; $i < $deep; $i++) {
  384. $word .= $this->path[$i][0];
  385. if ($this->isDebug) {
  386. $word .= "(" . $this->path[$i][1] . ")";
  387. }
  388. $word .= "+";
  389. $cf = $cf * $this->path[$i][1];
  390. }
  391. if ($this->isDebug) {
  392. $word .= $checked . "({$part[2]})";
  393. } else {
  394. $word .= $checked;
  395. }
  396. $cf = $cf * $part[2];
  397. if ($cf > $w_threshhold) {
  398. if ($forward == true) {
  399. $this->result[$word] = $cf;
  400. return 0;
  401. } else {
  402. $reverseWord = word_reverse($word);
  403. $this->result[$reverseWord] = $cf;
  404. return 0;
  405. }
  406. }
  407. } else {
  408. #计算当前信心指数
  409. $cf = 1.0;
  410. for ($i = 0; $i < $deep; $i++) {
  411. $cf = $cf * $this->path[$i][1];
  412. }
  413. $this->log("计算当前信心指数:{$cf}");
  414. if($cf<$w_threshhold){
  415. $this->log("信心指数过低,提前返回 {$cf}");
  416. return 0;
  417. }else{
  418. #接着切
  419. $this->log("接着切:{$remainder}");
  420. $this->split($remainder, ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
  421. }
  422. }
  423. }
  424. }else {
  425. #尾巴查不到了
  426. $this->log("尾巴查不到了");
  427. $word = "";
  428. $cf = 1.0;
  429. for ($i = 0; $i < $deep; $i++) {
  430. $word .= $this->path[$i][0];
  431. if ($this->isDebug) {
  432. $word .= "(" . $this->path[$i][1] . ")";
  433. }
  434. $word .= "+";
  435. $cf = $cf * $this->path[$i][1];
  436. }
  437. $len = pow(mb_strlen($strWord, "UTF-8"), 3);
  438. if ($forward) {
  439. $cf =(1-$cf) * $len / ($len + 150);
  440. } else {
  441. $cf =(1-$cf) * $len / ($len + 5);
  442. }
  443. if ($this->isDebug) {
  444. $word = $word.$strWord . "(0)";
  445. } else {
  446. $word = $word .$strWord;
  447. }
  448. if ($cf > $w_threshhold) {
  449. if ($forward == true) {
  450. $this->result[$word] = $cf;
  451. return 0;
  452. }
  453. else {
  454. $reverseWord = word_reverse($word);
  455. $this->result[$reverseWord] = $cf;
  456. return 0;
  457. }
  458. }
  459. }
  460. }
  461. /**
  462. * 颠倒词序
  463. */
  464. public function word_reverse($word)
  465. {
  466. $reverse = array();
  467. $newword = explode("+", $word);
  468. $len = count($newword);
  469. if ($len > 0) {
  470. for ($i = $len - 1; $i >= 0; $i--) {
  471. # code...
  472. $reverse[] = $newword[$i];
  473. }
  474. $output = implode("+", $reverse);
  475. return $output;
  476. } else {
  477. return $word;
  478. }
  479. }
  480. /**
  481. * 拆分后的处理
  482. */
  483. public function split2($word){
  484. $input = explode("+",$word);
  485. $newword=array();
  486. foreach ($input as $value) {
  487. //去掉带小括号的调试信息
  488. $word = strstr($value,"(",true);
  489. if($word == false){
  490. $word = $value;
  491. }
  492. if(mb_strlen($word,"UTF-8")>4){
  493. # 先看有没有中文意思
  494. Log::info("先看有没有中文意思");
  495. if(UserDict::where('word',$word)->where('mean','<>','')->where('language','<>','my')->exists()){
  496. $newword[]=$word;
  497. }else{
  498. Log::info("如果没有查巴缅替换拆分");
  499. #如果没有查巴缅替换拆分
  500. if(UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
  501. $pmPart = explode("+",UserDict::where('word',$word)->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
  502. foreach ($pmPart as $pm) {
  503. # code...
  504. $newword[]=$pm;
  505. }
  506. }
  507. else{
  508. Log::info("如果没有查规则变形");
  509. #如果没有查规则变形
  510. if(UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->exists()){
  511. $rglPart = explode("+",UserDict::where('word',$word)->where('source','_SYS_REGULAR_')->value('factors')) ;
  512. #看巴缅有没有第一部分
  513. Log::info("看巴缅有没有第一部分");
  514. if(UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->exists()){
  515. $pmPart = explode("+",UserDict::where('word',$rglPart[0])->where('dict_id','61f23efb-b526-4a8e-999e-076965034e60')->value('factors')) ;
  516. foreach ($pmPart as $pm) {
  517. # code...
  518. $newword[]=$pm;
  519. }
  520. }
  521. else{
  522. #没有
  523. $newword[]=$rglPart[0];
  524. }
  525. $newword[]=$rglPart[1];
  526. }
  527. else{
  528. #还没有就认命了
  529. Log::info("还没有就认命了");
  530. $newword[]=$word;
  531. }
  532. }
  533. }
  534. }
  535. else{
  536. $newword[]=$word;
  537. }
  538. }
  539. return implode("+",$newword);
  540. }
  541. /**
  542. * 预处理连音词
  543. */
  544. public function splitSandhi($word){
  545. $newWord = "";
  546. $firstWord=$word;
  547. do {
  548. $isFound = false;
  549. foreach ($this->sandhi2 as $key => $sandhi) {
  550. # code...
  551. $len = $sandhi["len"];
  552. $end = mb_substr($firstWord, 0 - $len, null, "UTF-8");
  553. if ($end == $sandhi["c"]) {
  554. $word1 = mb_substr($firstWord, 0, mb_strlen($firstWord, "UTF-8") - $len, "UTF-8") .$sandhi["a"];
  555. $word2 = $sandhi["b"];
  556. $newWord = $word2 . "-" .$newWord;
  557. $firstWord = $word1;
  558. $isFound=true;
  559. break;
  560. }
  561. }
  562. } while ($isFound);
  563. $newWord = $firstWord . "-" .$newWord;
  564. return mb_substr($newWord,0,-1, "UTF-8");
  565. }
  566. public function splitA($word){
  567. $output = array();
  568. //预处理连音词
  569. $word = $this->splitSandhi($word);
  570. # 处理双元音
  571. Log::info("处理双元音");
  572. $arrword = $this->splitDiphthong($word);
  573. if (count($arrword) > 1) {
  574. array_push($output,['word'=>$word,'factors'=>implode("+", $arrword),'confidence'=>0.9999]);
  575. }
  576. foreach ($arrword as $oneword) {
  577. $this->result = array(); //清空递归程序的输出容器
  578. if(mb_strlen($oneword)>35){
  579. //长词使用快速切分 正向切分 不使用少见sandi规则
  580. $this->split($oneword, 0, true, 0.8, 0.9, 0, true, false);
  581. $min_result = 1;
  582. }else{
  583. $this->split($oneword, 0, false, 0.8, 0.9, 0, true, false);
  584. $min_result=3;
  585. }
  586. Log::info("正向切分结束 结果数量".count($this->result));
  587. if(count($this->result)<$min_result){
  588. //有效结果过少
  589. $this->split($oneword, 0, false, 0.2, 0.8, 0, true, true);
  590. Log::info("有效结果过少 再次正切".count($this->result) );
  591. if(count($this->result)<2){
  592. $this->split($oneword, 0, false, 0.2, 0.8, 0, false, true);
  593. Log::info("有效结果过少 再次反切:结果数量" . count($this->result));
  594. }
  595. }
  596. //echo "{$start}-{$oneword}:" . count($result) . "\n";
  597. if (count($this->result) > 0) {
  598. arsort($this->result); //按信心指数排序
  599. $iCount = 0;
  600. foreach ($this->result as $row => $value) {
  601. array_push($output,['word'=>$oneword,'factors'=>$row,'confidence'=>$value]);
  602. //后处理 进一步切分没有意思的长词
  603. Log::info("后处理 进一步切分没有意思的长词");
  604. $new = $this->split2($row);
  605. if($new!==$row){
  606. array_push($output,['word'=>$oneword,'factors'=>$new,'confidence'=>$value]);
  607. #再处理一次
  608. $new2 = split2($new);
  609. if($new2!==$new){
  610. array_push($output,['word'=>$oneword,'factors'=>$new2,'confidence'=>$value]);
  611. }
  612. }
  613. $iCount++;
  614. if ($iCount > $this->MAX_RESULT2) {
  615. break;
  616. }
  617. }
  618. } else {
  619. Log::error("{$oneword} 切分失败");
  620. }
  621. }
  622. return $output;
  623. }
  624. public function setting($param=null){
  625. }
  626. public function getResult(){
  627. return $this->result;
  628. }
  629. private function log($message){
  630. if ($this->isDebug) {
  631. Log::info($message);
  632. }
  633. }
  634. private function pushResult($word,$cf){
  635. array_push($this->result,array($word=>$cf));
  636. }
  637. }