TurboSplit.php 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. <?php
  2. namespace App\Tools;
  3. require_once __DIR__.'/../../public/app/public/casesuf.inc';
  4. use Illuminate\Support\Facades\Cache;
  5. use Illuminate\Support\Facades\Log;
  6. use Illuminate\Support\Facades\DB;
  7. use App\Models\WordPart;
  8. use App\Models\UserDict;
  9. use Illuminate\Support\Arr;
  10. class TurboSplit
  11. {
  /**
   * Default splitting options.
   * NOTE(review): within this class only "lookup_express" is read (by
   * dict_lookup()); the remaining keys have the same names as split()
   * parameters but are not consulted here — confirm whether they are still needed.
   */
  protected $options = [
      "express" => false,
      "c_threshhold" => 0.8,
      "w_threshhold" => 0.8,
      "forward" => true,
      "sandhi_advance" => false,
      "lookup_express" => true,/** fast dictionary lookup: do not strip case endings */
  ];
  // NOTE(review): not referenced by any method visible in this class.
  protected $node = [];
  // Filled by the constructor with MAX_DEEP placeholder entries of ["", 0].
  protected $path = array();
  // When true, log() forwards messages to Log::info().
  protected $isDebug = false;
  # Confidence of the search path currently being explored; the original note
  # says a path should be abandoned as soon as this gets too low.
  # NOTE(review): split() only resets it to 1 at depth 0 and never reads it.
  protected $currPathCf;
  // Result map: "factor+factor+..." => confidence.
  protected $result = array();
  // Maximum number of intermediate results (NOTE(review): not referenced in this class).
  protected $MAX_RESULT = 100;
  // Maximum number of ranked results processed by splitA().
  protected $MAX_RESULT2 = 5;
  // Maximum recursion depth of split().
  protected $MAX_DEEP = 16;
  /**
   * Sandhi (sound-junction) rule table used by split().
   *
   * Each rule describes how a surface string "c" found inside a word is
   * undone into the end of the left part ("a") and the start of the right
   * part ("b"):
   *   - "a":       text appended to the left piece after removing "c"
   *   - "b":       text prepended to the right piece
   *   - "c":       surface text searched for in the word
   *   - "len":     number of characters of "c" (used as the mb_substr length)
   *   - "adj_len": length-correction value (NOTE(review): no longer consumed by split())
   *   - "advance": marks less common rules (NOTE(review): the skip in split() is disabled)
   *   - "cf":      factor multiplied into the dictionary confidence of the piece
   *
   * Rules are tried in the order listed.
   */
  protected $sandhi = [
      ["a" => "", "b" => "", "c" => "", "len" => 0, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "a", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "ā", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ā", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "i", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "o", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "u", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "u", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "u", "c" => "u", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "ī", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "ū", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "i", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "e", "b" => "a", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "i", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "a", "c" => "ya", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "atth", "c" => "atth", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "taṃ", "b" => "n", "c" => "tann", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[o]", "b" => "iva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "o", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "ādi", "c" => "ādi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a[ānaṃ]", "b" => "a", "c" => "ānama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "ṃ", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "a", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "ī", "b" => "[ṃ]", "c" => "im", "len" => 2, "adj_len" => 0, "advance" => false,"cf"=>0.8],
      ["a" => "ati", "b" => "tabba", "c" => "atabba", "len" => 6, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ati", "b" => "tabba", "c" => "itabba", "len" => 6, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "iti", "b" => "a", "c" => "icca", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "uṃ", "b" => "a", "c" => "uma", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u[ūnaṃ]", "b" => "a", "c" => "ūnama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ī[īnaṃ]", "b" => "a", "c" => "īnama", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "su", "b" => "a", "c" => "sva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "ī", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "ū", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "ṃ", "b" => "cāti", "c" => "ñcāti", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "ṃ", "b" => "cet", "c" => "ñcet", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ṃ", "b" => "ev", "c" => "mev", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.99999],
      ["a" => "a", "b" => "a", "c" => "a", "len" => 1, "adj_len" => -1, "advance" => true,"cf"=>0.99],
      ["a" => "ī", "b" => "", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => true,"cf"=>0.9],
  ];
  /**
   * Word-final sandhi rules used by splitSandhi() to peel particles off the
   * end of a word before the main split.
   *
   * splitSandhi() compares the last "len" characters of the word with "c";
   * on a match the tail is replaced by "a" and "b" is emitted as a separate
   * piece. The first matching rule in list order wins. The "adj_len",
   * "advance" and "cf" fields are not read by splitSandhi().
   */
  protected $sandhi2 = [
      ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "ṃ", "b" => "hi", "c" => "ñhi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "ena", "b" => "iti", "c" => "enāti", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>1.0],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.6],
      ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9],
      ["a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "eva", "c" => "uyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ṃ", "b" => "eva", "c" => "ṃyeva", "len" => 5, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ṃ", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "eva", "c" => "veva", "len" => 4, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
      ["a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false,"cf"=>0.9999],
  ];
  /**
   * Build a splitter and pre-size the path buffer.
   *
   * One ["", 0] placeholder is appended to $this->path for every
   * recursion level allowed by MAX_DEEP.
   */
  public function __construct()
  {
      $level = 0;
      while ($level < $this->MAX_DEEP) {
          $this->path[] = array("", 0);
          $level++;
      }
  }
  /**
   * Cut a word at every pair of adjacent vowels.
   *
   * A hyphen is inserted between two neighbouring vowels (e.g. "aā" becomes
   * "a-ā") and the word is then split on hyphens, so hyphens already present
   * in the input also act as separators.
   *
   * @param string $word word to split
   * @return string[] the pieces, in order; a word without vowel pairs or
   *                  hyphens yields a single-element array
   */
  public function splitDiphthong($word)
  {
      // Same vowel order as the original hand-written 64-entry table:
      // pairs are generated row by row (first vowel) then column (second vowel).
      $vowels = ['a', 'e', 'i', 'o', 'u', 'ā', 'ī', 'ū'];
      $search = [];
      $replace = [];
      foreach ($vowels as $first) {
          foreach ($vowels as $second) {
              $search[] = $first . $second;
              $replace[] = $first . '-' . $second;
          }
      }

      // step 1: split at diphthong, ~aa~ -> ~a-a~
      $hyphenated = str_replace($search, $replace, (string) $word);

      // step 2: plain delimiter split. explode() is used instead of
      // str_getcsv() so that quote characters are not given CSV meaning and
      // an empty word yields [""] rather than [null].
      return explode('-', $hyphenated);
  }
  /**
   * Look a word up in the word-part table and return its weight.
   *
   * The exact form is tried first. If it is unknown and the
   * "lookup_express" option is off, every case ending from the global
   * $case table that matches the end of the word is replaced by its base
   * ending and the resulting candidate bases are looked up; the heaviest
   * candidate wins.
   *
   * @param string $word
   * @return array [weight, endingLength] — weight is 0 when nothing was
   *               found; endingLength is the character length of the
   *               stripped case ending (0 for an exact hit)
   */
  public function dict_lookup($word)
  {
      global $case; // case-ending table: each row is [base ending, inflected ending]

      if (strlen($word) <= 1) {
          return array(0, 0);
      }

      $weight = $this->lookupWeight($word);
      if ($weight) {
          // exact hit
          return array($weight, 0);
      }
      if ($this->options["lookup_express"]) {
          // fast mode: do not try to strip case endings
          return array(0, 0);
      }

      // Build candidate bases by undoing each matching case ending.
      $candidates = array();
      $wordLen = mb_strlen($word, "UTF-8");
      foreach ((is_array($case) ? $case : array()) as $rule) {
          $endingLen = mb_strlen($rule[1], "UTF-8");
          $tail = mb_substr($word, 0 - $endingLen, null, "UTF-8");
          if ($tail == $rule[1]) {
              $base = mb_substr($word, 0, $wordLen - $endingLen, "UTF-8") . $rule[0];
              if ($base != $word) {
                  $candidates[$base] = $endingLen;
              }
          }
      }

      // Pick the candidate base with the highest weight.
      $base_weight = 0;
      $len = 0;
      foreach ($candidates as $base => $endingLen) {
          // Each candidate is cached under its own key. The previous code
          // reused the key of the original word, so all candidates shared
          // one cache entry and returned the first candidate's weight.
          $weight = $this->lookupWeight($base);
          if ($weight && $weight > $base_weight) {
              $base_weight = $weight;
              $len = $endingLen;
          }
      }
      return array($base_weight, $len);
  }

  /**
   * Cached weight of a single word in the word-part table.
   *
   * @param string $word
   * @return mixed the stored weight, or null when the word is unknown
   */
  private function lookupWeight($word)
  {
      return Cache::remember(
          'palicanon/wordpart/weight/' . $word,
          env('CACHE_EXPIRE', 3600 * 24),
          function () use ($word) {
              return WordPart::where('word', $word)->value('weight');
          }
      );
  }
  /**
   * Check whether a word occurs in the dictionary vocabulary and return a
   * confidence value for it.
   *
   * The dictionary result (weight and stripped-ending length) and the
   * derived confidence are both cached per word.
   *
   * Confidence formula, with n = weight + 1 and L = word length minus the
   * stripped ending length:
   *   1 / (1 + 640 * (1 / (1.1 + n^1.18)) ^ ((L - 0.5)^1.2)), rounded to 9 places.
   *
   * @param string $word
   * @param int $adj_len length correction; accepted but not used
   * @return float|int confidence, or -1 when the word is unknown
   */
  public function isExsit($word, $adj_len = 0)
  {
      $this->log("正在查询:{$word}");

      $cachedPart = Cache::remember(
          "turbosplit/part/{$word}",
          env('CACHE_EXPIRE', 3600 * 24),
          function () use ($word) {
              return implode(',', $this->dict_lookup($word));
          }
      );
      $fields = explode(',', $cachedPart);
      $word_count = $fields[0];
      $case_len = $fields[1];

      if (!($word_count > 0)) {
          // unknown word
          return (-1);
      }

      $this->log("查到:{$word}:{$word_count}个");
      $count = $word_count + 1;

      return Cache::remember(
          "turbosplit/confidence/" . $word,
          env('CACHE_EXPIRE', 1000),
          function () use ($word, $count, $case_len) {
              $effectiveLen = mb_strlen($word, "UTF-8") - $case_len;
              $lenExponent = 1.2;
              $dampedCount = 1.1 + pow($count, 1.18);
              $penalty = pow(1 / $dampedCount, pow(($effectiveLen - 0.5), $lenExponent));
              return round(1 / (1 + 640 * $penalty), 9);
          }
      );
  }
  /**
   * Core recursive splitter.
   *
   * Works on a tree of nodes. Each node is an array with the keys:
   *   - word:     the piece recognised at this step
   *   - remain:   the text that is still to be split
   *   - cf:       confidence of this piece
   *   - sum_cf:   product of the confidences from the root down to this node
   *   - children: candidate continuations (filled in by this method)
   *
   * The method looks at $node['remain'], appends every acceptable
   * continuation to $node['children'] and recurses into each child that
   * still has a non-empty remainder. Nothing is returned; the tree passed
   * by reference is the output.
   *
   * @param array $node           node to expand (modified in place)
   * @param int   $deep           current recursion depth
   * @param bool  $express        fast mode: at each cut position stop after the first accepted rule
   * @param int   $adj_len        length correction, passed on to isExsit()
   * @param float $c_threshhold   minimum confidence for a piece and for the accumulated path
   * @param float $w_threshhold   kept for interface compatibility; not used
   * @param bool  $forward        true: cut pieces off the front; false: cut pieces off the back
   * @param bool  $sandhi_advance kept for interface compatibility; rules flagged
   *                              "advance" are currently always applied
   */
  public function split(&$node, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
  {
      $strWord = $node["remain"];
      $this->log("spliting word={$strWord} deep={$deep}");

      if ($deep == 0) {
          $this->currPathCf = 1;
      }
      // maximum search depth reached
      if ($deep >= $this->MAX_DEEP) {
          return;
      }

      // Accumulated confidence of the path that leads to this node.
      $parentSumCf = $node['sum_cf'] ?? 1;
      // Number of candidates that passed the per-piece threshold (debug log only).
      $candidateCount = 0;

      // 1. The whole remainder is itself a known word.
      $confidence = $this->isExsit($strWord, $adj_len);
      if ($confidence > $c_threshhold) {
          $candidateCount++;
          $node['children'][] = ['word' => $strWord, 'remain' => "", 'cf' => $confidence, "sum_cf" => $parentSumCf * $confidence];
          $this->log("直接找到{$strWord}-{$confidence}");
      } elseif (mb_strlen($strWord, "UTF-8") < 6) {
          // 2. Short remainder: try it as a bracketed ending entry.
          $search = "[{$strWord}]";
          $confidence = $this->isExsit($search);
          $this->log("查询:{$search}-信心指数{$confidence}");
          if ($confidence > $c_threshhold) {
              $candidateCount++;
              $node['children'][] = ['word' => $search, 'remain' => "", 'cf' => $confidence, "sum_cf" => $parentSumCf * $confidence];
              $this->log("直接找到{$strWord}-{$confidence}");
          }
      }

      // 3. A leading doubled consonant is a sandhi artefact (per the original
      // note, no Pali word begins with one): drop its first letter. The pair
      // is compared against an explicit list; a substring search in the
      // concatenated list would also match cross pairs such as "kg" or "td".
      $doubleConsonants = ['kk', 'gg', 'cc', 'jj', 'ṭṭ', 'ḍḍ', 'tt', 'dd', 'pp', 'bb'];
      if (mb_strlen($strWord, "UTF-8") > 2
          && in_array(mb_substr($strWord, 0, 2, "UTF-8"), $doubleConsonants, true)) {
          $strWord = mb_substr($strWord, 1, null, "UTF-8");
      }

      // 4. Try every cut position with every sandhi rule.
      $len = mb_strlen($strWord, "UTF-8");
      if ($len > 2) {
          if ($forward) {
              // forward: longest head first; the head must be a known word
              $this->log("正向切");
              for ($i = $len; $i > 1; $i--) {
                  foreach ($this->sandhi as $row) {
                      if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") != $row["c"]) {
                          continue;
                      }
                      $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
                      $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
                      $confidence = $this->isExsit($str1, $adj_len) * $row["cf"];
                      if ($confidence > $c_threshhold) {
                          $candidateCount++;
                          $sum_cf = $parentSumCf * $confidence;
                          // keep the branch only while the whole path stays above the threshold
                          if ($sum_cf > $c_threshhold) {
                              $node['children'][] = [
                                  'word' => $str1,
                                  'remain' => $str2,
                                  'cf' => $confidence,
                                  'sum_cf' => $sum_cf,
                                  'children' => [],
                              ];
                          }
                          $this->log("插入结构数组:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}");
                          if ($express) {
                              break;
                          }
                      }
                  }
              }
          } else {
              // backward: the tail must be a known word, the head becomes the remainder
              for ($i = 1; $i < $len - 1; $i++) {
                  foreach ($this->sandhi as $row) {
                      if (mb_substr($strWord, $i, $row["len"], "UTF-8") != $row["c"]) {
                          continue;
                      }
                      $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
                      $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
                      $confidence = $this->isExsit($str2, $adj_len) * $row["cf"];
                      if ($confidence > $c_threshhold) {
                          $candidateCount++;
                          $sum_cf = $parentSumCf * $confidence;
                          if ($sum_cf > $c_threshhold) {
                              $node['children'][] = [
                                  'word' => $str2,
                                  'remain' => $str1,
                                  'cf' => $confidence,
                                  'sum_cf' => $sum_cf,
                                  'children' => [],
                              ];
                          }
                          $this->log("将此次结果插入结果数组:剩余={$str2}");
                          if ($express) {
                              break;
                          }
                      }
                  }
              }
          }
      }

      $this->log("结果数组个数:" . $candidateCount);

      // 5. Recurse into every child that still has text left to split.
      // The null-coalescing guard covers nodes passed in without a 'children' key.
      foreach (array_keys($node['children'] ?? []) as $key) {
          if (!empty($node['children'][$key]['remain'])) {
              $this->split($node['children'][$key], ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
          }
      }
  }
  /**
   * Walk the split tree depth-first and record every complete split.
   *
   * A leaf whose 'remain' is empty represents a fully consumed word: the
   * words on the path from the root are joined with "+" and stored in
   * $this->result with the leaf's accumulated confidence. Leaves that
   * still have a remainder are dead ends and are ignored.
   *
   * @param array $node current tree node
   * @param array $path words collected from the root down to the parent (restored on return)
   */
  private function get_result($node, &$path)
  {
      $path[] = $node['word'];

      $hasChildren = isset($node['children']) && count($node['children']) > 0;
      if ($hasChildren) {
          foreach ($node['children'] as $child) {
              $this->get_result($child, $path);
          }
      } elseif (empty($node['remain'])) {
          // the root contributes an empty word, hence the trim
          $joined = trim(implode("+", $path), '+');
          $this->result[$joined] = $node['sum_cf'];
      }

      array_pop($path);
  }
  /**
   * Reverse the order of the "+"-separated parts of a factor string.
   *
   * @param string $word e.g. "a+b+c"
   * @return string the same parts in opposite order, e.g. "c+b+a"
   */
  public function word_reverse($word)
  {
      $parts = explode("+", $word);
      return implode("+", array_reverse($parts));
  }
  /**
   * Post-process a split: break up long parts that have no dictionary meaning.
   *
   * Each "+"-separated part longer than four characters is kept as is when
   * the user dictionary has a non-empty meaning for it (language other than
   * "my"). Otherwise it is replaced by
   *   1. the factors stored for it in the dictionary with the fixed id below
   *      (described in the original comments as the Pali–Burmese dictionary), or
   *   2. the factors of its "_SYS_REGULAR_" entry, whose first factor is in
   *      turn expanded through that same dictionary when possible.
   * Parts with no match are left unchanged.
   *
   * @param string $word factor string, optionally with "(…)" debug suffixes on parts
   * @return string the refined factor string
   */
  public function split2($word)
  {
      $pmDictId = '61f23efb-b526-4a8e-999e-076965034e60';
      $newword = array();

      foreach (explode("+", $word) as $value) {
          // drop the parenthesised debug information, if any
          $part = strstr($value, "(", true);
          if ($part === false || $part === '') {
              $part = $value;
          }

          if (mb_strlen($part, "UTF-8") <= 4) {
              $newword[] = $part;
              continue;
          }

          // a part that already has a meaning is kept whole
          $hasMeaning = UserDict::where('word', $part)
              ->where('mean', '<>', '')
              ->where('language', '<>', 'my')
              ->exists();
          if ($hasMeaning) {
              $newword[] = $part;
              continue;
          }

          // otherwise use the stored dictionary split
          $pmPart = $this->lookupFactors($part, 'dict_id', $pmDictId);
          if ($pmPart !== null) {
              foreach ($pmPart as $pm) {
                  $newword[] = $pm;
              }
              continue;
          }

          // otherwise use the regular-inflection entry
          $rglPart = $this->lookupFactors($part, 'source', '_SYS_REGULAR_');
          if ($rglPart === null) {
              // nothing found: keep the part as it is
              $newword[] = $part;
              continue;
          }

          // expand the first factor through the dictionary when possible
          $pmPart = $this->lookupFactors($rglPart[0], 'dict_id', $pmDictId);
          if ($pmPart !== null) {
              foreach ($pmPart as $pm) {
                  $newword[] = $pm;
              }
          } else {
              $newword[] = $rglPart[0];
          }
          // Append whatever follows the first factor. Reading index 1
          // unconditionally raised a warning for single-factor entries and
          // dropped any factor after the second.
          foreach (array_slice($rglPart, 1) as $rest) {
              $newword[] = $rest;
          }
      }

      return implode("+", $newword);
  }

  /**
   * Fetch the factor list of the first user-dictionary row matching a word
   * and one extra column condition, using a single query.
   *
   * @param string $word
   * @param string $column extra column to filter on ('dict_id' or 'source')
   * @param string $value  required value of that column
   * @return string[]|null factors split on "+", or null when no row matches
   */
  private function lookupFactors($word, $column, $value)
  {
      $entry = UserDict::where('word', $word)->where($column, $value)->first(['factors']);
      if ($entry === null) {
          return null;
      }
      return explode("+", (string) $entry->factors);
  }
  /**
   * Pre-process word-final sandhi.
   *
   * Repeatedly matches the end of the word against the $sandhi2 rules. On a
   * match the ending is replaced by the rule's "a" text and the rule's "b"
   * text becomes a separate trailing piece; matching then restarts on the
   * shortened word. The pieces are returned joined with "-".
   *
   * @param string $word
   * @return string e.g. "buddhañca" gives "buddhaṃ-ca"; a word with no
   *                matching ending is returned unchanged
   */
  public function splitSandhi($word)
  {
      $head = $word;
      $tailPieces = [];

      do {
          $matched = false;
          foreach ($this->sandhi2 as $rule) {
              $ruleLen = $rule["len"];
              $ending = mb_substr($head, 0 - $ruleLen, null, "UTF-8");
              if ($ending == $rule["c"]) {
                  $head = mb_substr($head, 0, mb_strlen($head, "UTF-8") - $ruleLen, "UTF-8") . $rule["a"];
                  // later matches sit closer to the front of the word
                  array_unshift($tailPieces, $rule["b"]);
                  $matched = true;
                  break;
              }
          }
      } while ($matched);

      array_unshift($tailPieces, $head);
      return implode("-", $tailPieces);
  }
  /**
   * Split a word into its components and return ranked analyses.
   *
   * Steps:
   *   1. peel word-final sandhi (splitSandhi) and cut at vowel pairs
   *      (splitDiphthong);
   *   2. for every resulting piece, build the split tree with split() and
   *      collect the complete splits; when too few are found, retry with a
   *      lower threshold, first forward and then backward;
   *   3. rank the splits by confidence and emit one entry per split, plus,
   *      for the best split, entries with a guessed base form, plus refined
   *      variants produced by split2().
   *
   * @param string $word
   * @return array[] list of entries with the keys word, type, grammar,
   *                 parent, factors and confidence
   */
  public function splitA($word)
  {
      $output = array();

      // pre-process word-final sandhi, then cut at vowel pairs
      $word1 = $this->splitSandhi($word);
      $this->log("处理双元音");
      $arrword = $this->splitDiphthong($word1);
      if (count($arrword) > 1) {
          $output[] = ['word' => $word, 'type' => '.un.', 'grammar' => '', 'parent' => '', 'factors' => implode("+", $arrword), 'confidence' => 0.9999];
      }

      foreach ($arrword as $oneword) {
          if ($oneword === '' || $oneword === null) {
              // an empty piece (e.g. from a trailing hyphen) cannot be split
              continue;
          }
          $this->result = array(); // reset the container filled by the tree walk

          // Very long pieces use express mode and need only one result.
          // NOTE(review): per split()'s signature the 4th argument is $adj_len
          // and the 5th is $c_threshhold, so the effective threshold is 0.9
          // here and 0.8 in the retries below — confirm this is intended.
          $isLongWord = mb_strlen($oneword) > 35;
          $min_result = $isLongWord ? 1 : 2;
          $node = ['word' => "", 'remain' => $oneword, 'children' => []];
          $this->split($node, 0, $isLongWord, 0.8, 0.9, 0, true, false);
          $this->log($node);
          $this->mergeTreeResults($node, true);
          $this->log("正向切分结束 结果数量" . count($this->result));

          if (count($this->result) < $min_result) {
              // Too few results: retry forward. The retry trees used to be
              // built and then discarded because they were never walked.
              $node = ['word' => "", 'remain' => $oneword, 'children' => []];
              $this->split($node, 0, false, 0.2, 0.8, 0, true, true);
              $this->mergeTreeResults($node, true);
              $this->log("有效结果过少 再次正切" . count($this->result));
              if (count($this->result) < 2) {
                  // still too few: retry from the back of the word
                  $node = ['word' => "", 'remain' => $oneword, 'children' => []];
                  $this->split($node, 0, false, 0.2, 0.8, 0, false, true);
                  $this->mergeTreeResults($node, false);
                  $this->log("有效结果过少 再次反切:结果数量" . count($this->result));
              }
          }

          $this->log("{$oneword}:" . count($this->result));
          if (count($this->result) === 0) {
              Log::error("{$oneword} 切分失败");
              continue;
          }

          arsort($this->result); // highest confidence first
          $iCount = 0;
          foreach ($this->result as $row => $confidence) {
              $factors = $row;
              if (strpos($row, ']+') !== false) {
                  $type = '.un.';
                  $factors = \str_replace(['+[ṃ]+', '[ṃ]+'], 'ṃ+', $row);
              } else {
                  $type = '.cp.';
              }
              $newword = ['word' => $oneword, 'type' => $type, 'grammar' => '', 'parent' => '', 'factors' => $factors, 'confidence' => $confidence];
              $output[] = $newword;

              if ($iCount == 0) {
                  // best split only: try to determine the base form
                  foreach ($this->guessBaseEntries($oneword, $row, $factors, $confidence) as $entry) {
                      $output[] = $entry;
                  }
              }

              // post-processing: further split long parts that have no meaning
              $this->log("后处理 进一步切分没有意思的长词");
              $new = $this->split2($row);
              if ($new !== $row) {
                  $newword['factors'] = $new;
                  $output[] = $newword;
                  // one more pass
                  $new2 = $this->split2($new);
                  if ($new2 !== $new) {
                      $newword['factors'] = $new2;
                      $output[] = $newword;
                  }
              }

              $iCount++;
              // ">=" so that exactly MAX_RESULT2 splits are processed (was one too many)
              if ($iCount >= $this->MAX_RESULT2) {
                  break;
              }
          }
      }
      return $output;
  }

  /**
   * Walk a split tree and merge its complete splits into $this->result.
   *
   * Trees built backward list the parts last-to-first, so their factor
   * strings are reversed before merging. When a split is already present
   * the higher confidence is kept.
   *
   * @param array $node    root of a tree built by split()
   * @param bool  $forward direction the tree was built in
   */
  private function mergeTreeResults($node, $forward)
  {
      $merged = $this->result;
      $this->result = array();
      $path = [];
      $this->get_result($node, $path);
      foreach ($this->result as $factors => $cf) {
          $key = $forward ? $factors : $this->word_reverse($factors);
          if (!isset($merged[$key]) || $cf > $merged[$key]) {
              $merged[$key] = $cf;
          }
      }
      $this->result = $merged;
  }

  /**
   * Build the entries that carry a guessed base form for the best split.
   *
   * The base candidates of the whole word are compared with those of the
   * last real factor (the one before a trailing bracketed ending, if any).
   * A candidate is accepted when the last two characters of both bases
   * agree, the types match, the first three characters of the grammar
   * match and the word-level confidence exceeds 0.5. When the last factor
   * has no base candidates at all, every word-level candidate is emitted
   * with confidence 0.1.
   *
   * @param string $oneword    the piece being analysed
   * @param string $row        raw factor string of the best split
   * @param string $factors    display factor string of the best split
   * @param mixed  $confidence confidence of the best split
   * @return array[] entries to append to the output
   */
  private function guessBaseEntries($oneword, $row, $factors, $confidence)
  {
      $entries = [];
      $wordWithType = ['word' => $oneword, 'type' => '', 'grammar' => '', 'parent' => '', 'factors' => $factors, 'confidence' => $confidence];

      $this->log("查找base");
      $parts = explode('+', $row);
      $endOfFactor = end($parts);
      if (strpos($endOfFactor, "[") !== false && count($parts) >= 2) {
          // a bracketed ending is not a word: use the factor before it
          $endOfFactor = $parts[count($parts) - 2];
      }
      $this->log("结尾词:" . $endOfFactor);

      $caseman = new CaseMan();
      // base candidates of the whole word
      $parents = $caseman->WordToBase($oneword, 1, false);
      // base candidates of the final factor
      $end_parents = $caseman->WordToBase($endOfFactor);

      foreach ($parents as $base => $case) {
          if (count($end_parents) > 0) {
              foreach ($end_parents as $base2 => $case2) {
                  if (\mb_substr($base2, -2) !== \mb_substr($base, -2)) {
                      continue;
                  }
                  $this->log("{$base} ok");
                  foreach ($case as $form) {
                      foreach ($case2 as $endForm) {
                          // check that the grammatical information agrees
                          if ($form['type'] == $endForm['type'] &&
                              substr($form['grammar'], 0, 3) === substr($endForm['grammar'], 0, 3) &&
                              $form['confidence'] > 0.5) {
                              $wordWithType['type'] = $form['type'];
                              $wordWithType['grammar'] = $form['grammar'];
                              $wordWithType['factors'] = $form['factors'];
                              $wordWithType['parent'] = $base;
                              $wordWithType['confidence'] = $endForm['confidence'];
                              $this->log("word:{$wordWithType['word']} ; type:{$wordWithType['type']}; grammar:{$wordWithType['grammar']};parent:{$wordWithType['parent']}");
                              $entries[] = $wordWithType;
                          }
                      }
                  }
              }
          } else {
              foreach ($case as $form) {
                  $wordWithType['type'] = $form['type'];
                  $wordWithType['grammar'] = $form['grammar'];
                  $wordWithType['factors'] = $form['factors'];
                  $wordWithType['parent'] = $base;
                  $wordWithType['confidence'] = 0.1;
                  $entries[] = $wordWithType;
              }
          }
      }
      return $entries;
  }
  /**
   * Placeholder for changing options; currently does nothing.
   *
   * @param mixed $param ignored
   */
  public function setting($param = null)
  {
  }
  /**
   * Return the result map collected by the most recent tree walk.
   *
   * @return array factor string => confidence
   */
  public function getResult()
  {
      $collected = $this->result;
      return $collected;
  }
  /**
   * Switch debug logging on or off.
   *
   * @param bool $debug when true, log() forwards messages to Log::info()
   */
  public function debug($debug)
  {
      $this->isDebug = $debug;
      return;
  }
  /**
   * Write a message to the application log, but only in debug mode.
   *
   * @param mixed $message
   */
  private function log($message)
  {
      if (!$this->isDebug) {
          return;
      }
      Log::info($message);
  }
  /**
   * Record one split in the result map.
   *
   * The result container is a map of factor string => confidence (that is
   * how get_result() fills it and how splitA() sorts and reads it), so the
   * entry is stored under its key. Previously a nested [word => cf] array
   * was appended as a list element, which does not fit that shape.
   *
   * @param string $word factor string
   * @param mixed  $cf   confidence
   */
  private function pushResult($word, $cf)
  {
      $this->result[$word] = $cf;
  }
  803. }