2
0

TurboSplit.php 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803
  1. <?php
  2. namespace App\Tools;
  3. require_once __DIR__ . '/../../public/app/public/casesuf.inc';
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\WordPart;
  6. use App\Models\UserDict;
  7. use App\Tools\RedisClusters;
  8. class TurboSplit
  9. {
  /**
   * Default options. Entries passed to the constructor override these by key.
   * Within this class only "lookup_declension" (dict_lookup) and "timeout"
   * (isTimeOut) are read; the other keys are stored but not consulted here.
   */
  protected $options = [
      "express" => false,
      "c_threshhold" => 0.8,
      "w_threshhold" => 0.8,
      "forward" => true,
      "sandhi_advance" => false,
      "lookup_declension" => true,
      'timeout' => 1000, // give up on timeout; isTimeOut() compares this with a time() difference, i.e. seconds
      /** quick dictionary lookup - do not strip the ending */
  ];
  // Value of time() when splitA() started; null until then (disables the timeout check).
  protected $started_at = null;
  protected $node = [];
  // Filled by the constructor with MAX_DEEP placeholder pairs ["", 0].
  protected $path = array();
  // When true, log() forwards messages to Log::info().
  protected $isDebug = false;
  # Confidence of the current search path; if it is too low, the search of this path is terminated immediately
  protected $currPathCf;
  // Result array (factors string => accumulated confidence, see get_result())
  protected $result = array();
  // Maximum number of results during processing
  protected $MAX_RESULT = 100;
  // Maximum number of results in the return value
  protected $MAX_RESULT2 = 8;
  // Maximum recursion depth
  protected $MAX_DEEP = 16;
  // Sandhi rule table used by split().
  // For each row: "c" is the surface form searched for inside the word and
  // "len" its length in characters; on a match the left part gets "a" appended
  // and the right part gets "b" prepended; "cf" multiplies the confidence of
  // the looked-up part. "adj_len" and "advance" are carried along but do not
  // influence the result in split() as currently written.
  // NOTE(review): the rule a + iti -> āti appears twice below (cf 0.9999 and
  // 0.99999) - confirm whether both rows are intended.
  protected $sandhi = [
      ["a" => "", "b" => "", "c" => "", "len" => 0, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "a", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "ā", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ā", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "i", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "o", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "u", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "u", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "u", "c" => "u", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "ī", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "ū", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "i", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "e", "b" => "a", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "i", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "a", "c" => "ya", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "atth", "c" => "atth", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "taṃ", "b" => "n", "c" => "tann", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[o]", "b" => "iva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "o", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "ādi", "c" => "ādi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a[ānaṃ]", "b" => "a", "c" => "ānama", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "ṃ", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "a", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "ī", "b" => "[ṃ]", "c" => "im", "len" => 2, "adj_len" => 0, "advance" => false, "cf" => 0.8],
      ["a" => "ati", "b" => "tabba", "c" => "atabba", "len" => 6, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ati", "b" => "tabba", "c" => "itabba", "len" => 6, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "iti", "b" => "a", "c" => "icca", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "uṃ", "b" => "a", "c" => "uma", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u[ūnaṃ]", "b" => "a", "c" => "ūnama", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ī[īnaṃ]", "b" => "a", "c" => "īnama", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "su", "b" => "a", "c" => "sva", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "ī", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "ū", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "ṃ", "b" => "cāti", "c" => "ñcāti", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "ṃ", "b" => "cet", "c" => "ñcet", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ṃ", "b" => "ev", "c" => "mev", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.99999],
      ["a" => "a", "b" => "a", "c" => "a", "len" => 1, "adj_len" => -1, "advance" => true, "cf" => 0.99],
      ["a" => "ī", "b" => "", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => true, "cf" => 0.9],
  ];
  // Sandhi rule table used by splitSandhi() to peel enclitics off the END of a
  // word. splitSandhi() applies the first row whose "c" equals the word ending
  // and then restarts, so a later row with the same "c" as an earlier one
  // (e.g. ā + iti -> āti after a + iti -> āti) is never reached.
  protected $sandhi2 = [
      ["a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "ṃ", "b" => "hi", "c" => "ñhi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "ena", "b" => "iti", "c" => "enāti", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 1.0],
      ["a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.6],
      ["a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9],
      ["a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "eva", "c" => "uyeva", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ṃ", "b" => "eva", "c" => "ṃyeva", "len" => 5, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ṃ", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "eva", "c" => "veva", "len" => 4, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
      ["a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false, "cf" => 0.9999],
  ];
  /**
   * Create a new splitter instance.
   *
   * Appends MAX_DEEP placeholder pairs ["", 0] to $path and overlays the
   * caller-supplied options on the defaults, key by key.
   *
   * @param array $options option overrides, keyed like $this->options
   */
  public function __construct($options = [])
  {
      // One placeholder slot per possible recursion level.
      $placeholders = array_fill(0, $this->MAX_DEEP, array("", 0));
      $this->path = array_merge($this->path, $placeholders);

      foreach ($options as $name => $setting) {
          $this->options[$name] = $setting;
      }
  }
  /**
   * Split a word at every vowel hiatus (a vowel directly followed by a vowel).
   *
   * The vowels considered are a, e, i, o, u, ā, ī, ū - the same 8 x 8 pair
   * table the previous implementation enumerated by hand. A hyphen is inserted
   * between the two vowels and the word is then cut at every hyphen, so
   * hyphens already present in the input also act as separators.
   *
   * Compared with the former str_replace + str_getcsv version this
   *  - also splits runs of three identical vowels ("aaa" -> a, a, a),
   *  - returns [""] instead of [null] for an empty word,
   *  - no longer applies CSV quote/escape parsing to the word.
   *
   * @param string $word
   * @return array list of word pieces, always at least one element
   */
  public function splitDiphthong($word)
  {
      $word = (string) $word;

      // The look-ahead keeps the second vowel available as the first vowel
      // of the next pair, so overlapping pairs are all separated.
      $hyphenated = preg_replace('/([aeiouāīū])(?=[aeiouāīū])/u', '$1-', $word);
      if ($hyphenated === null) {
          // preg_replace() fails on malformed UTF-8; leave the word unsplit.
          $hyphenated = $word;
      }

      return explode('-', $hyphenated);
  }
  /**
   * Look a word up in the word-part table.
   *
   * Returns the stored weight when the exact word is known. Otherwise,
   * unless the "lookup_declension" option is truthy, every ending in the
   * global $case table that matches the end of the word is replaced by its
   * base ending and the candidate with the highest weight wins.
   *
   * NOTE(review): a truthy "lookup_declension" SKIPS the ending-stripped
   * lookup, which reads like an inverted flag - confirm the intended meaning
   * before changing it; behaviour is kept as it was.
   *
   * @param string $word
   * @return array array(weight, endingLength); array(0, 0) when nothing is found
   */
  public function dict_lookup($word)
  {
      global $case; // case-ending table

      if (strlen($word) <= 1) {
          return array(0, 0);
      }

      // Cached weight query shared by the exact and the stripped-ending probes.
      $weightOf = function ($term) {
          return RedisClusters::remember(
              'palicanon/wordpart/weight/' . $term,
              config('mint.cache.expire'),
              function () use ($term) {
                  return WordPart::where('word', $term)->value('weight');
              }
          );
      };

      $exactWeight = $weightOf($word);
      if ($exactWeight) {
          // found as-is
          $this->log($word . '=' . $exactWeight);
          return array($exactWeight, 0);
      }

      // not found
      if ($this->options["lookup_declension"]) {
          return array(0, 0);
      }

      // Strip each matching ending and collect the resulting base forms.
      $candidates = array();
      $ruleCount = count($case);
      for ($idx = 0; $idx < $ruleCount; $idx++) {
          $endingLen = mb_strlen($case[$idx][1], "UTF-8");
          $tail = mb_substr($word, 0 - $endingLen, null, "UTF-8");
          if ($tail != $case[$idx][1]) {
              continue;
          }
          $stem = mb_substr($word, 0, mb_strlen($word, "UTF-8") - $endingLen, "UTF-8") . $case[$idx][0];
          if ($stem != $word) {
              $candidates[$stem] = $endingLen;
          }
      }

      // Pick the base form with the highest weight.
      $bestWeight = 0;
      $bestLen = 0;
      foreach ($candidates as $stem => $endingLen) {
          $stemWeight = $weightOf($stem);
          if ($stemWeight && $stemWeight > $bestWeight) {
              $bestWeight = $stemWeight;
              $bestLen = $endingLen;
          }
      }

      return array($bestWeight, $bestLen);
  }
  /**
   * Look up a single word in the dictionary vocabulary and return a
   * confidence value.
   *
   * The dict_lookup() result is cached as "weight,endingLength". When the
   * weight is positive the confidence is computed (and cached per word) as
   * 1 / (1 + 640 * x), where x shrinks as the weight and the word length
   * (minus the ending length) grow.
   *
   * @param string $word
   * @param int $adj_len accepted for signature compatibility; not used here
   * @return float|int confidence, or -1 when the word is unknown
   */
  public function isExist($word, $adj_len = 0)
  {
      $this->log("正在查询:{$word}");

      $cachedPart = RedisClusters::remember(
          "turbosplit/part/{$word}",
          config('mint.cache.expire'),
          function () use ($word) {
              return implode(',', $this->dict_lookup($word));
          }
      );

      $fields = explode(',', $cachedPart);
      $word_count = $fields[0];
      if (isset($fields[1])) {
          $case_len = $fields[1];
      } else {
          // Cached value did not have the "weight,length" shape.
          $case_len = 0;
          Log::error('wordPart error value=' . $cachedPart);
      }

      if (!($word_count > 0)) {
          return (-1);
      }

      $this->log("查到:{$word}:{$word_count}个");
      $count = $word_count + 1;

      // Confidence formula
      return RedisClusters::remember(
          "turbosplit/confidence/" . $word,
          config('mint.cache.expire'),
          function () use ($word, $count, $case_len) {
              $stemLen = mb_strlen($word, "UTF-8") - $case_len;
              $lengthExponent = ($stemLen - 0.5) ** 1.2;
              $weightTerm = 1.1 + $count ** 1.18;
              $penalty = (1 / $weightTerm) ** $lengthExponent;
              return round(1 / (1 + 640 * $penalty), 9);
          }
      );
  }
  /**
   * Tell whether the current split run has exceeded its time budget.
   *
   * Always false while $started_at is unset; otherwise true (with a warning
   * logged) once time() - $started_at is greater than the "timeout" option.
   *
   * @return boolean
   */
  private function isTimeOut()
  {
      if (!$this->started_at) {
          return false;
      }

      $elapsed = time() - $this->started_at;
      if ($elapsed <= $this->options['timeout']) {
          return false;
      }

      Log::warning('split timeout');
      return true;
  }
  /**
   * Core recursive splitter: grows the tree under $node with every way of
   * cutting $node['remain'] into a known word plus a remainder.
   *
   * Each accepted cut is appended to $node['children'] as
   * ['word', 'remain', 'cf', 'sum_cf'(, 'children')], where sum_cf is the
   * parent's sum_cf (1 at the root) multiplied by the cut's confidence.
   * Children with a non-empty remainder are then split recursively.
   *
   * Changes against the previous version:
   *  - the leading double-consonant test matches exact pairs only; the old
   *    substring search over "kkggcc..." also matched cross-pair sequences
   *    such as "kg" or "td";
   *  - two no-op `if (...) { //continue; }` blocks and an unused local were
   *    removed (no behaviour change).
   *
   * NOTE(review): the skip of "advance" sandhi rows was already commented out,
   * so $sandhi_advance has no effect - confirm whether it should be restored.
   *
   * @param array $node node to expand (by reference); reads 'remain' and 'sum_cf'
   * @param int $deep current recursion depth; stops at MAX_DEEP
   * @param boolean $express stop scanning sandhi rules at a cut position after the first accepted rule
   * @param int $adj_len forwarded to isExist()
   * @param float $c_threshhold minimum confidence for a cut to be accepted
   * @param float $w_threshhold not used in this method
   * @param boolean $forward true: cut known words off the front; false: off the back
   * @param boolean $sandhi_advance currently without effect (see note)
   * @return void
   */
  private function split(&$node, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
  {
      $strWord = $node["remain"];
      $this->log("spliting word={$strWord} deep={$deep}");
      $accepted = 0; // number of cuts whose confidence passed the threshold

      // currPathCf is the confidence of the current search path
      if ($deep == 0) {
          $this->currPathCf = 1;
      }
      // maximum search depth reached
      if ($deep >= $this->MAX_DEEP) {
          return;
      }

      // The node's own accumulated confidence does not change below.
      $parent_sum_cf = isset($node['sum_cf']) ? $node['sum_cf'] : 1;

      // The whole remainder is a known word.
      $confidence = $this->isExist($strWord, $adj_len);
      if ($confidence > $c_threshhold) {
          $accepted++;
          $node['children'][] = ['word' => $strWord, 'remain' => "", 'cf' => $confidence, "sum_cf" => $parent_sum_cf * $confidence];
          $this->log("直接找到{$strWord}-{$confidence}");
      } else if (mb_strlen($strWord, "UTF-8") < 6) {
          // look the remainder up as an ending, written "[...]"
          $search = "[{$strWord}]";
          $confidence = $this->isExist($search);
          $this->log("查询:{$search}-信心指数{$confidence}");
          if ($confidence > $c_threshhold) {
              $accepted++;
              $node['children'][] = ['word' => $search, 'remain' => "", 'cf' => $confidence, "sum_cf" => $parent_sum_cf * $confidence];
              $this->log("直接找到{$strWord}-{$confidence}");
          }
      }

      // If the remainder starts with a double consonant, drop the first one:
      // no Pali word begins with a double consonant.
      $doubleConsonants = ['kk', 'gg', 'cc', 'jj', 'ṭṭ', 'ḍḍ', 'tt', 'dd', 'pp', 'bb'];
      if (
          mb_strlen($strWord, "UTF-8") > 2
          && in_array(mb_substr($strWord, 0, 2, "UTF-8"), $doubleConsonants, true)
      ) {
          $strWord = mb_substr($strWord, 1, null, "UTF-8");
      }

      $len = mb_strlen($strWord, "UTF-8");
      if ($len > 2) {
          if ($forward) {
              # forward cut: longest left part first
              $this->log("正向切");
              for ($i = $len; $i > 1; $i--) {
                  if ($this->isTimeOut()) {
                      Log::warning('line ' . __LINE__);
                      return;
                  }
                  // apply the sandhi rules at this cut position
                  foreach ($this->sandhi as $row) {
                      if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") != $row["c"]) {
                          continue;
                      }
                      $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
                      $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
                      $confidence = $this->isExist($str1, $adj_len) * $row["cf"];
                      if (!($confidence > $c_threshhold)) {
                          continue;
                      }
                      // confidence above the threshold: record the cut
                      $accepted++;
                      $sum_cf = $parent_sum_cf * $confidence;
                      if ($sum_cf > $c_threshhold) {
                          $node['children'][] = [
                              'word' => $str1,
                              'remain' => $str2,
                              'cf' => $confidence,
                              'sum_cf' => $sum_cf,
                              'children' => []
                          ];
                      }
                      $this->log("插入结构数组:{$str1} 剩余{$str2} 应用:{$row["a"]}-{$row["b"]}-{$row["c"]}");
                      if ($express) {
                          break;
                      }
                  }
              }
          } else {
              # reverse cut: the known word is taken from the back
              for ($i = 1; $i < $len - 1; $i++) {
                  foreach ($this->sandhi as $row) {
                      if ($this->isTimeOut()) {
                          Log::warning('line ' . __LINE__);
                          return;
                      }
                      if (mb_substr($strWord, $i, $row["len"], "UTF-8") != $row["c"]) {
                          continue;
                      }
                      $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
                      $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
                      $confidence = $this->isExist($str2, $adj_len) * $row["cf"];
                      if (!($confidence > $c_threshhold)) {
                          continue;
                      }
                      $accepted++;
                      $sum_cf = $parent_sum_cf * $confidence;
                      if ($sum_cf > $c_threshhold) {
                          $node['children'][] = [
                              'word' => $str2,
                              'remain' => $str1,
                              'cf' => $confidence,
                              'sum_cf' => $sum_cf,
                              'children' => [],
                          ];
                      }
                      $this->log("将此次结果插入结果数组:剩余={$str2}");
                      if ($express) {
                          break;
                      }
                  }
              }
          }
      }

      $this->log("结果数组个数:" . $accepted);

      // Recurse into every child that still has an unsplit remainder.
      foreach (($node['children'] ?? []) as $key => $child) {
          if ($this->isTimeOut()) {
              Log::warning('line ' . __LINE__);
              return;
          }
          if (isset($child) && !empty($child['remain'])) {
              $this->split($node['children'][$key], ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
          }
      }
  }
  /**
   * Walk the split tree depth-first and record every complete split.
   *
   * A leaf whose 'remain' is empty contributes one entry to $this->result:
   * the words on the path from the root joined with "+" (outer "+" trimmed)
   * mapped to the leaf's 'sum_cf'. Leaves with an unsplit remainder are
   * ignored.
   *
   * @param array $node current tree node
   * @param array $path words from the root down to the parent (by reference;
   *                    restored to its incoming content on return)
   * @return void
   */
  private function get_result($node, &$path)
  {
      $path[] = $node['word'];

      $children = $node['children'] ?? [];
      if (count($children) > 0) {
          foreach ($children as $child) {
              $this->get_result($child, $path);
          }
      } elseif (empty($node['remain'])) {
          $joined = trim(implode("+", $path), '+');
          $this->result[$joined] = $node['sum_cf'];
      }

      array_pop($path);
  }
  /**
   * Reverse the order of the "+"-separated parts of a factors string.
   *
   * @param string $word e.g. "a+b+c"
   * @return string e.g. "c+b+a"; a string without "+" comes back unchanged
   */
  public function word_reverse($word)
  {
      // explode() always yields at least one element, so no empty-list case exists.
      return implode("+", array_reverse(explode("+", $word)));
  }
  /**
   * Post-process a split: try to break long parts down further.
   *
   * Each "+"-separated part (with any trailing "(...)" debug text removed)
   * longer than 4 characters is resolved in this order:
   *  1. kept as is when UserDict has a non-empty meaning for it in a
   *     language other than 'my';
   *  2. replaced by the factors stored for it under the dictionary id
   *     referred to as the Pali-Myanmar dictionary in the original comments;
   *  3. replaced by its '_SYS_REGULAR_' factors, the first of which is in
   *     turn replaced by its Pali-Myanmar factors when available;
   *  4. kept as is.
   *
   * Fix: step 3 used to read exactly two regular factors ($rglPart[1]),
   * which raised an undefined-index notice (and appended an empty part) for
   * a single factor and dropped any factor after the second. All remaining
   * factors are now appended.
   *
   * @param string $word factors string such as "a+b+c"
   * @return string refined factors string
   */
  public function split2($word)
  {
      $paliMyanmarDictId = '61f23efb-b526-4a8e-999e-076965034e60';

      // Factors of $term where $column equals $value, or null without such a row.
      $factorsWhere = function ($term, $column, $value) {
          if (!UserDict::where('word', $term)->where($column, $value)->exists()) {
              return null;
          }
          return explode("+", (string) UserDict::where('word', $term)->where($column, $value)->value('factors'));
      };

      $refined = array();
      foreach (explode("+", $word) as $piece) {
          // drop debug information in parentheses
          $bare = strstr($piece, "(", true);
          if ($bare == false) {
              $bare = $piece;
          }

          if (mb_strlen($bare, "UTF-8") <= 4) {
              $refined[] = $bare;
              continue;
          }

          # 1. does the part already have a meaning?
          if (UserDict::where('word', $bare)->where('mean', '<>', '')->where('language', '<>', 'my')->exists()) {
              $refined[] = $bare;
              continue;
          }

          # 2. substitute the Pali-Myanmar split
          $pmParts = $factorsWhere($bare, 'dict_id', $paliMyanmarDictId);
          if ($pmParts !== null) {
              foreach ($pmParts as $pm) {
                  $refined[] = $pm;
              }
              continue;
          }

          # 3. regular inflection
          $regularParts = $factorsWhere($bare, 'source', '_SYS_REGULAR_');
          if ($regularParts === null) {
              # 4. nothing found, keep the part
              $refined[] = $bare;
              continue;
          }

          // explode() guarantees at least one element.
          $stem = array_shift($regularParts);
          $stemParts = $factorsWhere($stem, 'dict_id', $paliMyanmarDictId);
          if ($stemParts !== null) {
              foreach ($stemParts as $pm) {
                  $refined[] = $pm;
              }
          } else {
              $refined[] = $stem;
          }
          // Whatever follows the stem (normally exactly one ending).
          foreach ($regularParts as $rest) {
              $refined[] = $rest;
          }
      }

      return implode("+", $refined);
  }
  /**
   * Pre-process sandhi: peel enclitics off the end of a word.
   *
   * Repeatedly takes the first $sandhi2 rule whose "c" equals the current
   * word ending, replaces that ending with the rule's "a" and records the
   * rule's "b" as a separate piece, until no rule matches any more.
   *
   * @param string $word
   * @return string the pieces in word order joined by "-"; the word itself
   *                when no rule applies
   */
  public function splitSandhi($word)
  {
      $stem = $word;
      $suffixes = ""; // peeled pieces, each followed by "-"

      do {
          $matched = false;
          foreach ($this->sandhi2 as $rule) {
              $cut = $rule["len"];
              if (mb_substr($stem, 0 - $cut, null, "UTF-8") != $rule["c"]) {
                  continue;
              }
              $keep = mb_strlen($stem, "UTF-8") - $cut;
              $stem = mb_substr($stem, 0, $keep, "UTF-8") . $rule["a"];
              $suffixes = $rule["b"] . "-" . $suffixes;
              $matched = true;
              break;
          }
      } while ($matched);

      // The assembled string always ends in "-"; cut that last character.
      return mb_substr($stem . "-" . $suffixes, 0, -1, "UTF-8");
  }
  /**
   * Split a word into its factors and guess type/grammar/base for it.
   *
   * Steps: peel enclitics (splitSandhi), cut at vowel hiatus
   * (splitDiphthong), then for every piece of at least 5 characters run the
   * recursive splitter, retrying with other settings when too few splits
   * were found. Up to MAX_RESULT2 of the best splits per piece are emitted;
   * for the best one the base forms are guessed via CaseMan, and each emitted
   * split is refined once or twice through split2().
   *
   * Fixes against the previous version:
   *  - the retry passes built a new tree but never harvested it, so their
   *    results were lost; they are now collected, and the reverse pass has
   *    its factors put back into word order;
   *  - at most MAX_RESULT2 splits are emitted (it used to be one more);
   *  - inner loops no longer reuse the outer loop's $value variable.
   *
   * @param string $word word to split
   * @return array list of rows with keys word, type, grammar, parent,
   *               factors, confidence
   */
  public function splitA($word)
  {
      $this->started_at = time();
      $caseman = new CaseMan();
      $output = array();

      // pre-process sandhi
      $word1 = $this->splitSandhi($word);
      # handle vowel hiatus
      $this->log("处理双元音");
      $arrword = $this->splitDiphthong($word1);
      if (count($arrword) > 1) {
          array_push($output, [
              'word' => $word,
              'type' => '.un.',
              'grammar' => '',
              'parent' => '',
              'factors' => implode("+", $arrword),
              'confidence' => 0.9999
          ]);
      }

      foreach ($arrword as $oneword) {
          if (mb_strlen($oneword) < 5) {
              continue;
          }
          $this->result = array(); // reset the container the recursion fills

          // NOTE(review): per split()'s signature the three numbers below bind
          // to $adj_len, $c_threshhold, $w_threshhold in that order - confirm
          // that e.g. 0.8 / 0.9 / 0 is the intended assignment.
          if (mb_strlen($oneword) > 35) {
              // long word: express mode, forward
              $this->collectSplits($oneword, true, 0.8, 0.9, 0, true, false);
              $min_result = 1;
          } else {
              $this->collectSplits($oneword, false, 0.8, 0.9, 0, true, false);
              $min_result = 2;
          }
          $this->log("正向切分结束 结果数量" . count($this->result));

          if (count($this->result) < $min_result) {
              // too few usable results: forward again with other thresholds
              $this->collectSplits($oneword, false, 0.2, 0.8, 0, true, true);
              $this->log("有效结果过少 再次正切" . count($this->result));
              if (count($this->result) < 2) {
                  // still too few: cut from the back
                  $this->collectSplits($oneword, false, 0.2, 0.8, 0, false, true);
                  $this->log("有效结果过少 再次反切:结果数量" . count($this->result));
              }
          }
          $this->log("{$oneword}:" . count($this->result));

          if (count($this->result) > 0) {
              arsort($this->result); // highest confidence first
              $iCount = 0;
              foreach ($this->result as $row => $confidence) {
                  $factors = $row;
                  if (strpos($row, ']+') !== FALSE) {
                      $type = '.un.';
                      $factors = \str_replace(['+[ṃ]+', '[ṃ]+'], 'ṃ+', $row);
                  } else {
                      $type = '.cp.';
                  }
                  $newword = ['word' => $oneword, 'type' => $type, 'grammar' => '', 'parent' => '', 'factors' => $factors, 'confidence' => $confidence];
                  array_push($output, $newword);

                  if ($iCount == 0) {
                      // best split only: find the base
                      $wordWithType = ['word' => $oneword, 'type' => '', 'grammar' => '', 'parent' => '', 'factors' => $factors, 'confidence' => $confidence];
                      $this->log("查找base");
                      $factorList = explode('+', $row);
                      $endOfFactor = end($factorList);
                      if (strpos($endOfFactor, "[") !== FALSE) {
                          // last factor is a bracketed ending: use the one before it
                          if (count($factorList) >= 2) {
                              $endOfFactor = $factorList[count($factorList) - 2];
                          }
                      }
                      $this->log("结尾词:" . $endOfFactor);
                      // guess the base of the whole piece
                      $parents = $caseman->WordToBase($oneword, 1, false);
                      // find the base of the final factor
                      $end_parents = $caseman->WordToBase($endOfFactor);
                      if (count($parents) > 0) {
                          foreach ($parents as $base => $case) {
                              if (count($end_parents) > 0) {
                                  foreach ($end_parents as $base2 => $case2) {
                                      if (\mb_substr($base2, -2) !== \mb_substr($base, -2)) {
                                          continue;
                                      }
                                      $this->log("{$base} ok");
                                      foreach ($case as $info) {
                                          foreach ($case2 as $info2) {
                                              // check that the grammar information agrees
                                              if (
                                                  $info['type'] == $info2['type'] &&
                                                  substr($info['grammar'], 0, 3) === substr($info2['grammar'], 0, 3) &&
                                                  $info['confidence'] > 0.5
                                              ) {
                                                  $wordWithType['type'] = $info['type'];
                                                  $wordWithType['grammar'] = $info['grammar'];
                                                  $wordWithType['factors'] = $info['factors'];
                                                  $wordWithType['parent'] = $base;
                                                  $wordWithType['confidence'] = $info2['confidence'];
                                                  $this->log("word:{$wordWithType['word']} ; type:{$wordWithType['type']}; grammar:{$wordWithType['grammar']};parent:{$wordWithType['parent']}");
                                                  array_push($output, $wordWithType);
                                              }
                                          }
                                      }
                                  }
                              } else {
                                  foreach ($case as $info) {
                                      $wordWithType['type'] = $info['type'];
                                      $wordWithType['grammar'] = $info['grammar'];
                                      $wordWithType['factors'] = $info['factors'];
                                      $wordWithType['parent'] = $base;
                                      $wordWithType['confidence'] = 0.1;
                                      array_push($output, $wordWithType);
                                  }
                              }
                          }
                      }
                  }

                  // post-processing: split long parts without a meaning further
                  $this->log("后处理 进一步切分没有意思的长词");
                  $new = $this->split2($row);
                  if ($new !== $row) {
                      $newword['factors'] = $new;
                      array_push($output, $newword);
                      # one more round
                      $new2 = $this->split2($new);
                      if ($new2 !== $new) {
                          $newword['factors'] = $new2;
                          array_push($output, $newword);
                      }
                  }

                  $iCount++;
                  if ($iCount >= $this->MAX_RESULT2) {
                      break;
                  }
              }
          } else {
              $this->log("{$oneword} 切分失败");
              $this->log("猜测可能的格位");
              // no split found: guess the base of the piece
              $wordWithType = ['word' => $oneword, 'type' => '', 'grammar' => '', 'parent' => '', 'factors' => '', 'confidence' => 0];
              $parents = $caseman->WordToBase($oneword, 1, false);
              foreach ($parents as $base => $case) {
                  foreach ($case as $info) {
                      $wordWithType['type'] = $info['type'];
                      $wordWithType['grammar'] = $info['grammar'];
                      $wordWithType['factors'] = $info['factors'];
                      $wordWithType['parent'] = $base;
                      $wordWithType['confidence'] = $info['confidence'];
                      $this->log("word:{$wordWithType['word']} ; type:{$wordWithType['type']}; grammar:{$wordWithType['grammar']};parent:{$wordWithType['parent']}");
                      array_push($output, $wordWithType);
                  }
              }
          }
      }
      return $output;
  }

  /**
   * Run one split pass over $oneword and merge its complete splits into
   * $this->result, keeping the higher confidence when a split is already
   * present. A reverse pass produces its factors back to front, so they are
   * put into word order before merging.
   *
   * The parameters after $oneword are handed to split() in this order.
   */
  private function collectSplits($oneword, $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance)
  {
      $node = ['word' => "", 'remain' => $oneword, 'children' => []];
      $this->split($node, 0, $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
      $this->log($node);

      // get_result() writes into $this->result, so harvest into an empty
      // container and merge afterwards.
      $merged = $this->result;
      $this->result = array();
      $path = [];
      $this->get_result($node, $path);

      foreach ($this->result as $factors => $cf) {
          $key = $forward ? (string) $factors : $this->word_reverse((string) $factors);
          if (!isset($merged[$key]) || $cf > $merged[$key]) {
              $merged[$key] = $cf;
          }
      }
      $this->result = $merged;
  }
  /**
   * Placeholder: accepts an optional parameter and does nothing.
   */
  public function setting($param = null) {}
  /**
   * Return the result container as last filled by the splitter.
   */
  public function getResult()
  {
      return $this->result;
  }
  /**
   * Switch debug logging on or off (see log()).
   */
  public function debug($debug)
  {
      $this->isDebug = $debug;
  }
  /**
   * Forward a message to Log::info(), but only while debug mode is on.
   */
  private function log($message)
  {
      if (!$this->isDebug) {
          return;
      }
      Log::info($message);
  }
  /**
   * Append a single-entry array [$word => $cf] to the result container.
   *
   * NOTE(review): not called anywhere in this class, and the appended shape
   * differs from the "factors => confidence" entries get_result() writes -
   * confirm whether this helper is still needed.
   */
  private function pushResult($word, $cf)
  {
      $this->result[] = array($word => $cf);
  }
  763. }