split.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. <?php
  2. //强力拆分复合词
  3. /*
  4. function: split compound word
  5. step 1 : split at diphthong . ~aa~ -> ~a-a~
  6. 第一步:先切开双元音
  7. step 2 : every part use sandhi rule
  8. 第二步:用$sandhi的方法切分(套用连音规则)
  9. algorithm:
  10. 算法:
  11. f(word){
  12. 1. cut one letter from the end of word by sandhi rule in array($sandhi)
  13. 1. 从单词尾部切去一个字母
  14. 2. lookup first part .
  15. 2. 查询剩余部分
  16. if successful
  17. 如果有结果
  18. - get the confidence index of first part
  19. 获取该部分的信心指数
  20. - push the first part and its confidence onto the stack
  21. 把第一部分的拼写及其信心指数压入堆栈
  22. - process the remaining part at same way
  23. 用同样的方法处理剩余部分
  24. - f(stack.first element)
  25. else
  26. add sandhi rule
  27. goto 1
  28. }
  29. this is a recursive algorithm; maximum depth = 16
  30. 此为递归算法,深度=16
  31. */
  32. require_once '../public/casesuf.inc';
  33. require_once '../studio/dict_find_un.inc';
  34. require_once '../studio/sandhi.php';
  35. require_once "../path.php";
  36. require_once "../public/_pdo.php";
  // Validate the request.
  // A POSTed "word" field carries one word per line (batch splitting is supported).
  // Without it, a small debug form is rendered and the script stops.
  if (isset($_POST["word"]) && is_string($_POST["word"])) {
      $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
      if ($input_word == "") {
          echo "Empty";
          exit;
      }
      // Browsers submit textarea content with CRLF line breaks, so split on any
      // newline flavour and trim each entry; otherwise every word except the
      // last keeps a trailing "\r" and can never be found in the dictionary.
      // Blank lines are skipped.
      $arrWords = array();
      foreach (preg_split('/\r\n|\r|\n/', $input_word) as $line) {
          $line = trim($line);
          if ($line !== "") {
              $arrWords[] = $line;
          }
      }
  } else {
  ?>
  <!--debug only-->
  <form action="split.php" method="post">
  Words: <textarea type="text" name="word"></textarea>
  <input name="debug" type="hidden" />批量查询,单词之间用换行分隔。 input word. between two words insert 'enter'
  <div>
  <input type="checkbox" name = "express" checked /> 快速搜索(遇到第一个连音规则成功就返回) return when get first result
  </div>
  <input type="submit">
  </form>
  <?php
      return;
  }

  // Express mode is on only when the checkbox was submitted with its default value "on".
  $_express = isset($_POST["express"]) && $_POST["express"] === "on";
  // Open the word-part database: a SQLite file whose path is given by _FILE_DB_PART_.
  global $dbh;
  $dns = "sqlite:" . _FILE_DB_PART_;
  $dbh = new PDO($dns, "", "", array(PDO::ATTR_PERSISTENT => true));
  $dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);

  // State shared with the recursive splitter and the lookup helpers.
  global $path;
  global $confidence;
  global $result;
  global $part;

  // Lookup cache filled by isExsit().
  $part = array();

  // Append 17 empty path slots; mySplit2() writes array(spelling, confidence)
  // into the slot for the current recursion depth.
  for ($slot = 0; $slot < 17; $slot++) {
      $path[] = array("", 0);
  }
  unset($slot);
  global $sandhi;
  // Sandhi (sound-junction) rule table.
  // Each tuple is (a, b, c, len): when the text ending at a cut position equals
  // "c" (which is "len" characters long), the splitter tries "a" as the tail of
  // the left part and "b" as the head of the right part.
  // Every rule is stored with adj_len = 0 and advance = false.
  $sandhiRules = array(
      array("", "", "", 0),
      array("a", "a", "ā", 1),
      array("ā", "ā", "ā", 1),
      array("a", "ā", "ā", 1),
      array("ā", "a", "ā", 1),
      array("a", "e", "e", 1),
      array("a", "i", "i", 1),
      array("a", "o", "o", 1),
      array("a", "u", "o", 1),
      array("u", "a", "o", 1),
      array("u", "u", "ū", 1),
      array("a", "u", "u", 1),
      array("a", "ī", "ī", 1),
      array("a", "ū", "ū", 1),
      array("a", "i", "e", 1),
      array("i", "i", "ī", 1),
      array("i", "e", "e", 1),
      array("i", "a", "ya", 2),
      array("a", "atth", "atth", 4),
      array("taṃ", "n", "tann", 4),
      array("[ṃ]", "api", "mpi", 3),
      array("[ṃ]", "eva", "meva", 4),
      array("[o]", "iva", "ova", 3),
      array("a", "ādi", "ādi", 3),
      array("a[ānaṃ]", "a", "ānama", 5),
      array("a", "iti", "āti", 3),
      array("[ṃ]", "ca", "ñca", 3),
      array("[ṃ]", "iti", "nti", 3),
      array("[ṃ]", "a", "ma", 2),
      array("[ṃ]", "ā", "mā", 2),
      array("[ṃ]", "u", "mu", 2),
      array("[ṃ]", "h", "ñh", 2),
      array("ā", "[ṃ]", "am", 2),
      array("ī", "[ṃ]", "im", 2),
      array("ati", "tabba", "atabba", 6),
      array("ati", "tabba", "itabba", 6),
      array("iti", "a", "icca", 4),
  );
  foreach ($sandhiRules as $sandhiRule) {
      list($ruleA, $ruleB, $ruleC, $ruleLen) = $sandhiRule;
      $sandhi[] = array(
          "a" => $ruleA,
          "b" => $ruleB,
          "c" => $ruleC,
          "len" => $ruleLen,
          "adj_len" => 0,
          "advance" => false,
      );
  }
  unset($sandhiRules, $sandhiRule, $ruleA, $ruleB, $ruleC, $ruleLen);
  137. /*
  138. Other sandhi rules. They can be enabled, but the program will run more slowly.
  139. $sandhi[]=array("a"=>"u[ūnaṃ]","b"=>"a","c"=>"ūnama","len"=>5,"adj_len"=>0,"advance"=>false);
  140. $sandhi[]=array("a"=>"ī[īnaṃ]","b"=>"a","c"=>"īnama","len"=>5,"adj_len"=>0,"advance"=>false);
  141. $sandhi[]=array("a"=>"ā","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
  142. $sandhi[]=array("a"=>"a","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
  143. $sandhi[]=array("a"=>"e","b"=>"iti","c"=>"eti","len"=>3,"adj_len"=>0);
  144. $sandhi[]=array("a"=>"ī","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
  145. $sandhi[]=array("a"=>"i","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
  146. $sandhi[]=array("a"=>"o","b"=>"iti","c"=>"oti","len"=>3,"adj_len"=>0);
  147. $sandhi[]=array("a"=>"ū","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
  148. $sandhi[]=array("a"=>"u","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
  149. $sandhi[]=array("a"=>"ṃ","b"=>"iti","c"=>"nti","len"=>3,"adj_len"=>0);
  150. $sandhi[]=array("a"=>"ṃ","b"=>"ca","c"=>"ñca","len"=>3,"adj_len"=>0);
  151. $sandhi[]=array("a"=>"a","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
  152. $sandhi[]=array("a"=>"ā","b"=>"eva","c"=>"āyeva","len"=>5,"adj_len"=>0);
  153. $sandhi[]=array("a"=>"e","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
  154. $sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yeva","len"=>4,"adj_len"=>0);
  155. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyeva","len"=>5,"adj_len"=>0);
  156. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyeva","len"=>5,"adj_len"=>0);
  157. $sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ova","len"=>3,"adj_len"=>0);
  158. $sandhi[]=array("a"=>"u","b"=>"eva","c"=>"veva","len"=>3,"adj_len"=>0);
  159. $sandhi[]=array("a"=>"a","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
  160. $sandhi[]=array("a"=>"e","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
  161. $sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
  162. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
  163. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyevā","len"=>4,"adj_len"=>0);
  164. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyevā","len"=>4,"adj_len"=>0);
  165. $sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ovā","len"=>4,"adj_len"=>0);
  166. $sandhi[]=array("a"=>"ā","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
  167. $sandhi[]=array("a"=>"a","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
  168. $sandhi[]=array("a"=>"e","b"=>"api","c"=>"epi","len"=>3,"adj_len"=>0);
  169. $sandhi[]=array("a"=>"ī","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
  170. $sandhi[]=array("a"=>"i","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
  171. $sandhi[]=array("a"=>"o","b"=>"api","c"=>"opi","len"=>3,"adj_len"=>0);
  172. $sandhi[]=array("a"=>"ū","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
  173. $sandhi[]=array("a"=>"u","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
  174. $sandhi[]=array("a"=>"u","b"=>"api","c"=>"upi","len"=>3,"adj_len"=>0);
  175. $sandhi[]=array("a"=>"ṃ","b"=>"api","c"=>"mpi","len"=>3,"adj_len"=>0);
  176. */
  177. //$sandhi[]=array("a"=>"a","b"=>"a","c"=>"a","len"=>1,"adj_len"=>-1,"advance"=>true);
  178. //$sandhi[]=array("a"=>"ī","b"=>"","c"=>"i","len"=>1,"adj_len"=>0,"advance"=>true);
  // Diphthong table: every ordered pair of the eight vowels ("aa", "ae", ...)
  // and the same pair with a hyphen between the two vowels ("a-a", "a-e", ...).
  $search = array();
  $replace = array();
  $vowels = array('a', 'e', 'i', 'o', 'u', 'ā', 'ī', 'ū');
  foreach ($vowels as $firstVowel) {
      foreach ($vowels as $secondVowel) {
          $search[] = $firstVowel . $secondVowel;
          $replace[] = $firstVowel . '-' . $secondVowel;
      }
  }
  unset($vowels, $firstVowel, $secondVowel);
  // Main: split every submitted word and emit the results as JSON.
  // When the "debug" field is posted, human-readable HTML is printed as well;
  // all user-derived text in that HTML is escaped.
  $allword = array();
  $isDebug = isset($_POST["debug"]);
  foreach ($arrWords as $oneword) {
      // Pre-processing, step 1: split at diphthongs, ~aa~ -> ~a-a~
      $word = str_replace($search, $replace, $oneword);
      if ($isDebug) {
          echo "Look up:" . htmlspecialchars($word, ENT_QUOTES, 'UTF-8') . "<br>";
      }

      // Process each hyphen-separated piece on its own.
      $arrword = explode("-", $word);
      $t1 = microtime_float();
      $output = array();
      foreach ($arrword as $piece) {
          // $result is the global output container filled by the recursion.
          $result = array();
          mySplit2($piece, 0, $_express);
          arsort($result); // highest confidence first

          // Keep the five best candidates for the JSON answer.
          $wordlist = array();
          foreach (array_slice($result, 0, 5, true) as $row => $value) {
              $wordlist[] = array("word" => $row, "confidence" => $value);
          }
          $output[] = $wordlist;

          if ($isDebug) {
              echo "<h2>" . htmlspecialchars($piece, ENT_QUOTES, 'UTF-8') . "</h2>";
              echo "<h4>" . count($result) . "</h4>";
              // Show at most the first eleven candidates.
              foreach (array_slice($result, 0, 11, true) as $row => $value) {
                  echo htmlspecialchars($row, ENT_QUOTES, 'UTF-8') . "-[" . $value . "]<br>";
              }
          }
          /*
          Post-processing (not implemented)
          -ssāpi=-[ssa]-api
          */
      }
      $t2 = microtime_float();

      // "time" is the lookup counter incremented by isExsit(); it is never reset,
      // so it accumulates over all words of the request.
      $one_split = array(
          "data" => $output,
          "time" => $auto_split_times,
          "second" => $t2 - $t1,
      );
      $allword[] = $one_split;
      if ($isDebug) {
          echo "<div>";
          echo "<br>查询【{$auto_split_times}】次";
          echo "time:" . ($t2 - $t1);
          echo "</div>";
      }
  }
  if ($isDebug) {
      echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
      echo htmlspecialchars(print_r($allword, true), ENT_QUOTES, 'UTF-8');
      echo "</pre>";
  }
  echo json_encode($allword, JSON_UNESCAPED_UNICODE);
  /*
   * Joins two values with a "+" sign.
   * Intended for concatenating array elements into one string.
   */
  function myfunction($v1, $v2)
  {
      $joined = $v1;
      $joined .= "+";
      $joined .= $v2;
      return $joined;
  }
  /*
   * Returns the current Unix timestamp in seconds, with microseconds, as a float.
   * Uses microtime(true) instead of parsing the "usec sec" string by hand.
   */
  function microtime_float()
  {
      return microtime(true);
  }
  /*
   * Fetch the weight stored for a word in the "part" table of the global
   * PDO connection $dbh.
   *
   * $word  spelling to look up
   * Returns the "weight" column of the first matching row, or 0 when there is
   * no such row or the query could not be prepared/executed.
   */
  function dict_lookup($word){
      global $dbh;
      $query = "select weight from part where \"word\" = ? ";
      $stmt = $dbh->prepare($query);
      // In warning/silent error mode prepare() returns false on failure;
      // treat that (and a failed execute) as "not found" instead of a fatal error.
      if ($stmt === false || !$stmt->execute(array($word))) {
          return 0;
      }
      $row = $stmt->fetch(PDO::FETCH_NUM);
      return $row ? $row[0] : 0;
  }
  /*
   * Look up a single word in the dictionary vocabulary and return its
   * confidence index.
   *
   * $word     spelling to look up
   * $adj_len  correction added to the word length before the confidence is computed
   *
   * Returns the confidence (rounded to 9 decimals) when the word has a positive
   * weight, otherwise -1.
   *
   * Side effects: increments the global lookup counter $auto_split_times, caches
   * weights in the global $part and confidences in the global $confidence.
   * In debug mode the (HTML-escaped) word is echoed.
   */
  function isExsit($word,$adj_len=0){
      global $auto_split_times;
      global $part;
      global $confidence;

      $auto_split_times++;
      if (isset($_POST["debug"])) {
          echo "<div>正在查询:" . htmlspecialchars($word, ENT_QUOTES, 'UTF-8') . "</div>";
      }

      // Weight comes from the lookup cache when present, otherwise from the
      // database (and is then added to the cache).
      $key = "{$word}";
      if (!isset($part[$key])) {
          $part[$key] = dict_lookup($word);
      }
      $weight = $part[$key];
      if (!($weight > 0)) {
          return(-1);
      }
      $count = $weight + 1;

      // The confidence depends on $adj_len, so a non-zero correction gets its own
      // cache entry; the plain word key is kept for the default correction of 0.
      $cacheKey = ($adj_len == 0) ? $key : $key . "|" . $adj_len;
      if (isset($confidence[$cacheKey])) {
          return($confidence[$cacheKey]);
      }

      $len = mb_strlen($word,"UTF-8") + $adj_len;
      $len_correct = 1.2;
      $count2 = 1.1 + pow($count, 1.18);
      $conf_num = pow(1/$count2, pow(($len-1), $len_correct));
      $cf = round(1/(1+640*$conf_num), 9);
      $confidence[$cacheKey] = $cf;
      return($cf);
  }
  /*
   * Builds the textual prefix and the confidence product for the parts already
   * stored in the global $path at depths 0 .. $deep-1.
   * Returns array(prefix, product).
   */
  function mySplit2PathPrefix($deep, $debug){
      global $path;
      $word = "";
      $cf = 1.0;
      for ($i = 0; $i < $deep; $i++) {
          $word .= $path[$i][0] . "+";
          if ($debug) {
              $word .= "(" . $path[$i][1] . ")-";
          }
          $cf = $cf * $path[$i][1];
      }
      return array($word, $cf);
  }

  /*
   * Core recursive splitting function.
   *
   * $strWord       word (or remaining tail of a word) to split
   * $deep          current recursion depth; recursion stops at depth 16
   * $express       true = fast search: stop trying further sandhi rules at a cut
   *                position once one rule yields an accepted left part
   * $adj_len       length correction passed to the dictionary lookup
   * $c_threshhold  confidence threshold for accepting a left part / a result
   *
   * Nothing is returned; complete splits are written to the global $result as
   * "spelling => confidence". The parts of the split currently being explored
   * are kept in the global $path, one slot per depth.
   */
  function mySplit2($strWord,$deep,$express=false,$adj_len=0,$c_threshhold=0.8){
      global $path;
      global $result;
      global $sandhi;

      $debug = isset($_POST["debug"]);

      // Maximum search depth reached: record what has been collected and stop.
      if ($deep >= 16) {
          $word = "";
          $cf = 1.0;
          for ($i = 0; $i < $deep; $i++) {
              $word .= $path[$i][0];
              if ($debug) {
                  $word .= "(" . $path[$i][1] . ")-";
              } else {
                  $word .= "-";
              }
              $cf = $cf * $path[$i][1];
          }
          // Penalty that grows with the length of the unresolved remainder.
          $len = pow(mb_strlen($strWord, "UTF-8"), 3);
          $cf += (0 - $len) / ($len + 150);
          $word .= "{$strWord}";
          $result[$word] = $cf;
          return;
      }

      // Candidates: array(left part, remaining right part, confidence[, adj_len]).
      $output = array();

      // The whole word is found directly, either as-is or in its bracketed form.
      $found = isExsit($strWord, $adj_len);
      if ($found >= 0) {
          $output[] = array($strWord, "", $found);
      } else {
          $found = isExsit("[" . $strWord . "]");
          if ($found >= 0) {
              $output[] = array("[" . $strWord . "]", "", $found);
          }
      }

      // If the word starts with a doubled consonant, drop the first one, because
      // no Pali word begins with a double consonant. Only true doubles count;
      // mixed pairs such as "kg" or "td" are left untouched.
      $doubles = array("kk", "gg", "cc", "jj", "ṭṭ", "ḍḍ", "tt", "dd", "pp", "bb");
      if (mb_strlen($strWord, "UTF-8") > 2) {
          $left2 = mb_substr($strWord, 0, 2, "UTF-8");
          if (in_array($left2, $doubles, true)) {
              $strWord = mb_substr($strWord, 1, NULL, "UTF-8");
          }
      }

      // Try every cut position from the end of the word, and at each position
      // every sandhi rule whose combined form "c" ends exactly there.
      $len = mb_strlen($strWord, "UTF-8");
      if ($len > 2) {
          for ($i = $len; $i > 1; $i--) {
              foreach ($sandhi as $row) {
                  $start = $i - $row["len"];
                  if ($start < 0) {
                      // The rule is longer than the text before the cut; a negative
                      // offset would make mb_substr() count from the end of the word.
                      continue;
                  }
                  if (mb_substr($strWord, $start, $row["len"], "UTF-8") === $row["c"]) {
                      $str1 = mb_substr($strWord, 0, $start, "UTF-8") . $row["a"];
                      $str2 = $row["b"] . mb_substr($strWord, $i, NULL, "UTF-8");
                      $found = isExsit($str1, $adj_len);
                      if ($found > $c_threshhold) {
                          $output[] = array($str1, $str2, $found, $row["adj_len"]);
                          if ($express) {
                              break;
                          }
                      }
                  }
              }
          }
      }

      if (count($output) > 0) {
          foreach ($output as $candidate) {
              $path[$deep][0] = $candidate[0];
              $path[$deep][1] = $candidate[2];
              if ($candidate[1] !== "") {
                  // Something is left over: split the remainder the same way.
                  mySplit2($candidate[1], ($deep + 1), $express, $candidate[3], $c_threshhold);
              } else {
                  // The word is fully consumed: record the complete split.
                  list($word, $cf) = mySplit2PathPrefix($deep, $debug);
                  $word .= $candidate[0];
                  if ($debug) {
                      $word .= "({$candidate[2]})";
                  }
                  $cf = $cf + $candidate[2] * 0.1;
                  if ($cf >= $c_threshhold) {
                      $result[$word] = $cf;
                  }
              }
          }
      } else {
          // No candidate at all: record the path with the unresolved remainder,
          // penalised by the remainder's length.
          list($word, $cf) = mySplit2PathPrefix($deep, $debug);
          $len = pow(mb_strlen($strWord, "UTF-8"), 3);
          $cf += (0 - $len) / ($len + 150);
          if ($debug) {
              $word .= $strWord . "(0)";
          } else {
              $word .= $strWord;
          }
          if ($cf >= $c_threshhold) {
              $result[$word] = $cf;
          }
      }
  }
  455. ?>