dict_find4.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. <?php
  2. // Look up words in the reference dictionaries
  3. require_once '../public/casesuf.inc';
  4. require_once 'dict_find_un.inc';
  5. require_once 'sandhi.php';
  6. require_once "../config.php";
  7. require_once "../public/_pdo.php";
  8. require_once '../public/load_lang.php';
  9. if(isset($_POST["word"])){
  10. $input_word=mb_strtolower($_POST["word"],'UTF-8');
  11. if(trim($input_word)==""){
  12. echo "Empty";
  13. return;
  14. }
  15. $arrWords = str_getcsv($input_word,"\n");
  16. }
  17. else{
  18. ?>
  19. <form action="dict_find4.php" method="post">
  20. Words: <textarea type="text" name="word"></textarea>
  21. <input name="debug" />
  22. <input type="submit">
  23. </form>
  24. <?php
  25. return;
  26. }
  27. global $path;
  28. global $confidence;
  29. global $PDO;
  30. global $result;
  31. require_once 'part.php';
  32. $path[]=array("",0);
  33. $path[]=array("",0);
  34. $path[]=array("",0);
  35. $path[]=array("",0);
  36. $path[]=array("",0);
  37. $path[]=array("",0);
  38. $path[]=array("",0);
  39. $path[]=array("",0);
  40. $path[]=array("",0);
  41. $path[]=array("",0);
  42. $path[]=array("",0);
  43. $path[]=array("",0);
  44. $path[]=array("",0);
  45. $path[]=array("",0);
  46. $path[]=array("",0);
  47. $path[]=array("",0);
  48. $path[]=array("",0);
  49. $search = array('aa', 'ae', 'ai', 'ao', 'au', 'aā', 'aī', 'aū', 'ea', 'ee', 'ei', 'eo', 'eu', 'eā', 'eī', 'eū', 'ia', 'ie', 'ii', 'io', 'iu', 'iā', 'iī', 'iū', 'oa', 'oe', 'oi', 'oo', 'ou', 'oā', 'oī', 'oū', 'ua', 'ue', 'ui', 'uo', 'uu', 'uā', 'uī', 'uū', 'āa', 'āe', 'āi', 'āo', 'āu', 'āā', 'āī', 'āū', 'īa', 'īe', 'īi', 'īo', 'īu', 'īā', 'īī', 'īū', 'ūa', 'ūe', 'ūi', 'ūo', 'ūu', 'ūā', 'ūī', 'ūū');
  50. $replace = array('a-a', 'a-e', 'a-i', 'a-o', 'a-u', 'a-ā', 'a-ī', 'a-ū', 'e-a', 'e-e', 'e-i', 'e-o', 'e-u', 'e-ā', 'e-ī', 'e-ū', 'i-a', 'i-e', 'i-i', 'i-o', 'i-u', 'i-ā', 'i-ī', 'i-ū', 'o-a', 'o-e', 'o-i', 'o-o', 'o-u', 'o-ā', 'o-ī', 'o-ū', 'u-a', 'u-e', 'u-i', 'u-o', 'u-u', 'u-ā', 'u-ī', 'u-ū', 'ā-a', 'ā-e', 'ā-i', 'ā-o', 'ā-u', 'ā-ā', 'ā-ī', 'ā-ū', 'ī-a', 'ī-e', 'ī-i', 'ī-o', 'ī-u', 'ī-ā', 'ī-ī', 'ī-ū', 'ū-a', 'ū-e', 'ū-i', 'ū-o', 'ū-u', 'ū-ā', 'ū-ī', 'ū-ū');
  51. foreach($arrWords as $oneword){
  52. //预处理
  53. $word = str_replace($search, $replace, $oneword);
  54. echo "Look up:{$word}<br>";
  55. $arrword = str_getcsv($word,"-");
  56. $t1=microtime_float();
  57. foreach($arrword as $oneword){
  58. $result = array();
  59. if(mb_strlen($oneword,"UTF-8")<30){
  60. mySplit2($oneword,0);
  61. }
  62. else{
  63. mySplit2($oneword,0,true);
  64. }
  65. arsort($result);
  66. echo "<h2>{$oneword}</h2>";
  67. echo "<h4>".count($result)."</h4>";
  68. $iCount=0;
  69. foreach($result as $row=>$value){
  70. if($iCount>10){
  71. break;
  72. }
  73. $iCount++;
  74. $level=$value*90;
  75. if(isset($_POST["debug"])){
  76. echo $row."-[".$value."]<br>";
  77. }
  78. else{
  79. echo "<button onclick=\"add_part_to_word('{$row}')\">Apply</button> ";
  80. echo $row."-[".round($level)."] ";
  81. echo "<button onclick=\"add_part_to_input('{$row}')\">Lookup</button>";
  82. echo "<br>";
  83. }
  84. }
  85. /*
  86. 后处理
  87. -ssāpi=-[ssa]-api
  88. */
  89. echo "-";
  90. }
  91. echo "<br>查询【{$auto_split_times}】次";
  92. $t2 = microtime_float();
  93. echo "time:".($t2-$t1);
  94. }
  95. function myfunction($v1,$v2)
  96. {
  97. return $v1 . "+" . $v2;
  98. }
  99. function microtime_float()
  100. {
  101. list($usec, $sec) = explode(" ", microtime());
  102. return ((float)$usec + (float)$sec);
  103. }
  104. /*
  105. 查找某个单词是否在现有词典出现
  106. 返回信心指数
  107. */
  108. function isExsit($word,$adj_len=0){
  109. global $PDO;
  110. global $auto_split_times;
  111. global $result;
  112. global $part;
  113. global $confidence;
  114. $auto_split_times++;
  115. //echo "<div>正在查询:{$word}</div>";
  116. $isFound=false;
  117. if(isset($part["{$word}"]))
  118. {
  119. $isFound=true;
  120. $count=$part["{$word}"]+1;
  121. }
  122. if($isFound)
  123. {
  124. if(isset($confidence["{$word}"])){
  125. $cf=$confidence["{$word}"];
  126. }
  127. else{
  128. $len=mb_strlen($word,"UTF-8")+$adj_len;
  129. $len_correct=1.2;
  130. $count2=1.1+pow($count,1.18);
  131. $conf_num=pow(1/$count2,pow(($len-1),$len_correct));
  132. $cf=round(1/(1+640*$conf_num),9);
  133. $confidence["{$word}"]=$cf;
  134. }
  135. return($cf);
  136. }
  137. else{
  138. return(-1);
  139. }
  140. }
  141. function mySplit2($strWord,$deep,$turbo=false,$adj_len=0){
  142. global $path;
  143. global $result;
  144. $output = array();
  145. $min_part = 2;
  146. if($deep>=16){
  147. $word = "";
  148. $cf=1.0;
  149. for($i=0;$i<$deep;$i++){
  150. $word .= $path[$i][0];
  151. if(isset($_POST["debug"])){
  152. $word .="(".$path[$i][1].")-";
  153. }
  154. else{
  155. $word .= "-";
  156. }
  157. $cf=$cf*$path[$i][1];
  158. }
  159. $len=pow(mb_strlen($strWord,"UTF-8"),3);
  160. $cf+=(0-$len)/($len+150);
  161. $word .= "{$strWord}(0)";
  162. $result[$word]=$cf;
  163. return;
  164. }
  165. //直接找到
  166. $confidence=isExsit($strWord,$adj_len);
  167. if($confidence>=0){
  168. $output[] = array($strWord,"",$confidence);
  169. }
  170. else{
  171. $confidence=isExsit("[".$strWord."]");
  172. if($confidence>=0){
  173. $output[] = array("[".$strWord."]","",$confidence);
  174. }
  175. }
  176. //如果开头有双辅音,去掉第一个辅音。因为巴利语中没有以双辅音开头的单词。
  177. $doubleword="kkggccjjṭṭḍḍttddppbb";
  178. if(mb_strlen($strWord,"UTF-8")>2){
  179. $left2=mb_substr($strWord,0,2,"UTF-8");
  180. if(mb_strpos($doubleword,$left2,0,"UTF-8")!==FALSE){
  181. $strWord=mb_substr($strWord,1,NULL,"UTF-8");
  182. }
  183. }
  184. $sandhi[]=array("a"=>"","b"=>"","c"=>"","len"=>0,"adj_len"=>0,"advance"=>false);
  185. $sandhi[]=array("a"=>"a","b"=>"a","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
  186. $sandhi[]=array("a"=>"ā","b"=>"ā","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
  187. $sandhi[]=array("a"=>"a","b"=>"ā","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
  188. $sandhi[]=array("a"=>"a","b"=>"e","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
  189. $sandhi[]=array("a"=>"a","b"=>"i","c"=>"i","len"=>1,"adj_len"=>0,"advance"=>false);
  190. $sandhi[]=array("a"=>"a","b"=>"o","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
  191. $sandhi[]=array("a"=>"a","b"=>"u","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
  192. $sandhi[]=array("a"=>"u","b"=>"a","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
  193. $sandhi[]=array("a"=>"u","b"=>"u","c"=>"ū","len"=>1,"adj_len"=>0,"advance"=>false);
  194. $sandhi[]=array("a"=>"a","b"=>"u","c"=>"u","len"=>1,"adj_len"=>0,"advance"=>false);
  195. $sandhi[]=array("a"=>"a","b"=>"ī","c"=>"ī","len"=>1,"adj_len"=>0,"advance"=>false);
  196. $sandhi[]=array("a"=>"a","b"=>"ū","c"=>"ū","len"=>1,"adj_len"=>0,"advance"=>false);
  197. $sandhi[]=array("a"=>"a","b"=>"i","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
  198. $sandhi[]=array("a"=>"i","b"=>"i","c"=>"ī","len"=>1,"adj_len"=>0,"advance"=>false);
  199. $sandhi[]=array("a"=>"i","b"=>"e","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
  200. $sandhi[]=array("a"=>"i","b"=>"a","c"=>"ya","len"=>2,"adj_len"=>0,"advance"=>false);
  201. $sandhi[]=array("a"=>"a","b"=>"atth","c"=>"atth","len"=>4,"adj_len"=>0,"advance"=>false);
  202. $sandhi[]=array("a"=>"taṃ","b"=>"n","c"=>"tann","len"=>4,"adj_len"=>0,"advance"=>false);
  203. $sandhi[]=array("a"=>"[ṃ]","b"=>"api","c"=>"mpi","len"=>3,"adj_len"=>0,"advance"=>false);
  204. $sandhi[]=array("a"=>"[ṃ]","b"=>"eva","c"=>"meva","len"=>4,"adj_len"=>0,"advance"=>false);
  205. $sandhi[]=array("a"=>"[o]","b"=>"iva","c"=>"ova","len"=>3,"adj_len"=>0,"advance"=>false);
  206. $sandhi[]=array("a"=>"a","b"=>"ādi","c"=>"ādi","len"=>3,"adj_len"=>0,"advance"=>false);
  207. $sandhi[]=array("a"=>"a[ānaṃ]","b"=>"a","c"=>"ānama","len"=>5,"adj_len"=>0,"advance"=>false);
  208. $sandhi[]=array("a"=>"a","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0,"advance"=>false);
  209. $sandhi[]=array("a"=>"[ṃ]","b"=>"ca","c"=>"ñca","len"=>3,"adj_len"=>0,"advance"=>false);
  210. $sandhi[]=array("a"=>"[ṃ]","b"=>"iti","c"=>"nti","len"=>3,"adj_len"=>0,"advance"=>false);
  211. $sandhi[]=array("a"=>"[ṃ]","b"=>"a","c"=>"ma","len"=>2,"adj_len"=>0,"advance"=>false);
  212. $sandhi[]=array("a"=>"[ṃ]","b"=>"ā","c"=>"mā","len"=>2,"adj_len"=>0,"advance"=>false);
  213. $sandhi[]=array("a"=>"[ṃ]","b"=>"u","c"=>"mu","len"=>2,"adj_len"=>0,"advance"=>false);
  214. $sandhi[]=array("a"=>"[ṃ]","b"=>"h","c"=>"ñh","len"=>2,"adj_len"=>0,"advance"=>false);
  215. $sandhi[]=array("a"=>"ā","b"=>"[ṃ]","c"=>"am","len"=>2,"adj_len"=>0,"advance"=>false);
  216. $sandhi[]=array("a"=>"ī","b"=>"[ṃ]","c"=>"im","len"=>2,"adj_len"=>0,"advance"=>false);
  217. $sandhi[]=array("a"=>"ati","b"=>"tabba","c"=>"atabba","len"=>6,"adj_len"=>0,"advance"=>false);
  218. $sandhi[]=array("a"=>"ati","b"=>"tabba","c"=>"itabba","len"=>6,"adj_len"=>0,"advance"=>false);
  219. $sandhi[]=array("a"=>"iti","b"=>"a","c"=>"icca","len"=>4,"adj_len"=>0,"advance"=>false);
  220. /*
  221. $sandhi[]=array("a"=>"u[ūnaṃ]","b"=>"a","c"=>"ūnama","len"=>5,"adj_len"=>0,"advance"=>false);
  222. $sandhi[]=array("a"=>"ī[īnaṃ]","b"=>"a","c"=>"īnama","len"=>5,"adj_len"=>0,"advance"=>false);
  223. $sandhi[]=array("a"=>"ā","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
  224. $sandhi[]=array("a"=>"a","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
  225. $sandhi[]=array("a"=>"e","b"=>"iti","c"=>"eti","len"=>3,"adj_len"=>0);
  226. $sandhi[]=array("a"=>"ī","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
  227. $sandhi[]=array("a"=>"i","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
  228. $sandhi[]=array("a"=>"o","b"=>"iti","c"=>"oti","len"=>3,"adj_len"=>0);
  229. $sandhi[]=array("a"=>"ū","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
  230. $sandhi[]=array("a"=>"u","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
  231. $sandhi[]=array("a"=>"ṃ","b"=>"iti","c"=>"nti","len"=>3,"adj_len"=>0);
  232. $sandhi[]=array("a"=>"ṃ","b"=>"ca","c"=>"ñca","len"=>3,"adj_len"=>0);
  233. $sandhi[]=array("a"=>"a","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
  234. $sandhi[]=array("a"=>"ā","b"=>"eva","c"=>"āyeva","len"=>5,"adj_len"=>0);
  235. $sandhi[]=array("a"=>"e","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
  236. $sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yeva","len"=>4,"adj_len"=>0);
  237. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyeva","len"=>5,"adj_len"=>0);
  238. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyeva","len"=>5,"adj_len"=>0);
  239. $sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ova","len"=>3,"adj_len"=>0);
  240. $sandhi[]=array("a"=>"u","b"=>"eva","c"=>"veva","len"=>3,"adj_len"=>0);
  241. $sandhi[]=array("a"=>"a","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
  242. $sandhi[]=array("a"=>"e","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
  243. $sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
  244. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
  245. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyevā","len"=>4,"adj_len"=>0);
  246. $sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyevā","len"=>4,"adj_len"=>0);
  247. $sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ovā","len"=>4,"adj_len"=>0);
  248. $sandhi[]=array("a"=>"ā","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
  249. $sandhi[]=array("a"=>"a","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
  250. $sandhi[]=array("a"=>"e","b"=>"api","c"=>"epi","len"=>3,"adj_len"=>0);
  251. $sandhi[]=array("a"=>"ī","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
  252. $sandhi[]=array("a"=>"i","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
  253. $sandhi[]=array("a"=>"o","b"=>"api","c"=>"opi","len"=>3,"adj_len"=>0);
  254. $sandhi[]=array("a"=>"ū","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
  255. $sandhi[]=array("a"=>"u","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
  256. $sandhi[]=array("a"=>"u","b"=>"api","c"=>"upi","len"=>3,"adj_len"=>0);
  257. $sandhi[]=array("a"=>"ṃ","b"=>"api","c"=>"mpi","len"=>3,"adj_len"=>0);
  258. */
  259. //$sandhi[]=array("a"=>"a","b"=>"a","c"=>"a","len"=>1,"adj_len"=>-1,"advance"=>true);
  260. //$sandhi[]=array("a"=>"ī","b"=>"","c"=>"i","len"=>1,"adj_len"=>0,"advance"=>true);
  261. $len=mb_strlen($strWord,"UTF-8");
  262. if($len>2){
  263. for($i=$len;$i>1;$i--){
  264. foreach($sandhi as $row){
  265. if(mb_substr($strWord,$i-$row["len"],$row["len"],"UTF-8")==$row["c"]){
  266. $str1=mb_substr($strWord,0,$i-$row["len"],"UTF-8").$row["a"];
  267. $str2=$row["b"].mb_substr($strWord,$i,NULL,"UTF-8");
  268. $confidence=isExsit($str1,$adj_len);
  269. if($confidence>=0.1){
  270. $output[] = array($str1,$str2,$confidence,$row["adj_len"]);
  271. if($turbo){
  272. break;
  273. }
  274. }
  275. }
  276. }
  277. }
  278. }
  279. if(count($output)>0){
  280. foreach($output as $part){
  281. $path[$deep][0]=$part[0];
  282. $path[$deep][1]=$part[2];
  283. if($part[1]!=""){
  284. mySplit2($part[1],($deep+1),$turbo,$part[3]);
  285. }
  286. else{
  287. $word = "";
  288. $cf=1.0;
  289. for($i=0;$i<$deep;$i++){
  290. $word .= $path[$i][0]."+";
  291. if(isset($_POST["debug"])){
  292. $word .= "(".$path[$i][1].")-";
  293. }
  294. $cf=$cf*$path[$i][1];
  295. }
  296. $word .= $part[0];
  297. if(isset($_POST["debug"])){
  298. $word .= "({$part[2]})";
  299. }
  300. $cf=$cf+$part[2]*0.1;
  301. $result[$word]=$cf;
  302. }
  303. }
  304. }
  305. else{
  306. $word = "";
  307. $cf=1.0;
  308. for($i=0;$i<$deep;$i++){
  309. $word .= $path[$i][0]."+";
  310. if(isset($_POST["debug"])){
  311. $word .= "(".$path[$i][1].")-";
  312. }
  313. $cf=$cf*$path[$i][1];
  314. }
  315. $len=pow(mb_strlen($strWord,"UTF-8"),3);
  316. $cf+=(0-$len)/($len+150);
  317. $word .= "{$strWord}(0)";
  318. $result[$word]=$cf;
  319. }
  320. }
  321. ?>