Ver código fonte

更新自动拆分算法

visuddhinanda 5 anos atrás
pai
commit
35e0ffa7ae
5 arquivos alterados com 738 adições e 541 exclusões
  1. 62 60
      app/dict/index.php
  2. 46 0
      app/dict/pali_word_list_to_redis.php
  3. 141 470
      app/dict/split.php
  4. 480 0
      app/dict/turbo_split.php
  5. 9 11
      app/redis/function.php

+ 62 - 60
app/dict/index.php

@@ -4,13 +4,12 @@ require_once "../pcdl/html_head.php";
 
 <body>
 	<a name="toc_root"></a>
-	<?php
-	if(!(isset($_GET["inline"]) && $_GET["inline"]=='1')){
-		require_once("../pcdl/head_bar.php");
-	}
-	
-	?>
-
+<?php
+if (!(isset($_GET["builtin"]) && $_GET["builtin"] == 'true')) {
+    require_once "../pcdl/head_bar.php";
+}
+?>
+	<script language="javascript" src="./dict.js"></script>
 	<style>
 		body {
 			margin: unset;
@@ -218,6 +217,16 @@ require_once "../pcdl/html_head.php";
 		.dict_find_gramma guide{
 			color:unset;
 		}
+
+		#pre_search_result{
+			background-color: var(--btn-color);
+			z-index: 50;
+			display:none;
+		}
+
+		#dt_title {
+			border-bottom: 2px solid var(--link-hover-color);
+		}
 	</style>
 	<link type="text/css" rel="stylesheet" href="./css/style.css" >
 	<link type="text/css" rel="stylesheet" href="./css/style_mobile.css" media="screen and (max-width:800px)">
@@ -241,14 +250,15 @@ require_once "../pcdl/html_head.php";
 			<div style="flex:6;">
 				<div>
 					<div>
-						<input id="dict_ref_search_input" type="input" placeholder="<?php echo $_local->gui->search; ?>" onkeyup="dict_input_keyup(event,this)" style="" onfocus="dict_input_onfocus()" />
+						<input id="dict_ref_search_input" type="input" placeholder="<?php echo $_local->gui->search; ?> 单词里面添加+ 预览拆词结果" onkeyup="dict_input_keyup(event,this)" style="" onfocus="dict_input_onfocus()" />
 					</div>
+					<div id="result_msg"></div>
 					<div id="word_parts">
 						<div id="input_parts" style="font-size: 1.1em;padding: 2px 1em;"></div>
 					</div>
 				</div>
 
-				<div id="pre_search_result" style="background-color: var(--btn-color);z-index: 50;">
+				<div id="pre_search_result" >
 					<div id="pre_search_word" class="pre_serach_block">
 						<div id="pre_search_word_title" class="pre_serach_block_title">
 							<div id="pre_search_word_title_left"><?php echo $_local->gui->vocabulary_list; ?></div>
@@ -261,8 +271,7 @@ require_once "../pcdl/html_head.php";
 			</div>
 			<span style="display:flex;">
 				<button id="trubo_split" onclick="trubo_split()" >
-					<?php echo $_local->gui->turbo_split; //强力拆分
-					?>
+					<?php echo $_local->gui->turbo_split; /*强力拆分*/ ?>
 				</button>
 				<guide gid="comp_split"></guide>
 			</span>
@@ -312,60 +321,53 @@ require_once "../pcdl/html_head.php";
 			</a>
 		</button>
 	</div>
-	<script>
-		window.addEventListener('scroll', winScroll);
-
-		function winScroll(e) {
-			if (GetPageScroll().y > 150) {
-				$("#search_toolbar_1").css("top", 0);
-			} else {
-				$("#search_toolbar_1").css("top", GetPageScroll().y - 150);
-			}
-			if (GetPageScroll().y > $(window).height() * 0.9) {
-				$("#tool_btn").show();
-			} else {
-				$("#tool_btn").hide();
-			}
-
-		}
-		//滚动条位置
-		function GetPageScroll() {
-			var pos = new Object();
-			var x, y;
-			if (window.pageYOffset) { // all except IE	
-				y = window.pageYOffset;
-				x = window.pageXOffset;
-			} else if (document.documentElement && document.documentElement.scrollTop) { // IE 6 Strict	
-				y = document.documentElement.scrollTop;
-				x = document.documentElement.scrollLeft;
-			} else if (document.body) { // all other IE	
-				y = document.body.scrollTop;
-				x = document.body.scrollLeft;
-			}
-			pos.x = x;
-			pos.y = y;
-			return (pos);
-		}
-	</script>
-	<style>
-		#dt_title {
-			border-bottom: 2px solid var(--link-hover-color);
-		}
-	</style>
-	<script language="javascript" src="./dict.js"></script>
+
 
 	<div id="dict_search_result" style="background-color:white;color:black;">
 	</div>
+	<script>
+<?php
+// Emit the query as a JSON-encoded JS literal (HEX flags escape <, >, &, ', ") so it cannot break out of the string or the <script> block.
+if (!empty($_GET["key"]) && is_string($_GET["key"]) && ($jsKey = json_encode($_GET["key"], JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT)) !== false) {
+    echo "var _key={$jsKey};\nsearch_on_load(_key)";
+}
+?>
 
-	<?php
-	if (!empty($_GET["key"])) {
-		echo "<script>";
-		echo "dict_pre_word_click(\"{$_GET["key"]}\")";
-		echo "</script>";
+window.addEventListener('scroll', winScroll);
+
+function winScroll(e) {
+	if (GetPageScroll().y > 150) {
+		$("#search_toolbar_1").css("top", 0);
+	} else {
+		$("#search_toolbar_1").css("top", GetPageScroll().y - 150);
+	}
+	if (GetPageScroll().y > $(window).height() * 0.9) {
+		$("#tool_btn").show();
+	} else {
+		$("#tool_btn").hide();
 	}
-	?>
 
+}
+// Scroll position of the page as an object {x, y}: tries window.pageYOffset, then document.documentElement.scrollTop (each only when non-zero), then document.body.
+function GetPageScroll() {
+	var pos = new Object();
+	var x, y; // remain undefined if none of the branches below matches
+	if (window.pageYOffset) { // all except IE
+		y = window.pageYOffset;
+		x = window.pageXOffset;
+	} else if (document.documentElement && document.documentElement.scrollTop) { // IE 6 Strict
+		y = document.documentElement.scrollTop;
+		x = document.documentElement.scrollLeft;
+	} else if (document.body) { // all other IE; also reached when the offsets tested above are 0
+		y = document.body.scrollTop;
+		x = document.body.scrollLeft;
+	}
+	pos.x = x;
+	pos.y = y;
+	return (pos);
+}
+</script>
 
 	<?php
-	include "../pcdl/html_foot.php";
-	?>
+include "../pcdl/html_foot.php";
+?>

+ 46 - 0
app/dict/pali_word_list_to_redis.php

@@ -0,0 +1,46 @@
+<?php
+require_once "../path.php";
+require_once "../install/filelist.php";
+require_once "../redis/function.php";
+
+if (PHP_SAPI == "cli") {
+    if ($argc >= 2) {
+        $command = $argv[1];
+    } else {
+        $redis = redis_connect();
+        if ($redis == false) {
+            echo "no redis connect\n";
+            exit;
+        }
+        $dirXmlBase = _DIR_PALI_CSV_ . "/";
+
+        $book = array(1, 2, 3, 4, 5, 6, 7, 8, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 153, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217);
+        $redis->delete('pali_word');
+        foreach ($book as $key => $value) {
+            // Import one book: its CSV is read from <base>/<name>/<name>.csv, where <name> is $_filelist[$value].
+            echo "runing:{$value}\n";
+            $outputFileNameHead = $_filelist[$value];
+            $csvFile = $dirXmlBase . $outputFileNameHead . "/" . $outputFileNameHead . ".csv";
+            // Open the file and read it line by line; $irow counts lines so the first one (presumably the CSV header) is skipped.
+            $irow = 0;
+            if (($fp = fopen($csvFile, "r")) !== false) {
+                while (($str = fgets($fp)) !== false) {
+                    $data = explode(",", $str);
+                    $irow++;
+                    // Missing columns (blank or short lines) are read as "" instead of raising undefined-offset notices.
+                    $word = $data[5] ?? "";
+                    if ($irow > 1 && ($data[6] ?? "") != ".ctl." && $word != "") {
+                        $redis->sadd('pali_word', $word);
+                    }
+                }
+                fclose($fp);
+            } else {
+                echo "can not open csv file. filename=" . $csvFile . "\n";
+            }
+        }
+    }
+} else {
+    echo "cli";
+}
+
+echo "<h2>齐活!功德无量!all done!</h2>";

+ 141 - 470
app/dict/split.php

@@ -4,501 +4,172 @@
 function: split compound word
 step 1 : split at diphthong . ~aa~ -> ~a-a~
 第一步:先切开双元音
-step 2 : every part use sandhi rule 
+step 2 : every part use sandhi rule
 第二步:用$sandhi的方法切分(套用连音规则)
-algorithm: 
+algorithm:
 算法:
 f(word){
-	1. cut one letter from the end of word by sandhi rule in array($sandhi)
-	1. 从单词尾部切去一个字母
-	2. lookup first part . 
-	2. 查询剩余部分
-	if confidence value>0.8 
-	如果有结果
-		- get the confidence value 
-		获取该部分的信心指数
-		- process the remaining part at same way
-		用同样的方法处理剩余部分
-		- f(stack.first element)
-	else
-		apply other sandhi rule
-		back to 1
+1. cut one letter from the end of word by sandhi rule in array($sandhi)
+1. 从单词尾部切去一个字母
+2. lookup first part .
+2. 查询剩余部分
+if confidence value>0.8
+如果有结果
+- get the confidence value
+获取该部分的信心指数
+- process the remaining part at same way
+用同样的方法处理剩余部分
+- f(stack.first element)
+else
+apply other sandhi rule
+back to 1
 }
 this is a recursion, depth=16
 此为递归算法,深度=16
-*/
-require_once '../public/casesuf.inc';
-require_once '../studio/dict_find_un.inc';
-require_once '../studio/sandhi.php';
-require_once "../path.php";
-require_once "../public/_pdo.php";
+ */
+require_once "../dict/turbo_split.php";
 
 //check input
-if(isset($_POST["word"])){
-	$input_word=mb_strtolower(trim($_POST["word"]),'UTF-8');
-	if(trim($input_word)==""){
-		echo "Empty";
-		exit;
-	}
-	$arrWords = str_getcsv($input_word,"\n");//支持批量拆分 
-}
-else{
-?>
+if (isset($_POST["word"])) {
+    $input_word = mb_strtolower(trim($_POST["word"]), 'UTF-8');
+    if (trim($input_word) == "") {
+        echo "Empty";
+        exit;
+    }
+    $arrWords = str_getcsv($input_word, "\n"); //支持批量拆分
+} else {
+    ?>
 <!--debug only-->
 <form action="split.php" method="post">
-Words: <textarea type="text" name="word"></textarea>
+Words: <br>
+<textarea type="text" name="word" style="width:50em;height:20em;"></textarea><br>
 <input name="debug" type="hidden" />批量查询,单词之间用换行分隔。 input word. between two words insert 'enter'
 <div>
-<input type="checkbox" name = "express" checked /> 快速搜索(遇到第一个连音规则成功就返回) return when get first result 
+<input type="checkbox" name = "express" checked /> 快速搜索(遇到第一个连音规则成功就返回) return when get first result
 </div>
 <input type="submit">
 </form>
 
 <?php
-	return;
-}
-
-if(isset($_POST["express"])){
-	if($_POST["express"]==="on"){
-		$_express = true;
-	}
-	else{
-		$_express = false;
-	}
+return;
 }
-else{
-	$_express = false;
-}
-
-// open word part db
-global $dbh;
-$dns = "sqlite:"._FILE_DB_PART_;
-$dbh = new PDO($dns, "", "",array(PDO::ATTR_PERSISTENT=>true));
-$dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
-
-global $path;
-global $confidence;
-global $result;
-global $part ;
-$part= array();
-
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-$path[]=array("",0);
-
-	global $sandhi ;
-	//sandhi rules table 语尾表
-	$sandhi[]=array("a"=>"","b"=>"","c"=>"","len"=>0,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"a","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ā","b"=>"ā","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"ā","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ā","b"=>"a","c"=>"ā","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"e","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"i","c"=>"i","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"o","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"u","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"u","b"=>"a","c"=>"o","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"u","b"=>"u","c"=>"ū","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"u","c"=>"u","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"ī","c"=>"ī","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"ū","c"=>"ū","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"i","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"i","b"=>"i","c"=>"ī","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"i","b"=>"e","c"=>"e","len"=>1,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"i","b"=>"a","c"=>"ya","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"atth","c"=>"atth","len"=>4,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"taṃ","b"=>"n","c"=>"tann","len"=>4,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"api","c"=>"mpi","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"eva","c"=>"meva","len"=>4,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[o]","b"=>"iva","c"=>"ova","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"ādi","c"=>"ādi","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a[ānaṃ]","b"=>"a","c"=>"ānama","len"=>5,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"a","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"ca","c"=>"ñca","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"iti","c"=>"nti","len"=>3,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"a","c"=>"ma","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"ā","c"=>"mā","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"u","c"=>"mu","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"[ṃ]","b"=>"h","c"=>"ñh","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ā","b"=>"[ṃ]","c"=>"am","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ī","b"=>"[ṃ]","c"=>"im","len"=>2,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ati","b"=>"tabba","c"=>"atabba","len"=>6,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"ati","b"=>"tabba","c"=>"itabba","len"=>6,"adj_len"=>0,"advance"=>false);
-	$sandhi[]=array("a"=>"iti","b"=>"a","c"=>"icca","len"=>4,"adj_len"=>0,"advance"=>false);
-
-/*
-other sandhi rule. can be use but program will be slow down
-其他连音规则,如果使用则会让程序运行变慢
-
-$sandhi[]=array("a"=>"u[ūnaṃ]","b"=>"a","c"=>"ūnama","len"=>5,"adj_len"=>0,"advance"=>false);
-$sandhi[]=array("a"=>"ī[īnaṃ]","b"=>"a","c"=>"īnama","len"=>5,"adj_len"=>0,"advance"=>false);
-
-$sandhi[]=array("a"=>"ā","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"a","b"=>"iti","c"=>"āti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"e","b"=>"iti","c"=>"eti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"i","b"=>"iti","c"=>"īti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"o","b"=>"iti","c"=>"oti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ū","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"u","b"=>"iti","c"=>"ūti","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ṃ","b"=>"iti","c"=>"nti","len"=>3,"adj_len"=>0);
-
-$sandhi[]=array("a"=>"ṃ","b"=>"ca","c"=>"ñca","len"=>3,"adj_len"=>0);
-
-$sandhi[]=array("a"=>"a","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ā","b"=>"eva","c"=>"āyeva","len"=>5,"adj_len"=>0);
-$sandhi[]=array("a"=>"e","b"=>"eva","c"=>"eva","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yeva","len"=>4,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyeva","len"=>5,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyeva","len"=>5,"adj_len"=>0);
-$sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ova","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"u","b"=>"eva","c"=>"veva","len"=>3,"adj_len"=>0);
-
-$sandhi[]=array("a"=>"a","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"e","b"=>"eva","c"=>"evā","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"i","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"yevā","len"=>4,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"iyevā","len"=>4,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"eva","c"=>"īyevā","len"=>4,"adj_len"=>0);
-$sandhi[]=array("a"=>"o","b"=>"eva","c"=>"ovā","len"=>4,"adj_len"=>0);
-
-$sandhi[]=array("a"=>"ā","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"a","b"=>"api","c"=>"āpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"e","b"=>"api","c"=>"epi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ī","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"i","b"=>"api","c"=>"īpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"o","b"=>"api","c"=>"opi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ū","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"u","b"=>"api","c"=>"ūpi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"u","b"=>"api","c"=>"upi","len"=>3,"adj_len"=>0);
-$sandhi[]=array("a"=>"ṃ","b"=>"api","c"=>"mpi","len"=>3,"adj_len"=>0);
-*/
-	//$sandhi[]=array("a"=>"a","b"=>"a","c"=>"a","len"=>1,"adj_len"=>-1,"advance"=>true);
-	//$sandhi[]=array("a"=>"ī","b"=>"","c"=>"i","len"=>1,"adj_len"=>0,"advance"=>true);
-
-
-//diphthong table双元音表
-$search  = array('aa', 'ae', 'ai', 'ao', 'au', 'aā', 'aī', 'aū', 'ea', 'ee', 'ei', 'eo', 'eu', 'eā', 'eī', 'eū', 'ia', 'ie', 'ii', 'io', 'iu', 'iā', 'iī', 'iū', 'oa', 'oe', 'oi', 'oo', 'ou', 'oā', 'oī', 'oū', 'ua', 'ue', 'ui', 'uo', 'uu', 'uā', 'uī', 'uū', 'āa', 'āe', 'āi', 'āo', 'āu', 'āā', 'āī', 'āū', 'īa', 'īe', 'īi', 'īo', 'īu', 'īā', 'īī', 'īū', 'ūa', 'ūe', 'ūi', 'ūo', 'ūu', 'ūā', 'ūī', 'ūū');
-$replace = array('a-a', 'a-e', 'a-i', 'a-o', 'a-u', 'a-ā', 'a-ī', 'a-ū', 'e-a', 'e-e', 'e-i', 'e-o', 'e-u', 'e-ā', 'e-ī', 'e-ū', 'i-a', 'i-e', 'i-i', 'i-o', 'i-u', 'i-ā', 'i-ī', 'i-ū', 'o-a', 'o-e', 'o-i', 'o-o', 'o-u', 'o-ā', 'o-ī', 'o-ū', 'u-a', 'u-e', 'u-i', 'u-o', 'u-u', 'u-ā', 'u-ī', 'u-ū', 'ā-a', 'ā-e', 'ā-i', 'ā-o', 'ā-u', 'ā-ā', 'ā-ī', 'ā-ū', 'ī-a', 'ī-e', 'ī-i', 'ī-o', 'ī-u', 'ī-ā', 'ī-ī', 'ī-ū', 'ū-a', 'ū-e', 'ū-i', 'ū-o', 'ū-u', 'ū-ā', 'ū-ī', 'ū-ū');
-
-//main 
-
-$allword = array();
-foreach($arrWords as $oneword){
-	//预处理
-	//将双元音拆开
-	//step 1 : split at diphthong . ~aa~ -> ~a-a~
-	$word = str_replace($search, $replace, $oneword);
-
-	if(isset($_POST["debug"])){
-		echo "Look up:{$word}<br>";
-	}
-	
-	//按连字符拆开处理
-	$arrword = str_getcsv($word,"-");
-
-	$t1=microtime_float();
-	$output = array();
-	foreach($arrword as $oneword){
-		$result = array();//全局变量,递归程序的输出容器
-
-		if(mb_strlen($oneword,"UTF-8")<30){
-			mySplit2($oneword,0,$_express);
-		}
-		else{
-			mySplit2($oneword,0,$_express);
-		}
-		
-		arsort($result);//按信心指数排序
-		$wordlist = array();
-		$iMax = 5;
-		$iCount = 0;
-		foreach($result as $row=>$value){
-			$iCount++;
-			$word_part  = array();
-			$word_part["word"] = $row;
-			$word_part["confidence"] = $value;
-			$wordlist[] = $word_part;
-			if($iCount>=$iMax){
-			break;
-			}
 
-		}
-		$output[] = $wordlist;
-
-		if(isset($_POST["debug"])){
-			echo "<h2>{$oneword}</h2>";
-			echo "<h4>".count($result)."</h4>";
-		}
-		$iCount=0;
-		foreach($result as $row=>$value){
-			if($iCount>10){
-				break;
-			}
-			$iCount++;
-			$level=$value*90;
-			if(isset($_POST["debug"])){
-				echo $row."-[".$value."]<br>";
-			}
-		}
-		
-		/*
-		后处理
-		-ssāpi=-[ssa]-api
-		*/
-	}
-	$t2 = microtime_float();
-	$one_split["data"]=$output;
-	$one_split["time"]= $auto_split_times;
-	$one_split["second"]= $t2-$t1;	
-	$allword[] = $one_split;
-
-	if(isset($_POST["debug"])){
-		echo "<div>";
-		echo "<br>查询【{$auto_split_times}】次";	
-		echo "time:".($t2-$t1);
-		echo "</div>";
-	}
-}
-
-if(isset($_POST["debug"])){
-	echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
-	print_r($allword);
-	echo "</pre>";
-}
-echo json_encode($allword,JSON_UNESCAPED_UNICODE);
-
-/*
-用于数组连接字符串
-*/
-function myfunction($v1,$v2)
-{
-	return $v1 . "+" . $v2;
-}
-function microtime_float()
-{
-    list($usec, $sec) = explode(" ", microtime());
-    return ((float)$usec + (float)$sec);
-}
-
-function dict_lookup($word){
-	global $dbh;
-	$query = "SELECT weight from part where word = ? ";
-	$stmt = $dbh->prepare($query);
-	$stmt->execute(array($word));
-    $row = $stmt->fetch(PDO::FETCH_NUM);
-    if ($row) {
-        return $row[0];
+if (isset($_POST["express"])) {
+    if ($_POST["express"] === "on") {
+        $_express = true;
     } else {
-        return 0;
-    }	
+        $_express = false;
+    }
+} else {
+    $_express = false;
 }
 
-/*
-查找某个单词是否在现有词典出现
-返回信心指数
-look up single word in dictionary vocabulary
-return the confidence value
-*/
-function isExsit($word,$adj_len=0){
-
-	global $auto_split_times;
-	global $result;
-	global $part;
-	global $confidence;
-	$auto_split_times++;
-	
-	if(isset($_POST["debug"])){
-		echo "<div>正在查询:{$word}</div>";
-	}
-	$isFound=false;
-	if(isset($part["{$word}"]))
-	{
-		if($part["{$word}"]>0){
-			$isFound=true;
-			$count=$part["{$word}"]+1;			
-		}
-	}
-	else{
-		$db=dict_lookup($word);
-		//加入查询缓存
-		$part["{$word}"] = $db;
-		if($db>0){
-			$isFound=true;
-			$count=$db+1;
-		}
-		else{
-			
-		}
-	} 
-//fomular of confidence value 信心值计算公式
-	if($isFound)
-	{
-		if(isset($confidence["{$word}"])){
-			$cf=$confidence["{$word}"];
-		}
-		else{
-			$len=mb_strlen($word,"UTF-8")+$adj_len;
-			$len_correct=1.2;
-			$count2=1.1+pow($count,1.18);
-			$conf_num=pow(1/$count2,pow(($len-1),$len_correct));
-			$cf=round(1/(1+640*$conf_num),9);
+//main
 
-			$confidence["{$word}"]=$cf;
-		}
-		return($cf);
-		
-	}
-	else{
-		return(-1);
-	}
+$allword = array();
+foreach ($arrWords as $currword) { // one entry per input line of the posted "word" field
+    $t1 = microtime_float();
+    $output = array();
+    if (isset($_POST["debug"])) {
+        echo "Look up:{$currword}<br>";
+    }
+
+    // Pre-processing:
+    // break the word apart at diphthongs
+    //step 1 : split at diphthong . ~aa~ -> ~a-a~
+    // then handle each hyphen-separated piece on its own
+    $arrword = split_diphthong($currword);
+    foreach ($arrword as $oneword) {
+        $result = array(); // global variable: output container of the recursive routine
+
+        mySplit2($oneword, 0, $_express, 0, 0.1, 0.01, true, true); // presumably the forward pass, per the debug label below - confirm in turbo_split.php
+        if (isset($_POST["debug"])) {
+            echo "正切:" . count($result) . "\n";
+        }
+        mySplit2($oneword, 0, $_express, 0, 0.1, 0.01, false, true); // presumably the reverse pass; $result is not cleared in between
+        if (isset($_POST["debug"])) {
+            echo "反切:" . count($result) . "\n";
+        }
+        /* Disabled fallback passes, kept for reference:
+        if (count($result) < 5) {
+        #sandhi advance
+        mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false, true);
+        if (isset($_POST["debug"])) {
+        echo "反切:" . count($result) . "\n";
+        }
+        }
+        if (count($result) < 5) {
+        #reverse
+        mySplit2($oneword, 0, $_express, 0, 0.8, 0.8, false);
+        }
+        if (count($result) < 5) {
+        #forward
+        mySplit2($oneword, 0, $_express, 0, 0.8, 0, true);
+        }
+        if (count($result) < 5) {
+        #reverse
+        mySplit2($oneword, 0, $_express, 0, 0.8, 0, false);
+        }
+         */
+        arsort($result); // sort by confidence value, highest first (keys are kept)
+
+        # collect the best candidates for the JSON output
+        $wordlist = array();
+        $iMax = 5; // at most this many candidates are reported per piece
+        $iCount = 0;
+        foreach ($result as $row => $value) {
+            $iCount++;
+            $word_part = array();
+            $word_part["word"] = $row;
+            $word_part["confidence"] = $value;
+            $wordlist[] = $word_part;
+            if ($iCount >= $iMax) {
+                break;
+            }
+
+        }
+        $output[] = $wordlist;
+
+        if (isset($_POST["debug"])) {
+            echo "<h2>{$oneword}</h2>";
+            echo "<h4>" . count($result) . "</h4>";
+        }
+        $iCount = 0;
+        foreach ($result as $row => $value) { // debug listing of the leading candidates
+            if ($iCount > 10) {
+                break;
+            }
+            $iCount++;
+            $level = $value * 90; // computed but not used in the code visible here
+            if (isset($_POST["debug"])) {
+                echo $row . "-[" . $value . "]<br>";
+            }
+        }
+
+        /*
+    post-processing placeholder (no code yet), e.g.
+    -ssāpi=-[ssa]-api
+     */
+    }
+    $t2 = microtime_float();
+    $one_split["data"] = $output;
+    $one_split["time"] = $auto_split_times; // not set in this file; presumably a lookup counter maintained by turbo_split.php - confirm
+    $one_split["second"] = $t2 - $t1;
+    $allword[] = $one_split;
+
+    if (isset($_POST["debug"])) {
+        echo "<div>";
+        echo "<br>查询【{$auto_split_times}】次";
+        echo "time:" . ($t2 - $t1);
+        echo "</div>";
+    }
+}
 
-/*
-核心拆分函数
-
-$strWord, word to be look up 要查询的词
-$deep, 当前递归深度
-$express=true, 快速查询
-$adj_len=0 长度校正系数
-$c_threshhold 信心指数阈值
-*/
-
-function mySplit2($strWord,$deep,$express=false,$adj_len=0,$c_threshhold=0.8){
-	global $path;
-	global $result;
-	global $sandhi ;
-	$output = array();
-	
-	//达到最大搜索深度,返回
-	if($deep>=16){
-		$word = "";
-		$cf=1.0;
-		for($i=0;$i<$deep;$i++){
-			$word .= $path[$i][0];
-			if(isset($_POST["debug"])){
-				$word .="(".$path[$i][1].")-";
-			}
-			else{
-				$word .= "-";
-			}
-			$cf=$cf*$path[$i][1];
-		}
-		$len=pow(mb_strlen($strWord,"UTF-8"),3);
-		$cf+=(0-$len)/($len+150);
-		$word .= "{$strWord}";
-		$result[$word]=$cf;
-		return;
-	}
-	//直接找到
-	$confidence=isExsit($strWord,$adj_len);
-	if($confidence>=0){
-		$output[] = array($strWord,"",$confidence);
-	}
-	else{
-		$confidence=isExsit("[".$strWord."]");
-		if($confidence>=0){
-			$output[] = array("[".$strWord."]","",$confidence);
-		}
-	}
-
-	//如果开头有双辅音,去掉第一个辅音。因为巴利语中没有以双辅音开头的单词。
-	$doubleword="kkggccjjṭṭḍḍttddppbb";
-	if(mb_strlen($strWord,"UTF-8")>2){
-		$left2=mb_substr($strWord,0,2,"UTF-8");
-		if(mb_strpos($doubleword,$left2,0,"UTF-8")!==FALSE){
-			$strWord=mb_substr($strWord,1,NULL,"UTF-8");
-		}
-	}
-
-
-	$len=mb_strlen($strWord,"UTF-8");
-	if($len>2){
-		for($i=$len;$i>1;$i--){
-			foreach($sandhi as $row){
-				if(mb_substr($strWord,$i-$row["len"],$row["len"],"UTF-8")==$row["c"]){
-					$str1=mb_substr($strWord,0,$i-$row["len"],"UTF-8").$row["a"];
-					$str2=$row["b"].mb_substr($strWord,$i,NULL,"UTF-8");
-					$confidence=isExsit($str1,$adj_len);
-					if($confidence > $c_threshhold){
-						$output[] = array($str1,$str2,$confidence,$row["adj_len"]);
-						if($express){
-							break;
-						}
-					}
-
-				}
-			}
-		}
-	}
-
-	if(count($output)>0){
-		foreach($output as $part){
-			$path[$deep][0]=$part[0];
-			$path[$deep][1]=$part[2];
-			if($part[1]!=""){
-				mySplit2($part[1],($deep+1),$express,$part[3],$c_threshhold);
-			}
-			else{
-				$word = "";
-				$cf=1.0;
-				for($i=0;$i<$deep;$i++){
-					$word .= $path[$i][0]."+";
-					if(isset($_POST["debug"])){
-						$word .= "(".$path[$i][1].")-";
-					}
-					$cf=$cf*$path[$i][1];
-				}
-				$word .= $part[0];
-				if(isset($_POST["debug"])){
-					$word .= "({$part[2]})";
-				}
-				$cf=$cf+$part[2]*0.1;
-				if($cf >= $c_threshhold){
-					$result[$word]=$cf;
-				}
-			}
-		}
-	}
-	else{
-		$word = "";
-		$cf=1.0;
-		for($i=0;$i<$deep;$i++){
-			$word .= $path[$i][0]."+";
-			if(isset($_POST["debug"])){
-				$word .= "(".$path[$i][1].")-";
-			}
-			$cf=$cf*$path[$i][1];
-		}
-		$len=pow(mb_strlen($strWord,"UTF-8"),3);
-		$cf+=(0-$len)/($len+150);
-		if(isset($_POST["debug"])){
-			$word .= $strWord."(0)";
-		}
-		else{
-			$word .= $strWord;
-		}
-		
-		if($cf >= $c_threshhold){
-			$result[$word]=$cf;
-		}
-	}
+if (isset($_POST["debug"])) {
+    echo "<pre style='margin:2em;padding:1em;background-color:#e9e9e9;'>";
+    print_r($allword);
+    echo "</pre>";
 }
-
-
+echo json_encode($allword, JSON_UNESCAPED_UNICODE);
 
 ?>

+ 480 - 0
app/dict/turbo_split.php

@@ -0,0 +1,480 @@
+<?php
+require_once '../public/casesuf.inc';
+//require_once '../studio/dict_find_un.inc';
+//require_once '../studio/sandhi.php';
+require_once "../path.php";
+require_once "../public/_pdo.php";
+// Open the word-part SQLite database; dict_lookup() reads this handle through `global $dbh`.
+global $dbh;
+$dns = "sqlite:" . _FILE_DB_PART_; // PDO DSN; _FILE_DB_PART_ is defined outside this file (presumably ../path.php - confirm)
+$dbh = new PDO($dns, "", "", array(PDO::ATTR_PERSISTENT => true));
+$dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING); // failures surface as PHP warnings and false returns, not exceptions
+
+// Shared state used by the splitter functions in this file:
+//   $path       per-depth scratch buffer filled by mySplit2(): $path[$deep][0] = part, $path[$deep][1] = its confidence
+//   $confidence cache written by isExsit(): word => computed confidence value
+//   $result     output of mySplit2(): "part+part+..." => score
+//   $part       cache written by isExsit(): word => weight returned by dict_lookup()
+global $path;
+global $confidence;
+global $result;
+global $part;
+$part = array();
+
+// Pre-fill 17 slots (indexes 0..16); mySplit2() stops recursing once $deep >= 16.
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+$path[] = array("", 0);
+
+global $sandhi;
+// Sandhi (junction) rule table consumed by mySplit2().
+// Each rule describes how two parts merge at a junction:
+//   "c"       surface string searched for in the compound at the cut position
+//   "len"     length of "c" in characters (must match "c", it is used as the substring length)
+//   "a"       text appended to the left-hand part after "c" is removed
+//   "b"       text prepended to the right-hand part after "c" is removed
+//   "adj_len" length correction stored with each candidate produced by the rule
+//   "advance" when true the rule is only tried if mySplit2() is called with $sandhi_advance = true,
+//             and the confidence of its candidates is multiplied by 0.99
+// Bracketed text in "a" (e.g. "[ṃ]") is appended verbatim; dict_lookup() searches from the first "[" onward.
+// Every rule is tested at every cut position, so exact duplicate rows only repeat work:
+// keep each (a, b, c) combination once.
+$sandhi[] = array("a" => "", "b" => "", "c" => "", "len" => 0, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ā", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "ā", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ā", "b" => "a", "c" => "ā", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "i", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "o", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "u", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "u", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "u", "c" => "u", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "ī", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "ū", "c" => "ū", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "i", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "e", "b" => "a", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "i", "c" => "ī", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "e", "c" => "e", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "a", "c" => "ya", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "atth", "c" => "atth", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "taṃ", "b" => "n", "c" => "tann", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "eva", "c" => "meva", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[o]", "b" => "iva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "o", "b" => "a", "c" => "o", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "ādi", "c" => "ādi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a[ānaṃ]", "b" => "a", "c" => "ānama", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "a", "c" => "ma", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ṃ", "b" => "a", "c" => "m", "len" => 1, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "ā", "c" => "mā", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "u", "c" => "mu", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "[ṃ]", "b" => "h", "c" => "ñh", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ā", "b" => "[ṃ]", "c" => "am", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "[ṃ]", "c" => "im", "len" => 2, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ati", "b" => "tabba", "c" => "atabba", "len" => 6, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ati", "b" => "tabba", "c" => "itabba", "len" => 6, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "iti", "b" => "a", "c" => "icca", "len" => 4, "adj_len" => 0, "advance" => false);
+
+$sandhi[] = array("a" => "uṃ", "b" => "a", "c" => "uma", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u[ūnaṃ]", "b" => "a", "c" => "ūnama", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī[īnaṃ]", "b" => "a", "c" => "īnama", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "su", "b" => "a", "c" => "sva", "len" => 3, "adj_len" => 0, "advance" => false);
+
+# Additional sandhi rules. They can be enabled, but every extra rule is tried at
+# every cut position, so the splitter gets slower.
+# (The a + iti => āti rule is already listed above and is deliberately not repeated here.)
+
+$sandhi[] = array("a" => "ā", "b" => "iti", "c" => "āti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "e", "b" => "iti", "c" => "eti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "iti", "c" => "īti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "o", "b" => "iti", "c" => "oti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ū", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "iti", "c" => "ūti", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ṃ", "b" => "iti", "c" => "nti", "len" => 3, "adj_len" => 0, "advance" => false);
+
+$sandhi[] = array("a" => "ṃ", "b" => "ca", "c" => "ñca", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ṃ", "b" => "cāti", "c" => "ñcāti", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ṃ", "b" => "cet", "c" => "ñcet", "len" => 4, "adj_len" => 0, "advance" => false);
+
+/*
+Disabled rules (eva / api junctions). The "len" values below have been corrected to the
+character count of "c" so the rules are usable as-is if they are ever re-enabled.
+
+$sandhi[] = array("a" => "a", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ā", "b" => "eva", "c" => "āyeva", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "e", "b" => "eva", "c" => "eva", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "eva", "c" => "yeva", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "iyeva", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "īyeva", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "o", "b" => "eva", "c" => "ova", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "eva", "c" => "veva", "len" => 4, "adj_len" => 0, "advance" => false);
+
+$sandhi[] = array("a" => "a", "b" => "eva", "c" => "evā", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "e", "b" => "eva", "c" => "evā", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "eva", "c" => "yevā", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "yevā", "len" => 4, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "iyevā", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "eva", "c" => "īyevā", "len" => 5, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "o", "b" => "eva", "c" => "ovā", "len" => 3, "adj_len" => 0, "advance" => false);
+
+$sandhi[] = array("a" => "ā", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "a", "b" => "api", "c" => "āpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "e", "b" => "api", "c" => "epi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ī", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "i", "b" => "api", "c" => "īpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "o", "b" => "api", "c" => "opi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ū", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "api", "c" => "ūpi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "u", "b" => "api", "c" => "upi", "len" => 3, "adj_len" => 0, "advance" => false);
+$sandhi[] = array("a" => "ṃ", "b" => "api", "c" => "mpi", "len" => 3, "adj_len" => 0, "advance" => false);
+ */
+// "Advanced" rules: only applied when mySplit2() runs with $sandhi_advance = true.
+$sandhi[] = array("a" => "a", "b" => "a", "c" => "a", "len" => 1, "adj_len" => -1, "advance" => true);
+$sandhi[] = array("a" => "ī", "b" => "", "c" => "i", "len" => 1, "adj_len" => 0, "advance" => true);
+
+/**
+ * Split a word at every boundary between two adjacent vowels (a e i o u ā ī ū).
+ *
+ * Example: "tiākāra" => array("ti", "ākāra").
+ *
+ * Unlike a pair-by-pair str_replace(), the zero-width pattern also splits runs of
+ * three or more identical vowels completely ("aaa" => "a", "a", "a"), and the
+ * result is produced with explode() so quote characters in the input are not
+ * interpreted as CSV enclosures. A "-" already present in $word is also a split
+ * point. An empty string yields array("").
+ *
+ * @param string $word word to split (UTF-8)
+ * @return array list of fragments in their original order
+ */
+function split_diphthong($word)
+{
+    // Zero-width position that has a vowel on both sides; "u" flag makes the
+    // character classes operate on UTF-8 code points rather than bytes.
+    $marked = preg_replace('/(?<=[aeiouāīū])(?=[aeiouāīū])/u', '-', $word);
+    if ($marked === null) {
+        // preg_replace() returns null on failure (e.g. invalid UTF-8): keep the word unsplit.
+        $marked = $word;
+    }
+    return explode('-', $marked);
+}
+
+/*
+Joins two values with a "+" between them.
+Used when concatenating the elements of an array into one string.
+ */
+function myfunction($v1, $v2)
+{
+    $joined = "{$v1}+{$v2}";
+    return $joined;
+}
+/**
+ * Current Unix timestamp with microsecond resolution.
+ *
+ * @return float seconds since the Unix epoch, including the fractional part
+ */
+function microtime_float()
+{
+    // microtime(true) returns the float directly; no need to parse the "usec sec" string form.
+    return microtime(true);
+}
+
+/**
+ * Look up the weight of a word in the `part` table of the global $dbh database.
+ *
+ * 1. If $word contains "[", the text from the first "[" onward is searched,
+ *    otherwise $word itself.
+ * 2. If that finds no row, every entry of the global $case table whose ending
+ *    ($case[n][1]) matches the end of $word is used to build a candidate base
+ *    (ending replaced by $case[n][0]); the highest weight among those bases wins.
+ *
+ * @param string $word word to look up (UTF-8)
+ * @return mixed weight column of the matching row (as returned by PDO), or 0 when
+ *               nothing is found, the word is at most one byte long, or the query
+ *               cannot be prepared
+ */
+function dict_lookup($word)
+{
+    // strlen() counts bytes: a single multi-byte letter such as "ā" is still looked up.
+    if (strlen($word) <= 1) {
+        return 0;
+    }
+    global $case;
+    global $dbh;
+
+    // Prepare once and reuse the statement for the direct hit and for every candidate base.
+    $stmt = $dbh->prepare("SELECT weight from part where word = ? ");
+    if ($stmt === false) {
+        // With PDO::ERRMODE_WARNING a failed prepare() returns false instead of throwing.
+        return 0;
+    }
+
+    $bracketed = strstr($word, "[");
+    $search = ($bracketed === false) ? $word : $bracketed;
+    $stmt->execute(array($search));
+    $hit = $stmt->fetch(PDO::FETCH_NUM);
+    if ($hit) {
+        return $hit[0];
+    }
+
+    // No direct hit: strip known endings and collect the distinct candidate bases.
+    $bases = array();
+    if (is_array($case)) {
+        $wordLen = mb_strlen($word, "UTF-8");
+        foreach ($case as $rule) {
+            $ending = $rule[1];
+            $len = mb_strlen($ending, "UTF-8");
+            if (mb_substr($word, 0 - $len, null, "UTF-8") == $ending) {
+                $base = mb_substr($word, 0, $wordLen - $len, "UTF-8") . $rule[0];
+                if ($base != $word) {
+                    // array keys de-duplicate bases produced by several endings
+                    $bases[$base] = 1;
+                }
+            }
+        }
+    }
+
+    // Keep the weight of the most frequent base.
+    $base_weight = 0;
+    foreach ($bases as $base => $unused) {
+        $stmt->execute(array($base));
+        $hit = $stmt->fetch(PDO::FETCH_NUM);
+        if ($hit && $hit[0] > $base_weight) {
+            $base_weight = $hit[0];
+        }
+    }
+    return $base_weight;
+}
+
+/*
+Check whether a single word is known to the dictionary and return a confidence value.
+
+$word     word to look up
+$adj_len  correction added to the word length before the confidence is computed
+
+The weight comes from the global $part cache, or from dict_lookup() on a cache miss
+(the result is then stored in $part). The confidence is cached in the global
+$confidence array keyed by the word only, so $adj_len affects just the first
+computation for a given word. Each call increments the global $auto_split_times.
+
+Returns the confidence (a float rounded to 9 decimals) when the weight is positive,
+otherwise -1.
+ */
+function isExsit($word, $adj_len = 0)
+{
+    global $auto_split_times;
+    global $result;
+    global $part;
+    global $confidence;
+
+    $auto_split_times++;
+    $debug = isset($_POST["debug"]);
+    $key = "{$word}";
+
+    if ($debug) {
+        echo "<div>正在查询:{$word}</div>";
+    }
+
+    // Resolve the weight: cached value first, database otherwise.
+    if (isset($part[$key])) {
+        $weight = $part[$key];
+    } else {
+        $weight = dict_lookup($word);
+        // remember the lookup result, including misses
+        $part[$key] = $weight;
+        if ($weight > 0 && $debug) {
+            echo "查到:{$word}:{$weight}个\n";
+        }
+    }
+
+    if (!($weight > 0)) {
+        return (-1);
+    }
+
+    if (isset($confidence[$key])) {
+        return ($confidence[$key]);
+    }
+
+    // Confidence formula: grows towards 1 with both the weight and the (adjusted) word length.
+    $count = $weight + 1;
+    $len = mb_strlen($word, "UTF-8") + $adj_len;
+    $exponent = pow(($len - 0.5), 1.2);
+    $base = 1 / (1.1 + pow($count, 1.18));
+    $cf = round(1 / (1 + 640 * pow($base, $exponent)), 9);
+
+    $confidence[$key] = $cf;
+    if ($debug) {
+        echo "信心指数:{$word}:{$cf}\n";
+    }
+    return ($cf);
+}
+
+/*
+Core splitting routine. Recursively cuts $strWord into parts that isExsit() recognises
+and records every candidate split in the global $result array
+(key = parts joined by "+", value = score). Returns nothing.
+
+$strWord         word (or remaining fragment) to split
+$deep            current recursion depth; also the index written in the global $path buffer
+$express         when true, stop trying further sandhi rules at a cut position after the
+                 first accepted one (the scan over cut positions itself continues)
+$adj_len         length correction passed on to isExsit()
+$c_threshhold    a cut is accepted only when the confidence of the looked-up part exceeds this
+$w_threshhold    a finished candidate is stored only when its overall score exceeds this
+$forward         true: look up the left piece of each cut and recurse on the right piece;
+                 false: look up the right piece, recurse on the left piece, and store
+                 results through word_reverse()
+$sandhi_advance  when false, rules flagged "advance" in the global $sandhi table are skipped
+
+NOTE(review): each accepted cut stores the rule's "adj_len" at index 3 of its $output entry,
+but the recursive call passes the caller's $adj_len - confirm whether $part[3] was intended.
+ */
+
+function mySplit2($strWord, $deep = 0, $express = false, $adj_len = 0, $c_threshhold = 0.8, $w_threshhold = 0.8, $forward = true, $sandhi_advance = false)
+{
+    global $path;
+    global $result;
+    global $sandhi;
+    $output = array();
+
+    // Maximum search depth reached: store what has been collected so far (no threshold check) and return
+    if ($deep >= 16) {
+        $word = "";
+        $cf = 1.0;
+        for ($i = 0; $i < $deep; $i++) {
+            if (!empty($path[$i][0])) {
+                $word .= $path[$i][0] . "+";
+                if (isset($_POST["debug"])) {
+                    $word .= "(" . $path[$i][1] . ")-";
+                }
+                $cf = $cf * $path[$i][1];
+            }
+        }
+        $len = pow(mb_strlen($strWord, "UTF-8"), 3);
+        $cf += (0 - $len) / ($len + 150); // penalty for the unsplit remainder, grows with its length cubed
+        $word .= "{$strWord}";
+        if ($forward == true) {
+            $result[$word] = $cf;
+        } else {
+            $reverseWord = word_reverse($word);
+            $result[$reverseWord] = $cf;
+        }
+        return;
+    }
+    // Direct hit: the whole fragment is known as-is, or in its bracketed form "[fragment]"
+    $confidence = isExsit($strWord, $adj_len);
+    if ($confidence >= 0) {
+        $output[] = array($strWord, "", $confidence);
+    } else {
+        $confidence = isExsit("[" . $strWord . "]");
+        if ($confidence >= 0) {
+            $output[] = array("[" . $strWord . "]", "", $confidence);
+        }
+    }
+
+    // If the fragment starts with a doubled consonant, drop the first one: no Pali word begins with a doubled consonant. NOTE(review): mb_strpos() also matches cross pairs inside the list such as "kg" or "td" - confirm that is harmless.
+    $doubleword = "kkggccjjṭṭḍḍttddppbb";
+    if (mb_strlen($strWord, "UTF-8") > 2) {
+        $left2 = mb_substr($strWord, 0, 2, "UTF-8");
+        if (mb_strpos($doubleword, $left2, 0, "UTF-8") !== false) {
+            $strWord = mb_substr($strWord, 1, null, "UTF-8");
+        }
+    }
+
+    $len = mb_strlen($strWord, "UTF-8");
+    if ($len > 2) {
+        if ($forward) {
+            # Forward cut: try cut positions from the end of the fragment towards its start
+            for ($i = $len; $i > 1; $i--) {
+                foreach ($sandhi as $key => $row) {
+                    if ($sandhi_advance == false && $row["advance"] == true) {
+                        continue;
+                    }
+                    if (mb_substr($strWord, $i - $row["len"], $row["len"], "UTF-8") == $row["c"]) {
+                        $str1 = mb_substr($strWord, 0, $i - $row["len"], "UTF-8") . $row["a"];
+                        $str2 = $row["b"] . mb_substr($strWord, $i, null, "UTF-8");
+                        $confidence = isExsit($str1, $adj_len);
+                        if ($row["advance"] == true) {
+                            $confidence = $confidence * 0.99;
+                        }
+                        if ($confidence > $c_threshhold) {
+                            $output[] = array($str1, $str2, $confidence, $row["adj_len"]);
+                            if (isset($_POST["debug"])) {
+                                echo "插入:{$str1}\n";
+                            }
+                            if ($express) {
+                                break;
+                            }
+                        }
+
+                    }
+                }
+            }
+        } else {
+            # Backward cut: try cut positions from the start of the fragment towards its end
+            for ($i = 1; $i < $len - 1; $i++) {
+                foreach ($sandhi as $key => $row) {
+                    if ($sandhi_advance == false && $row["advance"] == true) {
+                        continue;
+                    }
+                    if (mb_substr($strWord, $i, $row["len"], "UTF-8") == $row["c"]) {
+                        $str1 = mb_substr($strWord, 0, $i, "UTF-8") . $row["a"];
+                        $str2 = $row["b"] . mb_substr($strWord, $i + $row["len"], null, "UTF-8");
+                        $confidence = isExsit($str2, $adj_len);
+                        if ($row["advance"] == true) {
+                            $confidence = $confidence * 0.99;
+                        }
+                        if ($confidence > $c_threshhold) {
+                            $output[] = array($str2, $str1, $confidence, $row["adj_len"]);
+                            if (isset($_POST["debug"])) {
+                                echo "插入:{$str2}\n";
+                            }
+                            if ($express) {
+                                break;
+                            }
+                        }
+
+                    }
+                }
+            }
+        }
+
+    }
+
+    if (count($output) > 0) {
+        foreach ($output as $part) {
+            $checked = $part[0];
+            $remainder = $part[1];
+
+            $path[$deep][0] = $checked;
+            $path[$deep][1] = $part[2];
+            if (empty($remainder)) {
+                # Nothing left to cut: assemble the candidate from $path and score it
+                $word = "";
+                $cf = 1.0;
+                for ($i = 0; $i < $deep; $i++) {
+                    $word .= $path[$i][0];
+                    if (isset($_POST["debug"])) {
+                        $word .= "(" . $path[$i][1] . ")";
+                    }
+                    $word .= "+";
+                    $cf = $cf * $path[$i][1];
+                }
+
+                if (isset($_POST["debug"])) {
+                    $word .= $checked . "({$part[2]})";
+                } else {
+                    $word .= $checked;
+                }
+                $cf = $cf + $part[2] * 0.1; // score = product of earlier confidences + 10% of the last part's confidence
+                if ($cf > $w_threshhold) {
+                    if ($forward == true) {
+                        $result[$word] = $cf;
+                    } else {
+                        $reverseWord = word_reverse($word);
+                        $result[$reverseWord] = $cf;
+                    }
+                }
+            } else {
+                # Keep cutting the remainder one level deeper
+                mySplit2($remainder, ($deep + 1), $express, $adj_len, $c_threshhold, $w_threshhold, $forward, $sandhi_advance);
+            }
+        }
+    } else {
+        # The tail could not be found: append it unsplit and apply a length penalty
+        $word = "";
+        $cf = 1.0;
+        for ($i = 0; $i < $deep; $i++) {
+            $word .= $path[$i][0];
+            if (isset($_POST["debug"])) {
+                $word .= "(" . $path[$i][1] . ")";
+            }
+            $word .= "+";
+            $cf = $cf * $path[$i][1];
+        }
+        $len = pow(mb_strlen($strWord, "UTF-8"), 3);
+        if ($forward) {
+            $cf += (0 - $len) / ($len + 150);
+        } else {
+            $cf += (0 - $len) / ($len + 5); // smaller denominator constant: the backward pass penalises an unknown tail more heavily
+        }
+        if (isset($_POST["debug"])) {
+            $word .= $strWord . "(0)";
+        } else {
+            $word .= $strWord;
+        }
+
+        if ($cf > $w_threshhold) {
+            if ($forward == true) {
+                $result[$word] = $cf;
+            } else {
+                $reverseWord = word_reverse($word);
+                $result[$reverseWord] = $cf;
+            }
+        }
+    }
+}
+
+/**
+ * Reverse the order of the "+"-separated parts of a split string,
+ * e.g. "c+b+a" becomes "a+b+c". A string without "+" is returned unchanged.
+ *
+ * @param string $word parts joined by "+"
+ * @return string the same parts in reverse order, joined by "+"
+ */
+function word_reverse($word)
+{
+    $pieces = explode("+", $word);
+    return implode("+", array_reverse($pieces));
+}

+ 9 - 11
app/redis/function.php

@@ -1,13 +1,11 @@
 <?php
-function redis_connect(){
-	return false;
-	$redis = new redis();  
-	$r_conn = $redis->connect('127.0.0.1', 6379);  
-	if($r_conn){
-		return $redis;
-	}
-	else{
-		return false;
-	}
+/**
+ * Connect to the Redis server on 127.0.0.1:6379.
+ *
+ * Callers treat a false return as "Redis unavailable", so every failure mode
+ * (extension not loaded, connection refused, exception from the client) is
+ * mapped to false rather than aborting the request.
+ *
+ * @return Redis|false connected client, or false when Redis cannot be used
+ */
+function redis_connect()
+{
+    // Without the phpredis extension `new Redis()` would be a fatal error.
+    if (!class_exists('Redis')) {
+        return false;
+    }
+    try {
+        $redis = new Redis();
+        if ($redis->connect('127.0.0.1', 6379)) {
+            return $redis;
+        }
+    } catch (Exception $e) {
+        // Some phpredis versions throw RedisException on a failed connect instead of returning false.
+    }
+    return false;
 }
-?>