| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
<?php
/*
 * Batch job: split the words of the redis word index into components and
 * append the candidates to CSV.
 *
 * Usage: php <script> [start] [end]
 *   start  first word-index id to process (default 1)
 *   end    last word-index id to process, inclusive (default 1000000)
 *
 * Output (both opened in append mode under _DIR_TEMP_DICT_TEXT_):
 *   comp.csv       one row per split candidate
 *   comp_fail.txt  one line per word part for which no candidate was found
 *
 * The job stops at the first id that has no entry in the index, or once the
 * id passes `end`.
 */
require_once "../config.php";
require_once "../dict/turbo_split.php";
require_once "../redis/function.php";

$start = isset($argv[1]) ? (int)$argv[1] : 1;
$end = isset($argv[2]) ? (int)$argv[2] : 1000000;

// Per the original note, $result is the global output container that the
// recursive splitter (mySplit2) fills in; it is reset before every word part.
global $result;

// Highest zero-based candidate index to emit, i.e. the top three results.
$iMax = 2;

/*
 * The words are read from the redis word index rather than from the full
 * Pali word list, because special words coming from grammar books have to
 * be excluded.
 */
$redis = redis_connect();
if (!$redis) {
    echo "no redis connect\n";
    exit(1);
}

$myfile = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp.csv", "a");
$filefail = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp_fail.txt", "a");
if ($myfile === false || $filefail === false) {
    // Without both handles every later fputcsv()/fwrite() would fail.
    echo "cannot open output files\n";
    exit(1);
}

// Appends one row to comp.csv. Only the id, the word, the split text and the
// confidence vary between rows; every other column is a fixed value.
// NOTE(review): the column meanings are defined by whatever consumes
// comp.csv and are not visible in this script.
$writeRow = function ($id, $word, $parts, $confidence) use ($myfile) {
    fputcsv(
        $myfile,
        array($id, $word, '.comp.', '', '', '', '', $parts, '', 1, $confidence, 6, 'comp', 'en')
    );
};

while ($start <= $end) {
    $word = $redis->hGet("pali://wordindex.hash", $start);
    if ($word === false || $word === null) {
        // No entry under this id: the index is exhausted.
        break;
    }
    if ($word === '') {
        // Nothing to split; move on instead of ending the whole run.
        $start++;
        continue;
    }

    $arrword = split_diphthong($word);
    if (count($arrword) > 1) {
        // Record the diphthong split itself with a fixed confidence of 50.
        $writeRow($start, $word, implode("+", $arrword), 50);
    }

    foreach ($arrword as $oneword) {
        $result = array();

        // Words longer than 35 characters get a different third argument and
        // are satisfied with a single candidate; shorter ones ask for three.
        // NOTE(review): the meaning of the individual mySplit2() arguments
        // is defined in turbo_split.php — confirm there.
        if (mb_strlen($oneword) > 35) {
            $min_result = 1;
            mySplit2($oneword, 0, true, 0.8, 0.9, 0, true, false);
        } else {
            $min_result = 3;
            mySplit2($oneword, 0, false, 0.8, 0.9, 0, true, false);
        }

        if (count($result) < $min_result) {
            // Too few candidates: retry with looser thresholds. Judging by
            // the debug labels below, this pass is the forward split and the
            // next one the reverse split.
            mySplit2($oneword, 0, false, 0.2, 0.8, 0, true, true);
            // NOTE(review): $_POST is normally empty when run from the
            // command line, so this debug output presumably never fires
            // there — confirm whether a CLI switch was intended.
            if (isset($_POST["debug"])) {
                echo "正切:" . count($result) . "\n";
            }
            if (count($result) < 2) {
                mySplit2($oneword, 0, false, 0.2, 0.8, 0, false, true);
                if (isset($_POST["debug"])) {
                    echo "反切:" . count($result) . "\n";
                }
            }
        }

        echo "{$start}-{$oneword}:" . count($result) . "\n";

        if (count($result) === 0) {
            fwrite($filefail, $oneword . "\n");
            continue;
        }

        // Sort by confidence, highest first (keys are the split strings).
        arsort($result);
        $iCount = 0;
        foreach ($result as $row => $value) {
            $confidence = round($value * 70);
            $writeRow($start, $oneword, $row, $confidence);

            // Post-processing: split long parts without meaning further, at
            // most twice, emitting each refinement that changes the string.
            $current = $row;
            for ($pass = 0; $pass < 2; $pass++) {
                $refined = split2($current);
                if ($refined === $current) {
                    break;
                }
                $writeRow($start, $oneword, $refined, $confidence);
                $current = $refined;
            }

            $iCount++;
            if ($iCount > $iMax) {
                break;
            }
        }
    }

    $start++;
}

if ($start > $end) {
    echo "all done";
}

fclose($myfile);
fclose($filefail);
|