<?php
/**
 * comp_csv.php
 *
 * Command-line script. Walks the Redis hash "pali://wordindex.hash" by
 * numeric field index, passes each word through split_diphthong() and
 * mySplit2() (both defined outside this script) and appends the candidate
 * splits as CSV rows to comp.csv. Word parts for which no candidate was
 * produced are appended to comp_fail.txt.
 *
 * Usage: php comp_csv.php [start [end]]
 *   start  first hash index to process (default 1)
 *   end    last hash index to process (default 1000000)
 */
require_once "../config.php";
require_once "../dict/turbo_split.php";
require_once "../redis/function.php";

$start = isset($argv[1]) ? (int)$argv[1] : 1;
$end = isset($argv[2]) ? (int)$argv[2] : 1000000;

// $result is reset before every mySplit2() call and read afterwards; the
// original comment describes it as the global output container of the
// recursive splitter.
global $result;

$myfile = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp.csv", "a");
$filefail = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp_fail.txt", "a");
if ($myfile === false || $filefail === false) {
    // Without this guard a failed fopen() would only surface later, inside
    // fputcsv()/fwrite(), as a warning or TypeError per row.
    echo "can not open output file\n";
    exit(1);
}

// The result loop below stops once its counter exceeds this value, so the
// three best candidates are written.
$iMax = 2;

/*
 * The word list is taken from the wordindex hash in Redis rather than from
 * the full Pali word table, because special words from the grammar books
 * have to be excluded.
 */
$redis = redis_connect();
if ($redis == false) {
    echo "no redis connect\n";
    exit;
}

// Not used in the visible part of the script; kept in case later code
// relies on it.
$i = null;

// NOTE(review): $_POST is normally empty for a CLI run, so the debug output
// below is presumably never printed - confirm whether it is still wanted.
$debug = isset($_POST["debug"]);

// Builds one 14-column CSV row; only the index, the word, the split string
// and the confidence value vary between rows.
$compRow = static function ($index, $word, $parts, $confidence) {
    return [$index, $word, '.comp.', '', '', '', '', $parts, '', 1, $confidence, 6, 'comp', 'en'];
};

while (true) {
    $word = $redis->hGet("pali://wordindex.hash", $start);
    // Stop at the first missing/empty entry. The test is explicit so that a
    // stored value such as "0" does not end the run by being falsy.
    if ($word === false || $word === null || $word === '') {
        break;
    }

    if ($start > $end) {
        echo "all done";
        exit;
    }

    $arrword = split_diphthong($word);
    if (count($arrword) > 1) {
        // Record the first-stage split itself with a fixed confidence of 50.
        fputcsv($myfile, $compRow($start, $word, implode("+", $arrword), 50));
    }

    foreach ($arrword as $oneword) {
        $result = []; // global output container filled by mySplit2()
        $min_result = 1;
        if (mb_strlen($oneword) > 35) {
            mySplit2($oneword, 0, true, 0.8, 0.9, 0, true, false);
        } else {
            mySplit2($oneword, 0, false, 0.8, 0.9, 0, true, false);
            $min_result = 3;
        }

        if (count($result) < $min_result) {
            // Too few candidates: retry with different thresholds (the debug
            // label below reads "forward split").
            mySplit2($oneword, 0, false, 0.2, 0.8, 0, true, true);
            if ($debug) {
                echo "正切:" . count($result) . "\n";
            }
            if (count($result) < 2) {
                // Still too few: retry once more with the seventh argument
                // flipped (the debug label below reads "reverse split").
                mySplit2($oneword, 0, false, 0.2, 0.8, 0, false, true);
                if ($debug) {
                    echo "反切:" . count($result) . "\n";
                }
            }
        }

        /*
        Earlier strategy, kept for reference:
        # forward split
        mySplit2($oneword, 0, false);
        if (count($result) == 0) {
        # if nothing was found, reverse split
        mySplit2($oneword, 0, false, 0, 0.8, 0.8, true);
        }
        */

        echo "{$start}-{$oneword}:" . count($result) . "\n";

        if (count($result) > 0) {
            arsort($result); // sort by confidence, highest first
            $iCount = 0;
            foreach ($result as $row => $value) {
                $confidence = round($value * 70);
                fputcsv($myfile, $compRow($start, $oneword, $row, $confidence));

                // Post-processing: split2() is applied up to twice, and each
                // pass that changes the string is written as an extra row.
                $new = split2($row);
                if ($new !== $row) {
                    fputcsv($myfile, $compRow($start, $oneword, $new, $confidence));
                    $new2 = split2($new);
                    if ($new2 !== $new) {
                        fputcsv($myfile, $compRow($start, $oneword, $new2, $confidence));
                    }
                }

                $iCount++;
                if ($iCount > $iMax) {
                    break;
                }
            }
        } else {
            fwrite($filefail, $oneword . "\n");
        }
    }

    $start++;
}