comp_csv.php 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  <?php
  /**
   * Batch generator of compound-word split candidates.
   *
   * Iterates over the Redis set "pali_word", splits every word with the
   * project helpers split_diphthong(), mySplit2() and split2(), and appends
   * the candidates to comp.csv inside _DIR_TEMP_DICT_TEXT_. Words for which
   * no candidate was produced are appended to comp_fail.txt instead.
   *
   * Usage: php comp_csv.php [start [end]]
   *   start  counter value at which processing begins (default 0)
   *   end    counter value after which the script stops (default 1000000)
   */
  require_once "../path.php";
  require_once "../dict/turbo_split.php";
  require_once "../redis/function.php";

  $start = isset($argv[1]) ? (int)$argv[1] : 0;
  $end = isset($argv[2]) ? (int)$argv[2] : 1000000;

  // mySplit2() reports its candidates through this global (see the reset
  // before each word below).
  global $result;

  $myfile = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp.csv", "a");
  $filefail = fopen(_DIR_TEMP_DICT_TEXT_ . "/comp_fail.txt", "a");
  if ($myfile === false || $filefail === false) {
      // Without both handles every later fputcsv()/fwrite() would fail.
      echo "can not open output file\n";
      exit(1);
  }

  // Highest zero-based candidate index written per word, i.e. the top three
  // candidates are output.
  $iMax = 2;

  // NOTE: an earlier version read the words from the "wordindex" table via
  // PDO (_FILE_DB_WORD_INDEX_); the word list now comes from Redis.
  $redis = redis_connect();
  if ($redis == false) {
      echo "no redis connect\n";
      exit;
  }

  $i = null; // SSCAN cursor, updated by reference on every call
  $counter = 0;

  // Compare against false explicitly: in phpredis' default SCAN_NORETRY mode
  // sscan() may hand back an empty batch while the cursor is still non-zero,
  // and a plain truthiness test would end the scan before the set is exhausted.
  while (($words = $redis->sscan("pali_word", $i)) !== false) {
      if (!is_array($words)) {
          break;
      }
      if (count($words) === 0) {
          continue; // empty batch, the cursor has still advanced
      }

      if ($counter < $start) {
          // Skip ahead without processing. The step of 10 is only an
          // approximation of the batch size, so $start is not an exact
          // word offset.
          $counter += 10;
          continue;
      }
      if ($counter > $end) {
          echo "all done";
          break;
      }

      foreach ($words as $key => $word) {
          $arrword = split_diphthong($word);
          if (count($arrword) > 1) {
              // The word was split into several parts: record that split
              // itself with a fixed score of 50.
              $data = array($counter, $word, '.comp.', '', '', '', '', implode("+", $arrword), '', 1, 50, 6, 'comp', 'en');
              fputcsv($myfile, $data);
          }

          foreach ($arrword as $oneword) {
              $counter++;
              $result = array(); // global output container of the recursive splitter

              mySplit2($oneword, 0, true, 0.5, 0.9, 0, true, false);
              mySplit2($oneword, 0, true, 0.5, 0.9, 0, false, false);
              if (count($result) < 5) {
                  // Too few candidates: retry with the looser parameter set.
                  // Judging by the debug labels the 7th argument selects the
                  // forward (true) or reverse (false) split direction.
                  mySplit2($oneword, 0, false, 0.2, 0.8, 0, true, true);
                  if (isset($_POST["debug"])) {
                      echo "正切:" . count($result) . "\n";
                  }
                  if (count($result) < 2) {
                      mySplit2($oneword, 0, false, 0.2, 0.8, 0, false, true);
                      if (isset($_POST["debug"])) {
                          echo "反切:" . count($result) . "\n";
                      }
                  }
              }
              // NOTE: an older strategy (a single forward split, with a
              // reverse split only when nothing was found) was replaced by
              // the sequence above.

              echo "{$counter}-{$oneword}:" . count($result) . "\n";

              if (count($result) === 0) {
                  fwrite($filefail, $oneword . "\n");
                  continue;
              }

              arsort($result); // highest confidence first
              $iCount = 0;
              foreach ($result as $row => $value) {
                  $compConfidence = round($value * 70);

                  // Post-processing: split2() is applied up to twice to break
                  // up long parts further; each pass that changes the string
                  // adds one more row after the original candidate.
                  $compVariants = array($row);
                  $new = split2($row);
                  if ($new !== $row) {
                      $compVariants[] = $new;
                      $new2 = split2($new);
                      if ($new2 !== $new) {
                          $compVariants[] = $new2;
                      }
                  }
                  foreach ($compVariants as $compParts) {
                      $data = array($counter, $oneword, '.comp.', '', '', '', '', $compParts, '', 1, $compConfidence, 6, 'comp', 'en');
                      fputcsv($myfile, $data);
                  }

                  $iCount++;
                  if ($iCount > $iMax) {
                      break;
                  }
              }
          }
      }
  }

  fclose($myfile);
  fclose($filefail);