word_index_weight_refresh.php 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. <?php
  2. /*
  3. 计算单词权重
  4. */
  5. require_once __DIR__.'/../config.php';
  6. require_once __DIR__.'/word_index_weight_table.php';
  7. set_exception_handler(function($e){
  8. fwrite(STDERR,"error-msg:".$e->getMessage().PHP_EOL);
  9. fwrite(STDERR,"error-file:".$e->getFile().PHP_EOL);
  10. fwrite(STDERR,"error-line:".$e->getLine().PHP_EOL);
  11. exit;
  12. });
  13. define("__DB_WORD_INDEX__", _PG_DB_WORD_INDEX_);
  14. define("__TABLE_WORD_INDEX__", _PG_TABLE_WORD_INDEX_);
  15. define("__DB_PALI_INDEX__", _PG_DB_PALI_INDEX_);
  16. define("__TABLE_WORD__", _PG_TABLE_WORD_);
  17. if (isset($_GET["from"])) {
  18. $from = (int)$_GET["from"];
  19. $to = (int)$_GET["to"];
  20. } else {
  21. if ($argc != 3) {
  22. echo "无效的参数 ";
  23. exit;
  24. }
  25. $from = (int) $argv[1];
  26. $to = (int) $argv[2];
  27. if ($to > 217) {
  28. $to = 217;
  29. }
  30. }
  31. $dh_word = new PDO( __DB_WORD_INDEX__, _DB_USERNAME_, _DB_PASSWORD_);
  32. $dh_word->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
  33. $dh_pali = new PDO( __DB_PALI_INDEX__, _DB_USERNAME_, _DB_PASSWORD_);
  34. $dh_pali->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
  35. fwrite(STDOUT, "from=$from to = $to \n");
  36. for ($i = $from; $i <= $to; $i++) {
  37. $time_start = microtime(true);
  38. fwrite(STDOUT, "正在处理 book= $i ");
  39. $query = "SELECT max(paragraph) from ".__TABLE_WORD__." where book=?";
  40. try {
  41. //code...
  42. $stmt = $dh_pali->prepare($query);
  43. $stmt->execute(array($i));
  44. $row = $stmt->fetch(PDO::FETCH_NUM);
  45. }catch(PDOException $e){
  46. fwrite(STDERR,"error:".$e->getMessage());
  47. continue;
  48. }
  49. if ($row) {
  50. $max_para = $row[0];
  51. fwrite(STDOUT, " paragraph :$max_para ");
  52. for ($j = 0; $j <= $max_para; $j++) {
  53. # code...
  54. $query = "SELECT id,book,wordindex,bold from ".__TABLE_WORD__." where book={$i} and paragraph={$j} order by id ASC";
  55. $stmt = $dh_pali->query($query);
  56. $fetch = $stmt->fetchAll(PDO::FETCH_ASSOC);
  57. $query = "SELECT wordindex,count(*) as co from ".__TABLE_WORD__." where book={$i} and paragraph={$j} group by wordindex";
  58. $stmt = $dh_pali->query($query);
  59. $fetch_voc = $stmt->fetchAll(PDO::FETCH_ASSOC);
  60. $vocabulary = array();
  61. foreach ($fetch_voc as $key => $value) {
  62. $vocabulary[$value["wordindex"]] = $value["co"];
  63. }
  64. for ($iWord = 0; $iWord < count($fetch); $iWord++) {
  65. # 非黑体字
  66. if ($fetch[$iWord]["bold"] == 0) {
  67. $count = $vocabulary[$fetch[$iWord]["wordindex"]];
  68. $paraWeight = pow(1.01, $count); //总分
  69. if ($paraWeight > 1.9) {
  70. $paraWeight = 1.9;
  71. }
  72. $weight = $paraWeight / $count;
  73. } else {
  74. #黑体字
  75. #查找前后相连的黑体字
  76. $begin = $iWord;
  77. while ($fetch[$begin]["bold"] == 1) {
  78. $begin--;
  79. if ($begin < 0) {
  80. break;
  81. }
  82. }
  83. $begin = $begin + 1;
  84. $end = $iWord;
  85. while ($fetch[$end]["bold"] == 1) {
  86. $end++;
  87. if ($end > count($fetch) - 1) {
  88. break;
  89. }
  90. }
  91. $end = $end - 1;
  92. $bold_count = $end - $begin + 1;
  93. if ($bold_count == 1) {
  94. $query = "SELECT * from ".__TABLE_WORD_INDEX__." where id=" . $fetch[$iWord]["wordindex"];
  95. $stmt_word = $dh_word->query($query);
  96. $wordinfo = $stmt_word->fetch(PDO::FETCH_ASSOC);
  97. $bookId = (int) $fetch[$iWord]["book"];
  98. if (mb_substr($wordinfo["word"], -2) == "ti") {
  99. $weight = 100 + $book_weight[$bookId];
  100. } else {
  101. $weight = 100 + $book_weight[$bookId];
  102. }
  103. } else {
  104. #连续黑体字
  105. $len_sum = 0;
  106. $len_curr = 0;
  107. for ($iBold = $begin; $iBold <= $end; $iBold++) {
  108. # code...
  109. $boldid = $fetch[$iBold]["wordindex"];
  110. $query = "SELECT len from ".__TABLE_WORD_INDEX__." where id=" . $boldid;
  111. $stmt_bold = $dh_word->query($query);
  112. $wordbold = $stmt_bold->fetch(PDO::FETCH_ASSOC);
  113. $len_sum += $wordbold["len"];
  114. if ($iBold == $i) {
  115. $len_curr = $wordbold["len"];
  116. }
  117. }
  118. $weight = 10 + $len_curr / $len_sum;
  119. }
  120. }
  121. $fetch[$iWord]["weight"] = (int) ($weight * 100);
  122. }
  123. # 将整段权重写入据库
  124. $dh_pali->beginTransaction();
  125. $query = "UPDATE ".__TABLE_WORD__." set weight = ? , updated_at = now() where id=? ";
  126. $stmt_weight = $dh_pali->prepare($query);
  127. foreach ($fetch as $key => $value) {
  128. $stmt_weight->execute(array($value["weight"], $value["id"]));
  129. }
  130. $dh_pali->commit();
  131. if (!$stmt_weight || ($stmt_weight && $stmt_weight->errorCode() != 0)) {
  132. $error = $dh_pali->errorInfo();
  133. fwrite(STDERR, "error - $error[2]".PHP_EOL);
  134. }
  135. }
  136. } else {
  137. fwrite(STDERR, "无法获取段落最大值".PHP_EOL);
  138. }
  139. fwrite(STDOUT, "处理时间 :" . (microtime(true) - $time_start). "\n");
  140. }