sql.php 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. <?php
  2. require_once __DIR__."/../config.php";
  3. /*
  4. * 该脚本用于生成 SQL 语句, 将三藏语料 CSV 数据 (如:abh01a.att.csv)
  5. * 转换为 SQL 语句插入到 PostgreSQL 内,数据表结构参见 fts.sql
  6. * 由于懒惰,没有优化脚本,占用了较多内存,所以执行时请多给 PHP 一些内存:
  7. * php -d memory_limit=1024M sql.php
  8. *
  9. */
  /**
   * Tell whether a string is made up exclusively of romanised Pali letters
   * (including the diacritic letters listed in the pattern) and hyphens.
   *
   * @param mixed $str Candidate word; anything that is not a string is rejected.
   * @return bool true when every character of $str belongs to the allowed set
   *              and $str is not empty, false otherwise.
   */
  function is_pali_word ($str) {
      if (!is_string($str)) {
          return false;
      }
      // The "u" modifier is required: without it PCRE works byte by byte, so the
      // multi-byte letters inside the class degrade into a set of loose bytes and
      // unrelated characters built from those bytes (e.g. "Á", "ë") are accepted.
      $pali_word_exp = "/^[āīūṅñṭḍṇḷṃṁŋĀĪŪṄÑṬḌṆḶṂṀŊabcdefghijklmnoprstuvyABCDEFGHIJKLMNOPRSTUVY-]+$/u";
      // preg_match() returns false for invalid UTF-8 input, which is treated as "not a Pali word".
      return preg_match($pali_word_exp, $str) === 1;
  }
  /*
   * Group consecutive bold words into runs and classify each run by its length.
   *
   * The input is a list with one entry per word of a paragraph: the word itself
   * when it is bold, or an empty value when it is not. Any entry that PHP's
   * empty() treats as empty ends the current run.
   *
   * Sample argument:
   *   ['a', '', '', 'b', 'c', 'd', '', '', 'e', 'f', '', 'g', 'h']
   *
   * Sample return value (keys appear in the order their first run is met):
   *
   *   Array
   *   (
   *       [bold_single]   => a
   *       [bold_multiple] => b c d
   *       [bold_double]   => e f , g h
   *   )
   *
   * Words inside a run are joined with a single space; several runs of the same
   * class are joined with ' , '. A class with no run is absent from the result.
   */
  function count_bld ($bld_array) {
      // Pass 1: cut the word list into runs of adjacent non-empty entries.
      $runs = [];
      $current_run = [];
      foreach ($bld_array as $word) {
          if (!empty($word)) {
              $current_run[] = $word;
              continue;
          }
          if (!empty($current_run)) {
              $runs[] = $current_run;
              $current_run = [];
          }
      }
      // A run that reaches the end of the list has no separator after it.
      if (!empty($current_run)) {
          $runs[] = $current_run;
      }

      // Pass 2: file every run under the key that matches its length.
      $grouped = [];
      foreach ($runs as $run) {
          $length = count($run);
          if ($length == 1) {
              $label = 'bold_single';
          } elseif ($length == 2) {
              $label = 'bold_double';
          } else {
              // Runs are never empty, so anything else has more than two words.
              $label = 'bold_multiple';
          }

          $text = implode(' ', $run);
          if (empty($grouped[$label])) {
              $grouped[$label] = $text;
          } else {
              $grouped[$label] = $grouped[$label] . ' , ' . $text;
          }
      }

      return $grouped;
  }
  // Build the PDO DSN from the constants provided by config.php and connect.
  $dns = _DB_ENGIN_.":host="._DB_HOST_.";port="._DB_PORT_.";dbname="._DB_NAME_.";user="._DB_USERNAME_.";password="._DB_PASSWORD_.";";
  $dbh_fts = new PDO($dns, _DB_USERNAME_, _DB_PASSWORD_, array(PDO::ATTR_PERSISTENT => true));
  $dbh_fts->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);

  /*
   * Insert one accumulated paragraph through the prepared INSERT statement.
   * The bold runs of the paragraph are classified with count_bld(); a class
   * with no run is stored as an empty string.
   * Returns the boolean result of PDOStatement::execute().
   */
  $insert_paragraph = function ($stmt, $book, $paragraph, $wid, $bold_text, $content) {
      $bold_result = count_bld($bold_text);
      return $stmt->execute(array(
          $book,
          $paragraph,
          $wid,
          isset($bold_result['bold_single']) ? $bold_result['bold_single'] : "",
          isset($bold_result['bold_double']) ? $bold_result['bold_double'] : "",
          isset($bold_result['bold_multiple']) ? $bold_result['bold_multiple'] : "",
          $content,
      ));
  };

  // Look for corpus data below tmp/palicsv/ : one folder per book, holding <folder>/<folder>.csv
  $palicsv_path = __DIR__.'/../../tmp/palicsv/';
  $scan = scandir($palicsv_path);
  if ($scan === false) {
      echo "error - cannot read directory $palicsv_path".PHP_EOL;
      $scan = array();
  }
  $fileCounter = 0;
  foreach($scan as $foldername) {
      if (!is_dir("$palicsv_path/$foldername")) {
          continue;
      }
      $csv_file = "$palicsv_path/$foldername/$foldername.csv";
      // DEBUG
      // if ($foldername != 'abh01m.mul') continue;
      if (!is_file($csv_file)) {
          continue;
      }
      $fileCounter++;
      echo "正在处理文件: $fileCounter" . PHP_EOL . $csv_file . PHP_EOL;
      // Holds the SQL generated for the current CSV file.
      // NOTE(review): nothing appends to this any more, so the .sql file written
      // below is always empty; kept only to preserve the script's side effects.
      $sql_from_csv = '';

      if (($handle = fopen($csv_file, "r")) !== FALSE) {
          // The book id is taken from column 2 of the first data row (second line),
          // with its first character stripped.
          fgetcsv($handle, 1000, ",");
          $first_data_row = fgetcsv($handle, 1000, ",");
          if (!is_array($first_data_row) || !isset($first_data_row[2])) {
              // Header-only or empty file: there is no book id, so do not touch the table.
              echo "error - no data row found, file skipped".PHP_EOL;
              fclose($handle);
          } else {
              $bookId = (int)mb_substr($first_data_row[2],1);
              // errorInfo() array of the first failure, or null while everything succeeded.
              $failure = null;

              // Delete and re-insert inside ONE transaction, so that a failed import
              // rolls back to the previous rows instead of leaving the book empty.
              $dbh_fts->beginTransaction();

              // Remove the rows previously stored for this book.
              $query = "DELETE FROM "._TABLE_FTS_." WHERE book=?";
              $stmt = $dbh_fts->prepare($query);
              if (!$stmt) {
                  $failure = $dbh_fts->errorInfo();
              } else if (!$stmt->execute(array($bookId))) {
                  $failure = $stmt->errorInfo();
              }

              $query = "INSERT INTO "._TABLE_FTS_." (book , paragraph , wid,bold_single,bold_double,bold_multiple,content) VALUES ( ? , ? , ? , ? , ? , ? , ? )";
              $stmt = $dbh_fts->prepare($query);
              if (!$stmt && $failure === null) {
                  $failure = $dbh_fts->errorInfo();
              }

              // State of the paragraph currently being accumulated.
              // Paragraph 0 means "none yet"; rows numbered 0 are never stored.
              $paragraph = 0;
              $bold_text = [];
              $content = '';
              $wid = '';
              $book = $bookId;

              rewind($handle);
              $row = 0;
              while ($failure === null && ($data = fgetcsv($handle, 1000, ",")) !== FALSE) {
                  // Skip the header line.
                  if ($row === 0) {
                      $row++;
                      continue;
                  }
                  // Short or blank lines simply yield empty values here and are then
                  // rejected by is_pali_word() below.
                  $current_word = isset($data[5]) ? $data[5] : '';
                  $style = isset($data[15]) ? $data[15] : '';
                  if ($style === 'paranum') {
                      // Paragraph-number token: keep the value of column 4 instead.
                      $current_word = $data[4];
                  } else if (!is_pali_word($current_word)) {
                      /*
                       * Tokens that are not Pali words are ignored as if absent
                       * (and, as before, not counted in $row).
                       * TODO this may be inappropriate, e.g. for "bld1 - bld2":
                       * should bld1 and bld2 be treated as separate bold runs?
                       */
                      continue;
                  }

                  // Compare as integers: a loose int/string comparison gives
                  // different answers on PHP 7 and PHP 8 for empty or non-numeric values.
                  $row_paragraph = (int)$data[3];
                  if ($row_paragraph === $paragraph) {
                      // Same paragraph: append the word, separated by a space.
                      $content .= ' ' . $current_word;
                      // wid keeps the last non-empty value. TODO (may not be appropriate)
                      $wid = empty($data[1]) ? $wid : $data[1];
                  } else {
                      // Paragraph changed: store the finished one first, if there is one.
                      if ($paragraph !== 0 && !$insert_paragraph($stmt, $book, $paragraph, $wid, $bold_text, $content)) {
                          $failure = $stmt->errorInfo();
                      }
                      // Start accumulating the new paragraph.
                      $bold_text = [];
                      $content = $current_word;
                      $paragraph = $row_paragraph;
                      $book = (int)mb_substr($data[2],1);
                      $wid = $data[1];
                  }
                  array_push($bold_text, $style === 'bld' ? $current_word : '');
                  $row++;
              }
              // The loop only stores a paragraph when the NEXT one begins, so the
              // final paragraph of the file has to be flushed here.
              if ($failure === null && $paragraph !== 0 && !$insert_paragraph($stmt, $book, $paragraph, $wid, $bold_text, $content)) {
                  $failure = $stmt->errorInfo();
              }
              fclose($handle);

              // Commit only when every statement succeeded; otherwise undo the file.
              if ($failure === null && !$dbh_fts->commit()) {
                  $failure = $dbh_fts->errorInfo();
              }
              if ($failure !== null) {
                  if ($dbh_fts->inTransaction()) {
                      $dbh_fts->rollBack();
                  }
                  echo "error - $failure[2]".PHP_EOL;
              } else {
                  echo "updata $row recorders.".PHP_EOL;
              }
          }
      }
      file_put_contents("./sql/$foldername.sql", $sql_from_csv);
      // DEBUG: process a single file only, for testing
      // exit;
  }
  echo "Done. Amitābha \n";
  175. ?>