db_insert_sentence.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. <!--句子库生成-->
  <?php
  // Project-local installer header.
  require_once "install_head.php";
  ?>
  <!DOCTYPE html>
  <html>
  <head>
  </head>
  <body>
  <h2>Insert to Sentence DB</h2>
  <p><a href="index.php">Home</a></p>
  <?php
  // Both files are hard requirements: the script calls PDO_Connect() and reads
  // several configuration constants further down, so fail right here with a
  // clear error instead of continuing after a mere include warning.
  require_once "./_pdo.php";
  require_once '../config.php';

  $db_file = _FILE_DB_PALI_SENTENCE_;

  // Path of this script relative to its own directory ("./<name>.php"),
  // used as the form target.
  $thisfile = '.' . mb_substr(__FILE__, mb_strlen(__DIR__));

  // Without a "from" parameter there is nothing to process yet: show the
  // range form, close the document properly and stop.
  if (!isset($_GET["from"])) {
  ?>
  <form action="<?php echo $thisfile; ?>" method="get">
  From: <input type="text" value="0" name="from"><br>
  To: <input type="text" value="216" name="to"><br>
  <input type="submit">
  </form>
  </body>
  </html>
  <?php
      return;
  }
  /**
   * Wrap a word in the inline markup that corresponds to its style code.
   *
   * - 'bld':     bold. A word without "{" is wrapped whole in <b>…</b> and a
   *              single space is appended; otherwise every "{" becomes <b>
   *              and every "}" becomes </b> (no space appended).
   * - 'note':    wrapped in <n>…</n>.
   * - 'paranum': wrapped in <paranum>…</paranum>.
   * - any other style: the word is returned unchanged.
   *
   * @param string $word  Word text.
   * @param string $style Style code of the word.
   * @return string Word with markup applied.
   */
  function wordStyle($word, $style)
  {
      if ($style == 'bld') {
          $hasBraces = mb_strpos($word, '{', 0, "UTF-8") !== false;
          if ($hasBraces) {
              // Braces mark the bold part inside the word.
              return str_replace(array("{", "}"), array("<b>", "</b>"), $word);
          }
          return "<b>" . $word . "</b> ";
      }

      if ($style == 'note') {
          return "<n>" . $word . "</n>";
      }

      if ($style == 'paranum') {
          return "<paranum>" . $word . "</paranum>";
      }

      return $word;
  }
  // ---------------------------------------------------------------------------
  // Request parameters and per-file setup.
  //
  // "from" and "to" are row indexes into filelist.csv. This request processes
  // the file at index $from; the end of the script redirects to $from + 1
  // until $to is reached.
  // ---------------------------------------------------------------------------

  // Force both parameters to integers: they are echoed into the page, used as
  // array indexes and written into the redirect URL.
  $from = (int) $_GET["from"];
  $to = isset($_GET["to"]) ? (int) $_GET["to"] : $from;

  $filelist = array();
  $fileNums = 0;
  $log = "";
  echo "<h2>$from-$to</h2>";

  // Load the list of files; columns 0 and 1 of each row are used by this script
  // (column 1 is the base file name).
  if (($handle = fopen("filelist.csv", 'r')) !== false) {
      while (($row = fgetcsv($handle, 0, ',')) !== false) {
          $filelist[$fileNums] = $row;
          $fileNums++;
      }
      fclose($handle);
  }

  // Never run past the last entry of the list.
  if ($to >= $fileNums) {
      $to = $fileNums - 1;
  }

  // Nothing sensible can be done for an index outside the list (this also
  // covers an unreadable or empty filelist.csv).
  if ($from < 0 || $from >= $fileNums) {
      echo "invalid file index: $from<br>";
      echo "</body></html>";
      return;
  }

  $outputFileNameHead = $filelist[$from][1];
  $FileName = $outputFileNameHead . ".htm";

  // The word CSV is read from <_DIR_PALI_CSV_>/<name>/<name>.csv.
  $dirXmlBase = _DIR_PALI_CSV_ . "/";
  $dirXml = $outputFileNameHead . "/";

  $xmlfile = $FileName;
  echo "doing:" . $xmlfile . "<br>";
  $log = $log . "$from,$FileName,open\r\n";
  /**
   * Replace the lower-case Pali diacritic letters with their plain ASCII base
   * letters (ā→a, ī→i, ū→u, ṅ/ñ/ṇ→n, ṭ→t, ḍ→d, ḷ→l, ṃ→m).
   * All other characters are left untouched.
   *
   * @param string $strIn Text to convert.
   * @return string Converted text.
   */
  function getWordEn($strIn)
  {
      $asciiFor = array(
          'ā' => 'a',
          'ī' => 'i',
          'ū' => 'u',
          'ṅ' => 'n',
          'ñ' => 'n',
          'ṭ' => 't',
          'ḍ' => 'd',
          'ṇ' => 'n',
          'ḷ' => 'l',
          'ṃ' => 'm',
      );

      return str_replace(array_keys($asciiFor), array_values($asciiFor), $strIn);
  }
  // ---------------------------------------------------------------------------
  // Open the word-level CSV of the current file and split it into sentences.
  //
  // Column indexes used below (0-based, following the CSV column list
  // id,wid,book,paragraph,word,real,type,...,style,vri,...):
  //   1 wid, 2 book, 3 paragraph, 4 word, 5 real, 6 type, 15 style, 16 vri
  // The scan starts at row 1, so row 0 is presumably the header row.
  //
  // Every finished sentence is appended to $arrSent in the column order of the
  // INSERT statement:
  //   book, paragraph, begin, end, length, count, text, html, real, real_en
  // ---------------------------------------------------------------------------

  /**
   * Build one $arrSent row from the accumulated sentence state.
   *
   * "length" is the UTF-8 character count of the trimmed real text. real_en is
   * derived from the untrimmed accumulator, so it may carry a leading space.
   *
   * @return array Values in INSERT column order.
   */
  function buildSentenceRow($book, $paragraph, $begin, $end, $wordcount, $sent, $sent_html, $sent_real)
  {
      return array(
          $book,
          $paragraph,
          $begin,
          $end,
          mb_strlen(trim($sent_real), "UTF-8"),
          $wordcount,
          $sent,
          $sent_html,
          trim($sent_real),
          getWordEn($sent_real),
      );
  }

  $wordlist = array();
  $arrSent = array();
  $book = 0;
  $csvFile = $dirXmlBase . $dirXml . $outputFileNameHead . ".csv";

  if (($fp = fopen($csvFile, "r")) !== false) {
      while (($data = fgetcsv($fp)) !== false) {
          if ($data === array(null)) {
              // fgetcsv() reports an empty line as array(null); skip it.
              continue;
          }
          $wordlist[] = $data;
          // The book id is the "book" column without its first character.
          // Keep looking until a row yields a non-zero number: the header row
          // gives "ook", which must not be accepted. The integer cast keeps
          // this working on PHP 8, where "ook" == 0 is no longer true.
          if ((int) $book === 0 && isset($data[2])) {
              $book = substr($data[2], 1);
          }
      }
      fclose($fp);

      $rowCount = count($wordlist);
      // Stand-in for "the row after the last one": every column is empty.
      $blankRow = array_fill(0, 23, "");

      $iCurrPara = 0;   // paragraph number of the sentence being built
      $sent = "";       // display text of the sentence being built
      $sent_html = "";  // same text with style markup
      $sent_real = "";  // "real" word forms, each preceded by a space
      $wordcount = 0;   // number of words counted into $sent_real
      $begin = 1;       // vri value of the first word of the sentence
      $end = 1;         // vri value of the last word of the sentence
      $Note_Mark1 = 0;  // 1 while inside ( ... )
      $Note_Mark2 = 0;  // 1 while inside [ ... ]
      $Note_Mark = 0;   // $Note_Mark1 + $Note_Mark2

      for ($i = 1; $i < $rowCount; $i++) {
          // Refresh the three-row window on EVERY row, including the last one.
          $pre = $wordlist[$i - 1];
          $curr = $wordlist[$i];
          $hasNext = $i < $rowCount - 1;
          $next = $hasNext ? $wordlist[$i + 1] : $blankRow;

          if ($curr[3] > $iCurrPara) {
              // A new paragraph starts with this row.
              if ($i > 1) {
                  // Close the sentence left open by the previous paragraph.
                  if (strlen(trim($sent)) > 0) {
                      $end = $pre[16];
                      $arrSent[] = buildSentenceRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_html, $sent_real);
                  }
                  $iCurrPara = $curr[3];

                  // Bracket state never carries over into a new paragraph.
                  $Note_Mark1 = ($next[4] == "(" || $curr[4] == "(") ? 1 : 0;
                  $Note_Mark2 = ($next[4] == "[" || $curr[4] == "[") ? 1 : 0;
                  $Note_Mark = $Note_Mark1 + $Note_Mark2;

                  // Start the first sentence of the new paragraph with this row.
                  if ($curr[6] != ".ctl.") {
                      $sent = $curr[4] . " ";
                      $sent_html = wordStyle($curr[4], $curr[15]) . " ";
                      if ($curr[5] == '"') {
                          $sent_real = "";
                      } else {
                          $sent_real = $curr[5];
                      }
                      $wordcount = 1;
                  } else {
                      $sent = "";
                      $sent_html = "";
                      $sent_real = "";
                      $wordcount = 0;
                  }
                  $begin = $curr[16];
                  continue;
              }
              // Very first data row: only record its paragraph number and let
              // the regular handling below add the word exactly once.
              $iCurrPara = $curr[3];
          }

          // Track whether we are inside a ( ... ) or [ ... ] note.
          if ($next[4] == "(") {
              $Note_Mark1 = 1;
          } else if ($pre[4] == ")" && $Note_Mark1 == 1) {
              $Note_Mark1 = 0;
          }
          if ($next[4] == "[") {
              $Note_Mark2 = 1;
          } else if ($pre[4] == "]" && $Note_Mark2 == 1) {
              $Note_Mark2 = 0;
          }
          $Note_Mark = $Note_Mark1 + $Note_Mark2;

          // Sentence-end detection needs a following row; the sentence still
          // open after the last row is closed once the loop has finished.
          $isEndOfSen = false;
          if ($hasNext && ($curr[15] != "note" || mb_substr($curr[1], 0, 5, "UTF-8") != "gatha")) {
              if ($curr[4] == "." && !is_numeric($pre[4]) && $next[3] == $iCurrPara && $Note_Mark === 0) {
                  // Full stop outside a note, not directly after a number, and
                  // the following row is still in the same paragraph.
                  if ($next[4] != "(") {
                      $isEndOfSen = true;
                  }
              } else if ($curr[4] == "–" && $next[4] == "‘" && $Note_Mark === 0) {
                  // Dash directly followed by an opening quotation mark.
                  $isEndOfSen = true;
              } else if ($Note_Mark == 0) {
                  // Sentence ends with "!", "?" or ";".
                  if ($curr[4] == "!") {
                      // Only the last "!" of a run ends the sentence.
                      if ($next[4] != "!") {
                          if ($next[4] != "(") {
                              $isEndOfSen = true;
                          }
                      }
                  } else if ($curr[4] == ";" || $curr[4] == "?") {
                      if ($next[4] != "(") {
                          $isEndOfSen = true;
                      }
                  }
              }
          }

          if ($curr[6] != ".ctl.") {
              if ($next[5] != "") {
                  // The following row has a "real" form, so a separating
                  // space is added after this one.
                  $sent .= $curr[4] . " ";
                  $sent_html .= wordStyle($curr[4], $curr[15]) . " ";
              } else {
                  // The following row has no "real" form (or there is no
                  // following row): no space is added.
                  $sent .= $curr[4];
                  $sent_html .= wordStyle($curr[4], $curr[15]);
              }
              // Count the word unless it is note content. A word directly
              // before an opening bracket is still counted.
              if ($curr[5] != '' && ($Note_Mark == 0 || ($Note_Mark == 1 && ($next[4] == "[" || $next[4] == "(")))) {
                  $wordcount++;
                  if ($curr[5] == "iti") {
                      // When the real form is "iti" the displayed form is kept.
                      $sent_real .= " " . $curr[4];
                  } else {
                      $sent_real .= " " . $curr[5];
                  }
              }
          }

          if ($isEndOfSen == true && strlen(trim($sent)) > 0) {
              $end = $curr[16];
              $arrSent[] = buildSentenceRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_html, $sent_real);
              $sent = "";
              $sent_html = "";
              $sent_real = "";
              $begin = $curr[16] + 1;
              $wordcount = 0;
          }
      }

      // Close whatever is still open after the last row.
      if (strlen(trim($sent)) > 0) {
          $end = $wordlist[$rowCount - 1][16];
          $arrSent[] = buildSentenceRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_html, $sent_real);
      }
  } else {
      echo "can not open csv file. filename=" . $csvFile;
  }
  // ---------------------------------------------------------------------------
  // Store all sentences of this file, write the log, then move to the next file.
  // ---------------------------------------------------------------------------

  // PDO_Connect() is a project helper; the code below relies on it making the
  // connection available as $PDO.
  PDO_Connect("$db_file");
  $query = "INSERT INTO "._TABLE_PALI_SENT_." (book , paragraph , begin , end , length , count , text , html , real , real_en ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )";

  // All rows go in within one transaction (autocommit is off while it is open).
  // A failure is detected on the statement that caused it and the whole file
  // is rolled back, so a later re-run cannot produce duplicate rows. Both PDO
  // error modes are handled: a false return value and a thrown PDOException.
  $insertError = null;
  try {
      $PDO->beginTransaction();
      $stmt = $PDO->prepare($query);
      if ($stmt === false) {
          $info = $PDO->errorInfo();
          $insertError = isset($info[2]) ? $info[2] : "prepare failed";
      } else {
          foreach ($arrSent as $oneParam) {
              if ($stmt->execute($oneParam) === false) {
                  $info = $stmt->errorInfo();
                  $insertError = isset($info[2]) ? $info[2] : "execute failed";
                  break;
              }
          }
      }
      if ($insertError === null) {
          // Commit the changes.
          $PDO->commit();
      } else {
          $PDO->rollBack();
      }
  } catch (PDOException $e) {
      if ($PDO->inTransaction()) {
          $PDO->rollBack();
      }
      $insertError = $e->getMessage();
  }

  if ($insertError !== null) {
      echo "error - $insertError <br>";
      $log = $log . "$from, $FileName, error, $insertError \r\n";
  } else {
      $count = count($arrSent);
      echo "updata $count recorders.";
  }

  // Append this run to the log file inside the log directory.
  $logFile = _DIR_LOG_ . "/insert_sent.log";
  if (file_put_contents($logFile, $log, FILE_APPEND) === false) {
      echo "can not write log file. filename=" . $logFile . "<br>";
  }

  // Chain to the next file. Both indexes are forced to integers because they
  // are written into inline JavaScript.
  $from = (int) $from;
  $to = (int) $to;
  if ($from >= $to) {
      echo "<h2>齐活!功德无量!all done!</h2>";
  } else {
      $nextIndex = $from + 1;
      echo "<script>";
      echo "window.location.assign(\"db_insert_sentence.php?from=" . $nextIndex . "&to=" . $to . "\")";
      echo "</script>";
      echo "正在载入:" . $nextIndex . "——" . $filelist[$nextIndex][0];
  }
  ?>
  </body>
  </html>
  317. </body>
  318. </html>