db_insert_sentence.php 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. <!--句子库生成-->
  2. <!DOCTYPE html>
  3. <html>
  4. <head>
  5. </head>
  6. <body>
  7. <h2>Insert to Sentence DB</h2>
  8. <p><a href="index.php">Home</a></p>
  <?php
  // Bootstrap: project PDO helper functions and path constants.
  include "./_pdo.php";
  require_once '../path.php';

  // Sentence database that this page (re)builds.
  $db_file = _DIR_TEMP_ . "/pali_sent.db3";
  // This script's file name relative to its own directory ("./<name>.php"),
  // used as the form target.
  $thisfile = '.' . mb_substr(__FILE__, mb_strlen(__DIR__));

  if (!isset($_GET["from"])) {
      // No range requested yet: show the range form, then start over with an
      // empty database containing only the pali_sent table.
      ?>
  <form action="<?= $thisfile ?>" method="get">
  From: <input type="text" value="0" name="from"><br>
  To: <input type="text" value="216" name="to"><br>
  <input type="submit">
  </form>
  <?php
      // Remove any database left over from a previous run.
      if (file_exists($db_file) && !unlink($db_file)) {
          echo "error: can not delete file " . $db_file;
          return;
      }

      PDO_Connect("sqlite:$db_file");

      // One row per sentence: location (book/paragraph/begin/end), size
      // (length/count) and three text forms (text/real/real_en).
      $query = "CREATE TABLE pali_sent (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      book INTEGER,
      paragraph INTEGER,
      [begin] INTEGER,
      [end] INTEGER,
      length INTEGER,
      count INTEGER,
      text TEXT,
      real TEXT,
      real_en TEXT
      )";
      $stmt = @PDO_Execute($query);

      $tableCreated = $stmt && $stmt->errorCode() == 0;
      if ($tableCreated) {
          echo "create table pali_sent .";
      } else {
          $error = PDO_ErrorInfo();
          print_r($error[2]);
      }

      // A search index over (text, real, real_en) used to be created here; that
      // step is disabled.

      // Nothing to import until the form is submitted.
      return;
  }
  // ---- Request range and file list -------------------------------------
  // "from" and "to" are row indexes into filelist.csv. They arrive from the
  // query string, so force them to integers before they are echoed or used as
  // array indexes.
  $from = (int) $_GET["from"];
  $to = isset($_GET["to"]) ? (int) $_GET["to"] : $from;
  $filelist = array();
  $fileNums = 0;
  $log = "";
  echo "<h2>$from-$to</h2>";

  // Load every row of filelist.csv; $fileNums ends up as the number of rows.
  $handle = fopen("filelist.csv", 'r');
  if ($handle === false) {
      echo "error: can not open filelist.csv";
      return;
  }
  while (($row = fgetcsv($handle, 0, ',')) !== false) {
      $filelist[$fileNums] = $row;
      $fileNums++;
  }
  fclose($handle);

  // Clamp the upper bound to the last row and reject a start row that does
  // not exist or lacks the three columns read below.
  if ($to >= $fileNums) {
      $to = $fileNums - 1;
  }
  if ($from < 0 || $from >= $fileNums || !is_array($filelist[$from]) || count($filelist[$from]) < 3) {
      echo "error: no usable entry $from in filelist.csv";
      return;
  }

  // Columns of the selected row as used by this script:
  // 0 = file id, 1 = file name without extension, 2 = book id.
  $FileName = $filelist[$from][1] . ".htm";
  $fileId = $filelist[$from][0];
  $dirLog = "log/";
  $dirDb = "db/";
  $inputFileName = $FileName;
  $outputFileNameHead = $filelist[$from][1];
  $bookId = $filelist[$from][2];
  $vriParNum = 0;
  $wordOrder = 1;
  // The word list of a book is read from <_DIR_PALI_CSV_>/<name>/<name>.csv.
  $dirXmlBase = _DIR_PALI_CSV_ . "/";
  $dirXml = $outputFileNameHead . "/";
  $currChapter = "";
  $currParNum = "";

  // Header rows and counters kept from the original script; nothing in this
  // file reads them after this point.
  $arrAllWords[0] = array("id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un","style","vri","sya","si","ka","pi","pa","kam");
  $g_wordCounter = 0;
  $arrUnWords[0] = array("id","word","type","gramma","parent","mean","note","part","partmean","cf","state","delete","tag","len");
  $g_unWordCounter = 0;
  $arrUnPart[0] = "word";
  $g_unPartCounter = -1;
  // Statistics with punctuation removed.
  $arrAllPaliWordsCount = array();
  $g_paliWordCounter = 0;
  $g_wordCounterInSutta = 0;
  $g_paliWordCountCounter = 0;

  $xmlfile = $inputFileName;
  echo "doing:" . $xmlfile . "<br>";
  $log = $log . "$from,$FileName,open\r\n";
  $arrInserString = array();
  /**
   * Replace the lower-case Pali diacritic letters ā ī ū ṅ ñ ṭ ḍ ṇ ḷ ṃ with
   * their plain ASCII base letters; every other character is left unchanged.
   *
   * @param string $strIn Text to convert.
   * @return string The text with those letters replaced.
   */
  function getWordEn($strIn){
      $asciiFor = array(
          'ā' => 'a',
          'ī' => 'i',
          'ū' => 'u',
          'ṅ' => 'n',
          'ñ' => 'n',
          'ṭ' => 't',
          'ḍ' => 'd',
          'ṇ' => 'n',
          'ḷ' => 'l',
          'ṃ' => 'm',
      );
      return str_replace(array_keys($asciiFor), array_values($asciiFor), $strIn);
  }
  // ---- Read the book's word list and cut it into sentences ---------------
  // Each CSV row is one token. Column order, per the header kept in this
  // script:
  //   0 id, 1 wid, 2 book, 3 paragraph, 4 word, 5 real, 6 type, 7 gramma,
  //   8 mean, 9 note, 10 part, 11 partmean, 12 bmc, 13 bmt, 14 un, 15 style,
  //   16 vri, 17 sya, 18 si, 19 ka, 20 pi, 21 pa, 22 kam
  // Row 0 is skipped by the segmentation below (presumably a header row —
  // confirm against the CSV files).
  $pre = null;
  $curr = null;
  $next = null;
  $wordlist = array();
  $arrSent = array();
  $book = 0;

  // Builds one pali_sent record, in the column order of the INSERT statement:
  // book, paragraph, begin, end, length, count, text, real, real_en.
  $makeSentRow = function ($book, $paragraph, $begin, $end, $wordcount, $sent, $sent_real) {
      return array(
          $book,
          $paragraph,
          $begin,
          $end,
          mb_strlen($sent_real, "UTF-8"),
          $wordcount,
          $sent,
          $sent_real,
          getWordEn($sent_real),
      );
  };

  $csvPath = $dirXmlBase . $dirXml . $outputFileNameHead . ".csv";
  if (($fp = fopen($csvPath, "r")) !== false) {
      while (($data = fgetcsv($fp)) !== false) {
          $wordlist[] = $data;
          // The book number is column 2 without its first character. Take it
          // from the first row where that is a non-zero number. The old test
          // `$book == 0` accepted a non-numeric value (e.g. from a header cell)
          // on PHP 8, where a non-numeric string no longer compares equal to 0.
          if ($book === 0 && isset($data[2])) {
              $candidate = substr($data[2], 1);
              if (is_numeric($candidate) && $candidate != 0) {
                  $book = $candidate;
              }
          }
      }
      fclose($fp);

      $rowCount = count($wordlist);
      // Without at least one data row (index 1) there is nothing to segment.
      if ($rowCount >= 2) {
          $iCurrPara = 0;
          // 1 while inside a parenthesised note, where punctuation must not
          // end a sentence.
          $Note_Mark = 0;

          // Seed the first sentence with row 1 unless it is a control row.
          if ($wordlist[1][6] != ".ctl.") {
              $sent = $wordlist[1][4] . " ";
              $sent_real = $wordlist[1][5];
              $wordcount = 1;
          } else {
              $sent = "";
              $sent_real = "";
              $wordcount = 0;
          }
          $begin = 1;

          for ($i = 2; $i < $rowCount; $i++) {
              if ($wordlist[$i][3] > $iCurrPara) {
                  // A new paragraph starts at this row.
                  if ($i > 2) {
                      // The previous paragraph ended: flush what was collected.
                      if (strlen(trim($sent)) > 0) {
                          $end = $wordlist[$i - 1][16];
                          $arrSent[] = $makeSentRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_real);
                      }
                      $iCurrPara = $wordlist[$i][3];
                      // The next paragraph begins with this row.
                      if ($wordlist[$i][6] != ".ctl.") {
                          $sent = $wordlist[$i][4] . " ";
                          $sent_real = ($wordlist[$i][5] == '"') ? "" : $wordlist[$i][5];
                          $wordcount = 1;
                      } else {
                          $sent = "";
                          $sent_real = "";
                          $wordcount = 0;
                      }
                      $begin = $wordlist[$i][16];
                      continue;
                  }
                  $iCurrPara = $wordlist[$i][3];
              }

              // Always work on the current row. Previously $curr/$next were
              // only refreshed when a following row existed, so on the last
              // row the penultimate token was appended a second time and the
              // real last token was lost.
              $pre = $wordlist[$i - 1];
              $curr = $wordlist[$i];
              $hasNext = $i < $rowCount - 1;
              $next = $hasNext ? $wordlist[$i + 1] : null;

              if ($curr[5] != "") {
                  $wordcount++;
              }

              $isEndOfSen = false;
              if ($hasNext) {
                  // Track whether we are inside "( ... )".
                  if ($next[4] == "(") {
                      $Note_Mark = 1;
                  } else if ($pre[4] == ")" && $Note_Mark == 1) {
                      $Note_Mark = 0;
                  }

                  // NOTE(review): with `||` this test only fails for rows that
                  // are both style "note" and have a wid starting with
                  // "gatha"; confirm that `&&` was not intended.
                  if ($curr[15] != "note" || mb_substr($curr[1], 0, 5, "UTF-8") != "gatha") {
                      if ($curr[4] == "." && !is_numeric($pre[4]) && $next[3] == $iCurrPara && $Note_Mark == 0) {
                          // Ends with "." (not after a number), stays in the
                          // same paragraph, and is not inside a note.
                          if ($next[4] != "(") {
                              $isEndOfSen = true;
                          }
                      } else if ($curr[4] == "–" && $next[4] == "‘" && $Note_Mark == 0) {
                          // A dash that introduces a quotation.
                          $isEndOfSen = true;
                      } else if ($Note_Mark == 0) {
                          // Ends with "!", "?" or ";".
                          if ($curr[4] == "!") {
                              if ($next[4] != "!") {
                                  if ($next[4] != "(") {
                                      $isEndOfSen = true;
                                  }
                              }
                          } else if ($curr[4] == ";" || $curr[4] == "?") {
                              if ($next[4] != "(") {
                                  $isEndOfSen = true;
                              }
                          }
                      }
                  }
              }

              if ($curr[6] != ".ctl.") {
                  // Display text: add a space only when the next row carries a
                  // "real" form of its own.
                  if ($hasNext && $next[5] != "") {
                      $sent .= $curr[4] . " ";
                  } else {
                      $sent .= $curr[4];
                  }
                  // Real text: skip a bare double quote; for "iti" keep the
                  // word as written instead of its real form.
                  if ($curr[5] != '"') {
                      if ($curr[5] == "iti") {
                          $sent_real .= $curr[4];
                      } else {
                          $sent_real .= $curr[5];
                      }
                  }
              }

              if ($isEndOfSen == true && strlen(trim($sent)) > 0) {
                  $end = $wordlist[$i][16];
                  $arrSent[] = $makeSentRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_real);
                  $sent = "";
                  $sent_real = "";
                  // The next sentence starts right after this row's vri value.
                  $begin = $wordlist[$i][16] + 1;
                  $wordcount = 0;
              }
          }

          // Flush whatever is left after the last row.
          if (strlen(trim($sent)) > 0) {
              $end = $wordlist[$rowCount - 1][16];
              $arrSent[] = $makeSentRow($book, $iCurrPara, $begin, $end, $wordcount, $sent, $sent_real);
          }
      }
  } else {
      echo "can not open csv file. filename=" . $csvPath;
  }
  // ---- Store the sentences in one transaction ---------------------------
  // All rows of this book are inserted together: either every sentence is
  // committed, or on the first failure the transaction is rolled back and the
  // error is reported. Previously the error check ran after commit() and only
  // looked at the last execute().
  PDO_Connect("sqlite:$db_file");
  $insertError = null;
  // Column names are quoted as identifiers ("begin" and "end" are SQL
  // keywords); id is passed as NULL so SQLite assigns it.
  $query = 'INSERT INTO pali_sent ("id","book","paragraph","begin","end","length","count","text","real","real_en") VALUES (NULL,?,?,?,?,?,?,?,?,?)';
  try {
      $PDO->beginTransaction();
      $stmt = $PDO->prepare($query);
      if (!$stmt) {
          $info = $PDO->errorInfo();
          $insertError = isset($info[2]) ? $info[2] : "prepare failed";
      } else {
          foreach ($arrSent as $oneParam) {
              if (!$stmt->execute($oneParam)) {
                  $info = $stmt->errorInfo();
                  $insertError = isset($info[2]) ? $info[2] : "execute failed";
                  break;
              }
          }
      }
      if ($insertError === null) {
          // Commit the changes.
          $PDO->commit();
      } else {
          $PDO->rollBack();
      }
  } catch (PDOException $e) {
      // Covers connections configured to throw instead of returning false.
      if ($PDO->inTransaction()) {
          $PDO->rollBack();
      }
      $insertError = $e->getMessage();
  }

  if ($insertError !== null) {
      echo "error - " . htmlspecialchars($insertError, ENT_QUOTES, 'UTF-8') . " <br>";
      $log = $log . "$from, $FileName, error, $insertError \r\n";
  } else {
      $count = count($arrSent);
      echo "updata $count recorders.";
  }

  // Append this file's log lines.
  // NOTE(review): unlike the database path, no "/" is added after _DIR_LOG_;
  // confirm that the constant ends with a directory separator.
  $myLogFile = fopen(_DIR_LOG_ . "insert_sent.log", "a");
  if ($myLogFile !== false) {
      fwrite($myLogFile, $log);
      fclose($myLogFile);
  } else {
      echo "error: can not open log file";
  }
  ?>
  <?php
  // ---- Continue with the next file, or finish -------------------------------
  // The page processes one file per request and then asks the browser to load
  // itself again for the next index. $from/$to originate from the query
  // string, so they are forced to integers before being written into the
  // inline script.
  $currentIndex = (int) $from;
  $lastIndex = (int) $to;
  if ($currentIndex >= $lastIndex) {
      echo "<h2>all done!</h2>";
  } else {
      $nextIndex = $currentIndex + 1;
      echo "<script>";
      echo "window.location.assign(\"db_insert_sentence.php?from=" . $nextIndex . "&to=" . $lastIndex . "\")";
      echo "</script>";
      // Show the id (column 0) of the file that is loaded next, if that row
      // exists in the file list.
      $nextFileId = isset($filelist[$nextIndex][0]) ? (string) $filelist[$nextIndex][0] : "";
      echo "正在载入:" . $nextIndex . "——" . htmlspecialchars($nextFileId, ENT_QUOTES, 'UTF-8');
  }
  ?>
  287. </body>
  288. </html>