db_insert_sentence.php 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. <!--句子库生成-->
  2. <?php
  3. require_once "install_head.php";
  4. ?>
  5. <!DOCTYPE html>
  6. <html>
  7. <head>
  8. </head>
  9. <body>
  10. <h2>Insert to Sentence DB</h2>
  11. <p><a href="index.php">Home</a></p>
  12. <?php
  13. include "./_pdo.php";
  14. require_once '../path.php';
  15. $db_file =_FILE_DB_PALI_SENTENCE_;
  16. $thisfile = '.'.mb_substr(__FILE__,mb_strlen(__DIR__));
  17. if(isset($_GET["from"])==false){
  18. ?>
  19. <form action="<?php echo $thisfile; ?>" method="get">
  20. From: <input type="text" value="0" name="from"><br>
  21. To: <input type="text" value="216" name="to"><br>
  22. <input type="submit">
  23. </form>
  24. <?php
  25. return;
  26. }
  27. function wordStyle($word,$style){
  28. switch ($style) {
  29. case 'bld':
  30. # bold form
  31. # 不包含字符{ }
  32. if(mb_strpos($word,'{',0,"UTF-8")===FALSE){
  33. return "<b>".$word."</b> ";
  34. }
  35. else{
  36. $word = str_replace("{","<b>",$word);
  37. $word = str_replace("}","</b>",$word);
  38. return $word;
  39. }
  40. break;
  41. case 'note':
  42. # vir note...
  43. return "<note>".$word."</note>";
  44. break;
  45. case 'paranum':
  46. # vir note...
  47. return "<paranum>".$word."</paranum>";
  48. break;
  49. default:
  50. # code...
  51. return $word;
  52. break;
  53. }
  54. }
  55. $from=$_GET["from"];
  56. $to=$_GET["to"];
  57. $filelist=array();
  58. $fileNums=0;
  59. $log="";
  60. echo "<h2>$from-$to</h2>";
  61. if(($handle=fopen("filelist.csv",'r'))!==FALSE){
  62. while(($filelist[$fileNums]=fgetcsv($handle,0,','))!==FALSE){
  63. $fileNums++;
  64. }
  65. }
  66. if($to>=$fileNums) $to=$fileNums-1;
  67. $FileName=$filelist[$from][1].".htm";
  68. $fileId=$filelist[$from][0];
  69. $fileId=$filelist[$from][0];
  70. $dirLog=_DIR_LOG_."/";
  71. $dirDb="db/";
  72. $inputFileName=$FileName;
  73. $outputFileNameHead=$filelist[$from][1];
  74. $bookId=$filelist[$from][2];
  75. $vriParNum=0;
  76. $wordOrder=1;
  77. $dirXmlBase=_DIR_PALI_CSV_."/";
  78. $dirXml=$outputFileNameHead."/";
  79. $currChapter="";
  80. $currParNum="";
  81. $arrAllWords[0]=array("id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un","style","vri","sya","si","ka","pi","pa","kam");
  82. $g_wordCounter=0;
  83. $arrUnWords[0]=array("id","word","type","gramma","parent","mean","note","part","partmean","cf","state","delete","tag","len");
  84. $g_unWordCounter=0;
  85. $arrUnPart[0]="word";
  86. $g_unPartCounter=-1;
  87. /*去掉标点符号的统计*/
  88. $arrAllPaliWordsCount=array();
  89. $g_paliWordCounter=0;
  90. $g_wordCounterInSutta=0;
  91. $g_paliWordCountCounter=0;
  92. $xmlfile = $inputFileName;
  93. echo "doing:".$xmlfile."<br>";
  94. $log=$log."$from,$FileName,open\r\n";
  95. $arrInserString=array();
  96. function getWordEn($strIn){
  97. $search = array('ā', 'ī', 'ū', 'ṅ', 'ñ' , 'ṭ', 'ḍ', 'ṇ', 'ḷ', 'ṃ');
  98. $replace = array('a', 'i', 'u', 'n', 'n' , 't', 'd', 'n', 'l', 'm');
  99. return(str_replace($search,$replace,$strIn));
  100. }
  101. // 打开文件并读取数据
  102. $iWord=0;
  103. $pre=null;
  104. $curr=null;
  105. $next=null;
  106. $wordlist=array();
  107. $arrSent=array();
  108. $book=0;
  109. $sent_html="";
  110. if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead.".csv", "r"))!==FALSE){
  111. while(($data=fgetcsv($fp))!==FALSE){
  112. //id,wid,book,paragraph,word,real,type,gramma,mean,note,part,partmean,bmc,bmt,un,style,vri,sya,si,ka,pi,pa,kam
  113. //$data = mb_split(",",$data);
  114. $wordlist[]=$data;
  115. if($book==0){
  116. $book=substr($data[2],1);
  117. }
  118. }
  119. fclose($fp);
  120. $iWord=0;
  121. $iCurrPara=0;
  122. $Note_Mark=0;
  123. if($wordlist[1][6] != ".ctl."){
  124. $sent=$wordlist[1][4]." ";
  125. $sent_html=wordStyle($wordlist[1][4],$wordlist[1][15])." ";
  126. $sent_real=$wordlist[1][5];
  127. $wordcount=1;
  128. }
  129. else{
  130. $sent="";
  131. $sent_html="";
  132. $sent_real="";
  133. $wordcount=0;
  134. }
  135. $begin=1;
  136. $end=1;
  137. $iSent=0;
  138. $Note_Mark1=0;
  139. $Note_Mark2=0;
  140. $Note_Mark = 0;
  141. $wordcount=0;
  142. for($i=1;$i<count($wordlist);$i++){
  143. if($wordlist[$i][3]>$iCurrPara){
  144. //echo "new paragraph<br>";
  145. $iWord=0;
  146. if($i>1){
  147. //echo "上一段结束<br>";
  148. if(strlen(trim($sent))>0){
  149. $end = $wordlist[$i-1][16];
  150. $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
  151. //echo "end={$end}<br>";
  152. //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
  153. }
  154. $iCurrPara=$wordlist[$i][3];
  155. $Note_Mark1=0;
  156. $Note_Mark2=0;
  157. $Note_Mark = 0;
  158. $pre=$wordlist[$i-1];
  159. $curr=$wordlist[$i];
  160. if($i<count($wordlist)-1){
  161. $next=$wordlist[$i+1];
  162. }
  163. else{
  164. $next="";
  165. }
  166. if($next[4]=="(" || $curr[4]=="("){
  167. $Note_Mark1=1;
  168. }
  169. else if($pre[4]==")" && $Note_Mark1==1){
  170. $Note_Mark1=0;
  171. }
  172. if($next[4]=="[" || $curr[4]=="["){
  173. $Note_Mark2=1;
  174. }
  175. else if($pre[4]=="]" && $Note_Mark2==1){
  176. $Note_Mark2=0;
  177. }
  178. $Note_Mark = $Note_Mark1+$Note_Mark2;
  179. //下一段开始
  180. if($wordlist[$i][6] != ".ctl."){
  181. $sent=$wordlist[$i][4]." ";
  182. $sent_html=wordStyle($wordlist[$i][4],$wordlist[$i][15])." ";
  183. if($wordlist[$i][5]=='"'){
  184. $sent_real="";
  185. }
  186. else{
  187. $sent_real=$wordlist[$i][5];
  188. }
  189. $wordcount=1;
  190. }
  191. else{
  192. $sent="";
  193. $sent_html="";
  194. $sent_real="";
  195. $wordcount=0;
  196. }
  197. $begin = $wordlist[$i][16];
  198. $iSent++;
  199. continue;
  200. }
  201. $iCurrPara=$wordlist[$i][3];
  202. }
  203. $isEndOfSen=false;
  204. if($i<count($wordlist)-1){
  205. $pre=$wordlist[$i-1];
  206. $curr=$wordlist[$i];
  207. if($i<count($wordlist)-1){
  208. $next=$wordlist[$i+1];
  209. }
  210. else{
  211. $next="";
  212. }
  213. if($next[4]=="("){
  214. $Note_Mark1=1;
  215. }
  216. else if($pre[4]==")" && $Note_Mark1==1){
  217. $Note_Mark1=0;
  218. }
  219. if($next[4]=="["){
  220. $Note_Mark2=1;
  221. }
  222. else if($pre[4]=="]" && $Note_Mark2==1){
  223. $Note_Mark2=0;
  224. }
  225. $Note_Mark = $Note_Mark1+$Note_Mark2;
  226. if($curr[15] != "note" || mb_substr($curr[1],0,5,"UTF-8") != "gatha"){
  227. if($curr[4] == "." && !is_numeric($pre[4]) && $next[3]==$iCurrPara && $Note_Mark===0){
  228. //以.結尾且非註釋
  229. if($next[4] != "("){
  230. $isEndOfSen=true;
  231. }
  232. }
  233. else if($curr[4]=="–" && $next[4]=="‘" && $Note_Mark===0){
  234. $isEndOfSen=true;
  235. }
  236. else if($Note_Mark == 0){
  237. //以!或?或;結尾
  238. if($curr[4]=="!"){
  239. if($next[4]!="!"){
  240. if($next[4]!="("){
  241. $isEndOfSen=true;
  242. }
  243. }
  244. }
  245. else if($curr[4]==";" || $curr[4]=="?"){
  246. if($next[4] != "("){
  247. $isEndOfSen=true;
  248. }
  249. }
  250. }
  251. }
  252. }
  253. if($curr[6] != ".ctl."){
  254. if($next[5] != ""){
  255. # 下一个是标点符号
  256. $sent .= $curr[4]." ";
  257. $sent_html .= wordStyle($curr[4],$curr[15])." ";
  258. }
  259. else{
  260. $sent .= $curr[4];
  261. $sent_html .= wordStyle($curr[4],$curr[15]);
  262. }
  263. if($wordlist[$i][5] != '' && ($Note_Mark==0 || ($Note_Mark==1 && ($next[4]=="[" || $next[4]=="(")))){
  264. $wordcount++;
  265. if($wordlist[$i][5]=="iti"){
  266. $sent_real .=" ".$curr[4];
  267. }
  268. else{
  269. $sent_real .=" ".$curr[5];
  270. }
  271. }
  272. }
  273. if($isEndOfSen==true && strlen(trim($sent))>0){
  274. $end = $wordlist[$i][16];
  275. $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
  276. //echo "end={$end}<br>";
  277. //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
  278. $sent="";
  279. $sent_html="";
  280. $sent_real="";
  281. $iSent++;
  282. $begin = $wordlist[$i][16]+1;
  283. $wordcount=0;
  284. }
  285. $iWord++;
  286. }
  287. if(strlen(trim($sent))>0){
  288. $end = $wordlist[count($wordlist)-1][16];
  289. $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
  290. //echo "end={$end}<br>";
  291. //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
  292. }
  293. }
  294. else{
  295. echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead.".csv";
  296. }
  297. // 开始一个事务,关闭自动提交
  298. PDO_Connect("sqlite:$db_file");
  299. $PDO->beginTransaction();
  300. $query="INSERT INTO pali_sent ('id','book','paragraph','begin','end','length','count','text','html','real','real_en') VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )";
  301. $stmt = $PDO->prepare($query);
  302. foreach($arrSent as $oneParam){
  303. $stmt->execute($oneParam);
  304. }
  305. // 提交更改
  306. $PDO->commit();
  307. if (!$stmt || ($stmt && $stmt->errorCode() != 0)) {
  308. $error = PDO_ErrorInfo();
  309. echo "error - $error[2] <br>";
  310. $log=$log."$from, $FileName, error, $error[2] \r\n";
  311. }
  312. else{
  313. $count=count($arrSent);
  314. echo "updata $count recorders.";
  315. }
  316. $myLogFile = fopen(_DIR_LOG_."insert_sent.log", "a");
  317. fwrite($myLogFile, $log);
  318. fclose($myLogFile);
  319. ?>
  320. <?php
  321. if($from>=$to){
  322. echo "<h2>齐活!功德无量!all done!</h2>";
  323. }
  324. else{
  325. echo "<script>";
  326. echo "window.location.assign(\"db_insert_sentence.php?from=".($from+1)."&to=".$to."\")";
  327. echo "</script>";
  328. echo "正在载入:".($from+1)."——".$filelist[$from+1][0];
  329. }
  330. ?>
  331. </body>
  332. </html>