| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381 |
- <!--句子库生成-->
- <?php
- require_once "install_head.php";
- ?>
- <!DOCTYPE html>
- <html>
- <head>
- </head>
- <body>
- <h2>Insert to Sentence DB</h2>
- <p><a href="index.php">Home</a></p>
- <?php
- include "./_pdo.php";
- require_once '../path.php';
- $db_file =_FILE_DB_PALI_SENTENCE_;
- $thisfile = '.'.mb_substr(__FILE__,mb_strlen(__DIR__));
- if(isset($_GET["from"])==false){
- ?>
- <form action="<?php echo $thisfile; ?>" method="get">
- From: <input type="text" value="0" name="from"><br>
- To: <input type="text" value="216" name="to"><br>
- <input type="submit">
- </form>
- <?php
- return;
- }
- function wordStyle($word,$style){
- switch ($style) {
- case 'bld':
- # bold form
- # 不包含字符{ }
- if(mb_strpos($word,'{',0,"UTF-8")===FALSE){
- return "<b>".$word."</b> ";
- }
- else{
- $word = str_replace("{","<b>",$word);
- $word = str_replace("}","</b>",$word);
- return $word;
- }
- break;
- case 'note':
- # vir note...
- return "<note>".$word."</note>";
- break;
- case 'paranum':
- # vir note...
- return "<paranum>".$word."</paranum>";
- break;
- default:
- # code...
- return $word;
- break;
- }
- }
- $from=$_GET["from"];
- $to=$_GET["to"];
- $filelist=array();
- $fileNums=0;
- $log="";
- echo "<h2>$from-$to</h2>";
- if(($handle=fopen("filelist.csv",'r'))!==FALSE){
- while(($filelist[$fileNums]=fgetcsv($handle,0,','))!==FALSE){
- $fileNums++;
- }
- }
- if($to>=$fileNums) $to=$fileNums-1;
- $FileName=$filelist[$from][1].".htm";
- $fileId=$filelist[$from][0];
- $fileId=$filelist[$from][0];
- $dirLog=_DIR_LOG_."/";
- $dirDb="db/";
- $inputFileName=$FileName;
- $outputFileNameHead=$filelist[$from][1];
- $bookId=$filelist[$from][2];
- $vriParNum=0;
- $wordOrder=1;
- $dirXmlBase=_DIR_PALI_CSV_."/";
- $dirXml=$outputFileNameHead."/";
- $currChapter="";
- $currParNum="";
- $arrAllWords[0]=array("id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un","style","vri","sya","si","ka","pi","pa","kam");
- $g_wordCounter=0;
- $arrUnWords[0]=array("id","word","type","gramma","parent","mean","note","part","partmean","cf","state","delete","tag","len");
- $g_unWordCounter=0;
- $arrUnPart[0]="word";
- $g_unPartCounter=-1;
- /*去掉标点符号的统计*/
- $arrAllPaliWordsCount=array();
- $g_paliWordCounter=0;
- $g_wordCounterInSutta=0;
- $g_paliWordCountCounter=0;
- $xmlfile = $inputFileName;
- echo "doing:".$xmlfile."<br>";
- $log=$log."$from,$FileName,open\r\n";
- $arrInserString=array();
- function getWordEn($strIn){
- $search = array('ā', 'ī', 'ū', 'ṅ', 'ñ' , 'ṭ', 'ḍ', 'ṇ', 'ḷ', 'ṃ');
- $replace = array('a', 'i', 'u', 'n', 'n' , 't', 'd', 'n', 'l', 'm');
- return(str_replace($search,$replace,$strIn));
- }
- // 打开文件并读取数据
- $iWord=0;
- $pre=null;
- $curr=null;
- $next=null;
- $wordlist=array();
- $arrSent=array();
- $book=0;
- $sent_html="";
- if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead.".csv", "r"))!==FALSE){
- while(($data=fgetcsv($fp))!==FALSE){
- //id,wid,book,paragraph,word,real,type,gramma,mean,note,part,partmean,bmc,bmt,un,style,vri,sya,si,ka,pi,pa,kam
- //$data = mb_split(",",$data);
-
- $wordlist[]=$data;
- if($book==0){
- $book=substr($data[2],1);
- }
- }
- fclose($fp);
-
- $iWord=0;
- $iCurrPara=0;
- $Note_Mark=0;
- if($wordlist[1][6] != ".ctl."){
- $sent=$wordlist[1][4]." ";
- $sent_html=wordStyle($wordlist[1][4],$wordlist[1][15])." ";
- $sent_real=$wordlist[1][5];
- $wordcount=1;
- }
- else{
- $sent="";
- $sent_html="";
- $sent_real="";
- $wordcount=0;
- }
- $begin=1;
- $end=1;
- $iSent=0;
- $Note_Mark1=0;
- $Note_Mark2=0;
- $Note_Mark = 0;
- $wordcount=0;
- for($i=1;$i<count($wordlist);$i++){
- if($wordlist[$i][3]>$iCurrPara){
- //echo "new paragraph<br>";
- $iWord=0;
- if($i>1){
- //echo "上一段结束<br>";
- if(strlen(trim($sent))>0){
- $end = $wordlist[$i-1][16];
- $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
- //echo "end={$end}<br>";
- //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
- }
- $iCurrPara=$wordlist[$i][3];
- $Note_Mark1=0;
- $Note_Mark2=0;
- $Note_Mark = 0;
-
- $pre=$wordlist[$i-1];
- $curr=$wordlist[$i];
- if($i<count($wordlist)-1){
- $next=$wordlist[$i+1];
- }
- else{
- $next="";
- }
-
- if($next[4]=="(" || $curr[4]=="("){
- $Note_Mark1=1;
- }
- else if($pre[4]==")" && $Note_Mark1==1){
- $Note_Mark1=0;
- }
-
- if($next[4]=="[" || $curr[4]=="["){
- $Note_Mark2=1;
- }
- else if($pre[4]=="]" && $Note_Mark2==1){
- $Note_Mark2=0;
- }
- $Note_Mark = $Note_Mark1+$Note_Mark2;
- //下一段开始
- if($wordlist[$i][6] != ".ctl."){
- $sent=$wordlist[$i][4]." ";
- $sent_html=wordStyle($wordlist[$i][4],$wordlist[$i][15])." ";
- if($wordlist[$i][5]=='"'){
- $sent_real="";
- }
- else{
- $sent_real=$wordlist[$i][5];
- }
- $wordcount=1;
- }
- else{
- $sent="";
- $sent_html="";
- $sent_real="";
- $wordcount=0;
- }
- $begin = $wordlist[$i][16];
-
- $iSent++;
-
-
- continue;
- }
- $iCurrPara=$wordlist[$i][3];
- }
- $isEndOfSen=false;
- if($i<count($wordlist)-1){
- $pre=$wordlist[$i-1];
- $curr=$wordlist[$i];
- if($i<count($wordlist)-1){
- $next=$wordlist[$i+1];
- }
- else{
- $next="";
- }
- if($next[4]=="("){
- $Note_Mark1=1;
- }
- else if($pre[4]==")" && $Note_Mark1==1){
- $Note_Mark1=0;
- }
-
- if($next[4]=="["){
- $Note_Mark2=1;
- }
- else if($pre[4]=="]" && $Note_Mark2==1){
- $Note_Mark2=0;
- }
- $Note_Mark = $Note_Mark1+$Note_Mark2;
- if($curr[15] != "note" || mb_substr($curr[1],0,5,"UTF-8") != "gatha"){
- if($curr[4] == "." && !is_numeric($pre[4]) && $next[3]==$iCurrPara && $Note_Mark===0){
- //以.結尾且非註釋
- if($next[4] != "("){
- $isEndOfSen=true;
- }
- }
- else if($curr[4]=="–" && $next[4]=="‘" && $Note_Mark===0){
- $isEndOfSen=true;
- }
- else if($Note_Mark == 0){
- //以!或?或;結尾
- if($curr[4]=="!"){
- if($next[4]!="!"){
- if($next[4]!="("){
- $isEndOfSen=true;
- }
- }
- }
- else if($curr[4]==";" || $curr[4]=="?"){
- if($next[4] != "("){
- $isEndOfSen=true;
- }
- }
- }
- }
- }
- if($curr[6] != ".ctl."){
- if($next[5] != ""){
- # 下一个是标点符号
- $sent .= $curr[4]." ";
- $sent_html .= wordStyle($curr[4],$curr[15])." ";
- }
- else{
- $sent .= $curr[4];
- $sent_html .= wordStyle($curr[4],$curr[15]);
- }
- if($wordlist[$i][5] != '' && ($Note_Mark==0 || ($Note_Mark==1 && ($next[4]=="[" || $next[4]=="(")))){
- $wordcount++;
- if($wordlist[$i][5]=="iti"){
- $sent_real .=" ".$curr[4];
- }
- else{
- $sent_real .=" ".$curr[5];
- }
- }
-
- }
- if($isEndOfSen==true && strlen(trim($sent))>0){
- $end = $wordlist[$i][16];
- $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
- //echo "end={$end}<br>";
- //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
-
- $sent="";
- $sent_html="";
- $sent_real="";
- $iSent++;
- $begin = $wordlist[$i][16]+1;
- $wordcount=0;
- }
-
- $iWord++;
- }
- if(strlen(trim($sent))>0){
- $end = $wordlist[count($wordlist)-1][16];
- $arrSent[]=array($book,$iCurrPara,$begin,$end,mb_strlen(trim($sent_real),"UTF-8"),$wordcount,$sent,$sent_html,trim($sent_real),getWordEn($sent_real));
- //echo "end={$end}<br>";
- //echo "<div>[{$iCurrPara}-{$begin}-{$end}]({$wordcount})<br>{$sent}<br>{$sent_real}<br>".getWordEn($sent_real)."</div>";
- }
- }
- else{
- echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead.".csv";
- }
- // 开始一个事务,关闭自动提交
- PDO_Connect("sqlite:$db_file");
- $PDO->beginTransaction();
- $query="INSERT INTO pali_sent ('id','book','paragraph','begin','end','length','count','text','html','real','real_en') VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )";
- $stmt = $PDO->prepare($query);
- foreach($arrSent as $oneParam){
- $stmt->execute($oneParam);
- }
- // 提交更改
- $PDO->commit();
- if (!$stmt || ($stmt && $stmt->errorCode() != 0)) {
- $error = PDO_ErrorInfo();
- echo "error - $error[2] <br>";
-
- $log=$log."$from, $FileName, error, $error[2] \r\n";
- }
- else{
- $count=count($arrSent);
- echo "updata $count recorders.";
- }
- $myLogFile = fopen(_DIR_LOG_."insert_sent.log", "a");
- fwrite($myLogFile, $log);
- fclose($myLogFile);
- ?>
- <?php
- if($from>=$to){
- echo "<h2>齐活!功德无量!all done!</h2>";
- }
- else{
- echo "<script>";
- echo "window.location.assign(\"db_insert_sentence.php?from=".($from+1)."&to=".$to."\")";
- echo "</script>";
- echo "正在载入:".($from+1)."——".$filelist[$from+1][0];
- }
- ?>
- </body>
- </html>
|