Insert to Sentence DB

Home

From:
To:
" . $word . " "; } else { $word = str_replace("{", "", $word); $word = str_replace("}", "", $word); return $word; } break; case 'note': # vir note... return "" . $word . ""; break; case 'paranum': # vir note... return "" . $word . ""; break; default: # code... return $word; break; } } $from = $_GET["from"]; $to = $_GET["to"]; $filelist = array(); $fileNums = 0; $log = ""; echo "

$from-$to

"; if (($handle = fopen("filelist.csv", 'r')) !== false) { while (($filelist[$fileNums] = fgetcsv($handle, 0, ',')) !== false) { $fileNums++; } } if ($to >= $fileNums) { $to = $fileNums - 1; } $FileName = $filelist[$from][1] . ".htm"; $fileId = $filelist[$from][0]; $fileId = $filelist[$from][0]; $dirLog = _DIR_LOG_ . "/"; $dirDb = "db/"; $inputFileName = $FileName; $outputFileNameHead = $filelist[$from][1]; $bookId = $filelist[$from][2]; $vriParNum = 0; $wordOrder = 1; $dirXmlBase = _DIR_PALI_CSV_ . "/"; $dirXml = $outputFileNameHead . "/"; $currChapter = ""; $currParNum = ""; $arrAllWords[0] = array("id", "wid", "book", "paragraph", "word", "real", "type", "gramma", "mean", "note", "part", "partmean", "bmc", "bmt", "un", "style", "vri", "sya", "si", "ka", "pi", "pa", "kam"); $g_wordCounter = 0; $arrUnWords[0] = array("id", "word", "type", "gramma", "parent", "mean", "note", "part", "partmean", "cf", "state", "delete", "tag", "len"); $g_unWordCounter = 0; $arrUnPart[0] = "word"; $g_unPartCounter = -1; /*去掉标点符号的统计*/ $arrAllPaliWordsCount = array(); $g_paliWordCounter = 0; $g_wordCounterInSutta = 0; $g_paliWordCountCounter = 0; $xmlfile = $inputFileName; echo "doing:" . $xmlfile . "
"; $log = $log . "$from,$FileName,open\r\n"; $arrInserString = array(); function getWordEn($strIn) { $search = array('ā', 'ī', 'ū', 'ṅ', 'ñ', 'ṭ', 'ḍ', 'ṇ', 'ḷ', 'ṃ'); $replace = array('a', 'i', 'u', 'n', 'n', 't', 'd', 'n', 'l', 'm'); return (str_replace($search, $replace, $strIn)); } // 打开文件并读取数据 $iWord = 0; $pre = null; $curr = null; $next = null; $wordlist = array(); $arrSent = array(); $book = 0; $sent_html = ""; if (($fp = fopen($dirXmlBase . $dirXml . $outputFileNameHead . ".csv", "r")) !== false) { while (($data = fgetcsv($fp)) !== false) { //id,wid,book,paragraph,word,real,type,gramma,mean,note,part,partmean,bmc,bmt,un,style,vri,sya,si,ka,pi,pa,kam //$data = mb_split(",",$data); $wordlist[] = $data; if ($book == 0) { $book = substr($data[2], 1); } } fclose($fp); $iWord = 0; $iCurrPara = 0; $Note_Mark = 0; if ($wordlist[1][6] != ".ctl.") { $sent = $wordlist[1][4] . " "; $sent_html = wordStyle($wordlist[1][4], $wordlist[1][15]) . " "; $sent_real = $wordlist[1][5]; $wordcount = 1; } else { $sent = ""; $sent_html = ""; $sent_real = ""; $wordcount = 0; } $begin = 1; $end = 1; $iSent = 0; $Note_Mark1 = 0; $Note_Mark2 = 0; $Note_Mark = 0; $wordcount = 0; for ($i = 1; $i < count($wordlist); $i++) { if ($wordlist[$i][3] > $iCurrPara) { //echo "new paragraph
"; $iWord = 0; if ($i > 1) { //echo "上一段结束
"; if (strlen(trim($sent)) > 0) { $end = $wordlist[$i - 1][16]; $arrSent[] = array($book, $iCurrPara, $begin, $end, mb_strlen(trim($sent_real), "UTF-8"), $wordcount, $sent, $sent_html, trim($sent_real), getWordEn($sent_real)); //echo "end={$end}
"; //echo "
[{$iCurrPara}-{$begin}-{$end}]({$wordcount})
{$sent}
{$sent_real}
".getWordEn($sent_real)."
"; } $iCurrPara = $wordlist[$i][3]; $Note_Mark1 = 0; $Note_Mark2 = 0; $Note_Mark = 0; $pre = $wordlist[$i - 1]; $curr = $wordlist[$i]; if ($i < count($wordlist) - 1) { $next = $wordlist[$i + 1]; } else { $next = ""; } if ($next[4] == "(" || $curr[4] == "(") { $Note_Mark1 = 1; } else if ($pre[4] == ")" && $Note_Mark1 == 1) { $Note_Mark1 = 0; } if ($next[4] == "[" || $curr[4] == "[") { $Note_Mark2 = 1; } else if ($pre[4] == "]" && $Note_Mark2 == 1) { $Note_Mark2 = 0; } $Note_Mark = $Note_Mark1 + $Note_Mark2; //下一段开始 if ($wordlist[$i][6] != ".ctl.") { $sent = $wordlist[$i][4] . " "; $sent_html = wordStyle($wordlist[$i][4], $wordlist[$i][15]) . " "; if ($wordlist[$i][5] == '"') { $sent_real = ""; } else { $sent_real = $wordlist[$i][5]; } $wordcount = 1; } else { $sent = ""; $sent_html = ""; $sent_real = ""; $wordcount = 0; } $begin = $wordlist[$i][16]; $iSent++; continue; } $iCurrPara = $wordlist[$i][3]; } $isEndOfSen = false; if ($i < count($wordlist) - 1) { $pre = $wordlist[$i - 1]; $curr = $wordlist[$i]; if ($i < count($wordlist) - 1) { $next = $wordlist[$i + 1]; } else { $next = ""; } if ($next[4] == "(") { $Note_Mark1 = 1; } else if ($pre[4] == ")" && $Note_Mark1 == 1) { $Note_Mark1 = 0; } if ($next[4] == "[") { $Note_Mark2 = 1; } else if ($pre[4] == "]" && $Note_Mark2 == 1) { $Note_Mark2 = 0; } $Note_Mark = $Note_Mark1 + $Note_Mark2; if ($curr[15] != "note" || mb_substr($curr[1], 0, 5, "UTF-8") != "gatha") { if ($curr[4] == "." && !is_numeric($pre[4]) && $next[3] == $iCurrPara && $Note_Mark === 0) { //以.結尾且非註釋 if ($next[4] != "(") { $isEndOfSen = true; } } else if ($curr[4] == "–" && $next[4] == "‘" && $Note_Mark === 0) { $isEndOfSen = true; } else if ($Note_Mark == 0) { //以!或?或;結尾 if ($curr[4] == "!") { if ($next[4] != "!") { if ($next[4] != "(") { $isEndOfSen = true; } } } else if ($curr[4] == ";" || $curr[4] == "?") { if ($next[4] != "(") { $isEndOfSen = true; } } } } } if ($curr[6] != ".ctl.") { if ($next[5] != "") { # 下一个是标点符号 $sent .= $curr[4] . " "; $sent_html .= wordStyle($curr[4], $curr[15]) . " "; } else { $sent .= $curr[4]; $sent_html .= wordStyle($curr[4], $curr[15]); } if ($wordlist[$i][5] != '' && ($Note_Mark == 0 || ($Note_Mark == 1 && ($next[4] == "[" || $next[4] == "(")))) { $wordcount++; if ($wordlist[$i][5] == "iti") { $sent_real .= " " . $curr[4]; } else { $sent_real .= " " . $curr[5]; } } } if ($isEndOfSen == true && strlen(trim($sent)) > 0) { $end = $wordlist[$i][16]; $arrSent[] = array($book, $iCurrPara, $begin, $end, mb_strlen(trim($sent_real), "UTF-8"), $wordcount, $sent, $sent_html, trim($sent_real), getWordEn($sent_real)); //echo "end={$end}
"; //echo "
[{$iCurrPara}-{$begin}-{$end}]({$wordcount})
{$sent}
{$sent_real}
".getWordEn($sent_real)."
"; $sent = ""; $sent_html = ""; $sent_real = ""; $iSent++; $begin = $wordlist[$i][16] + 1; $wordcount = 0; } $iWord++; } if (strlen(trim($sent)) > 0) { $end = $wordlist[count($wordlist) - 1][16]; $arrSent[] = array($book, $iCurrPara, $begin, $end, mb_strlen(trim($sent_real), "UTF-8"), $wordcount, $sent, $sent_html, trim($sent_real), getWordEn($sent_real)); //echo "end={$end}
"; //echo "
[{$iCurrPara}-{$begin}-{$end}]({$wordcount})
{$sent}
{$sent_real}
".getWordEn($sent_real)."
"; } } else { echo "can not open csv file. filename=" . $dirXmlBase . $dirXml . $outputFileNameHead . ".csv"; } // 开始一个事务,关闭自动提交 PDO_Connect("$db_file"); $PDO->beginTransaction(); $query = "INSERT INTO pali_sent ('id','book','paragraph','begin','end','length','count','text','html','real','real_en') VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"; $stmt = $PDO->prepare($query); foreach ($arrSent as $oneParam) { $stmt->execute($oneParam); } // 提交更改 $PDO->commit(); if (!$stmt || ($stmt && $stmt->errorCode() != 0)) { $error = PDO_ErrorInfo(); echo "error - $error[2]
"; $log = $log . "$from, $FileName, error, $error[2] \r\n"; } else { $count = count($arrSent); echo "updata $count recorders."; } $myLogFile = fopen(_DIR_LOG_ . "insert_sent.log", "a"); fwrite($myLogFile, $log); fclose($myLogFile); ?> = $to) { echo "

齐活!功德无量!all done!

"; } else { echo ""; echo "正在载入:" . ($from + 1) . "——" . $filelist[$from + 1][0]; } ?>