Doing $from / $to Don't close this window"; $filelist=array(); $fileNums=0; $log=""; if(($handle=fopen("filelist.csv",'r'))!==FALSE){ while(($filelist[$fileNums]=fgetcsv($handle,0,','))!==FALSE){ $fileNums++; } } if($to==0 || $to>=$fileNums) $to=$fileNums-1; $FileName=$filelist[$from][1].".htm"; $fileId=$filelist[$from][0]; $dirLog=_DIR_LOG_."/"; $dirHtml=_DIR_PALI_HTML_."/"; $inputFileName=$FileName; $outputFileNameHead=$filelist[$from][1]; $bookId=$filelist[$from][2]; $vriParNum=0; $wordOrder=1; $dirXmlBase=_DIR_PALI_CSV_."/"; $dirXml=$outputFileNameHead."/"; $currChapter=""; $currParNum=""; $class=""; $arrAllWords[0]=array("id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un","style","vri","sya","si","ka","pi","pa","kam"); $g_wordCounter=0; $arrUnWords[0]=array("id","word","type","gramma","parent","mean","note","part","partmean","cf","state","delete","tag","len"); $g_unWordCounter=0; $arrToc[0]=array("id","book","par_num","level","class","title","text"); $g_TocCounter=0; $arrUnPart[0]="word"; $g_unPartCounter=-1; /*去掉标点符号的统计*/ $arrAllPaliWordsCount=array(); $g_paliWordCounter=0; $g_wordCounterInSutta=0; $g_paliWordCountCounter=0; if(file_exists($dirHtml.$inputFileName)==false){ die('file ".."not exists...'); } if(is_dir(_DIR_PALI_CSV_)==FALSE){ if (!mkdir(_DIR_PALI_CSV_)) { die('Failed to create folders...'); } } if(is_dir($dirXmlBase.$dirXml)==FALSE){ if (!mkdir($dirXmlBase.$dirXml)) { die('Failed to create folders...'); } } $parBegin=false; function getChildNodeValue($array,$attName){ if($array){ foreach($array as $x=>$x_value) { if($x==$attName){ return $x_value; } } } return false; } //函数在 inWord 字符串中查找是否有非法的字符。找到返回 FALSE 找不到返回 TRUE function testPaliWord($inWord){ $paliletter="āīūṅñṭḍṇḷṃṁŋĀĪŪṄÑṬḌṆḶṂṀŊabcdefghijklmnoprstuvyABCDEFGHIJKLMNOPRSTUVY-"; for($i=0;$iout side tag:a $word insert next paragraph $thisParNum

"; } $wordId=$GLOBALS['bookId']."-".$thisParNum."-".$thisWordOrder; $wordinfo=array($GLOBALS['g_wordCounter'],$wordId,$GLOBALS['bookId'],$thisParNum,$word,$realWord,".ctl.",".a.","?","?","?","?","","","NULL",$inClass,$thisWordOrder,0,0,0,0,0,0); $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']]=$wordinfo; return; } //toc out put $GLOBALS['arrToc'][$GLOBALS['g_TocCounter']][6] .= $inStr; if($GLOBALS['tocOnly']=="on"){ return; } $paliletter="āīūṅñṭḍṇḷṃṁŋĀĪŪṄÑṬḌṆḶṂṀŊabcdefghijklmnoprstuvyABCDEFGHIJKLMNOPRSTUVY-"; $mStr=str_replace("‘"," ‘ ",$mStr); $mStr=str_replace("’"," ’ ",$mStr); $mStr=str_replace(","," , ",$mStr); $mStr=str_replace("."," . ",$mStr); $mStr=str_replace("?"," ? ",$mStr); $mStr=str_replace("!"," ! ",$mStr); $mStr=str_replace("["," [ ",$mStr); $mStr=str_replace("]"," ] ",$mStr); $mStr=str_replace("("," ( ",$mStr); $mStr=str_replace(")"," ) ",$mStr); $mStr=str_replace("…"," … ",$mStr); $mStr=str_replace("="," = ",$mStr); $mStr=str_replace("+"," + ",$mStr); $mStr=str_replace(":"," : ",$mStr); $mStr=str_replace(";"," ; ",$mStr); $mStr=str_replace("§"," § ",$mStr); $mStr=str_replace("`"," ` ",$mStr); $mStr=str_replace(" "," ",$mStr); $mStr=str_replace(" "," ",$mStr); $mStr=str_replace(" "," ",$mStr); $arrList = mb_split("\s",$mStr); foreach ($arrList as $word){ if(strlen($word)>0){ $iLastWordIndex=$GLOBALS['g_wordCounter']; $GLOBALS['g_wordCounter']++; $GLOBALS['wordOrder']++; /*"id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un",style,"vri","sya","si","ka","pi","pa","kam"*/ $realWord=makeRealWord($word); if((mb_substr($realWord,0,3,"UTF-8")=="nti" || mb_substr($realWord,0,5,"UTF-8")=="ntyād" || $realWord=="ntveva" || $realWord=="nteva" )&& $word!="Nti"){ $lastWord=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1]; if($lastWord[5]!=""/* && $lastWord[15]=="bld"*/)//前一个词不是标点符号，是黑体 { $word=mb_substr($realWord,1); $realWord="i".$word; 
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."n"; $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]."ṃ"; // $GLOBALS['g_unPartCounter']++; $GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]; } else{ $lastWordIndex=getLastWordIndex($GLOBALS['g_wordCounter']-1); if($lastWordIndex>0){ $word=mb_substr($realWord,1); $realWord="i".$word; $GLOBALS['arrAllWords'][$lastWordIndex][4]=$GLOBALS['arrAllWords'][$lastWordIndex][4]."n"; $GLOBALS['arrAllWords'][$lastWordIndex][5]=$GLOBALS['arrAllWords'][$lastWordIndex][5]."ṃ"; $GLOBALS['g_unPartCounter']++; $GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$GLOBALS['arrAllWords'][$lastWordIndex][5]; } } } if($realWord=="ti" || mb_substr($realWord,0,4,"UTF-8")=="tiād"){ $lastWord=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1]; if($lastWord[5]!="")//前一个词不是标点符号，是黑体 { $strEndofWord=mb_substr($lastWord[5],-1,1,"UTF-8"); if($strEndofWord=="ā" || $strEndofWord=="ī" || $strEndofWord=="ū" ){ switch($strEndofWord){ case 'ā': $newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'a'; break; case 'ī': $newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'i'; break; case 'ū': $newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'u'; break; } //加入连读词列表 $GLOBALS['g_unWordCounter']++; $GLOBALS['arrUnWords'][$GLOBALS['g_unWordCounter']]=array("NULL",$lastWord[5].$realWord,".un.","","","","","$newUnWord+i".$realWord,"","","","","",mb_strlen($lastWord[5].$realWord,"UTF-8")); //加入连读词零件列表 $GLOBALS['g_unPartCounter']++; $GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$newUnWord; } //加入连读词列表 $GLOBALS['g_unWordCounter']++; $GLOBALS['arrUnWords'][$GLOBALS['g_unWordCounter']]=array("NULL",$lastWord[5].$realWord,".un.","","","","",$lastWord[5]."+i".$realWord,"","","","","",mb_strlen($lastWord[5].$realWord,"UTF-8")); //加入连读词零件列表 $GLOBALS['g_unPartCounter']++; 
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$lastWord[5]; //添加到单词列表 $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][10]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."+i".$realWord; $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]="{".$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."}".$word; $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5].$realWord; $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][6]=".un."; $GLOBALS['g_wordCounter']--; $word=""; $realWord=""; } else{//前一个词是标点符号 $lastWordIndex=getLastWordIndex($GLOBALS['g_wordCounter']-1); if($lastWordIndex>0){ //$word="ti"; $realWord="i{$realWord}"; $strEndofWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],-1,1,"UTF-8"); if($strEndofWord=="ā" || $strEndofWord=="ī" || $strEndofWord=="ū" ){ switch($strEndofWord){ case 'ā': $newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'a'; break; case 'ī': $newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'i'; break; case 'ū': $newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'u'; break; } //加入连读词零件列表 $GLOBALS['g_unPartCounter']++; $GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$newUnWord; } } } } if($word!=""){ $wordId=$GLOBALS['class'];//$GLOBALS['bookId']."-".$GLOBALS['vriParNum']."-".$GLOBALS['wordOrder']; $wordinfo=array($GLOBALS['g_wordCounter'],$wordId,$GLOBALS['bookId'],$GLOBALS['vriParNum'],$word,$realWord,"?","?","?","?","?","?","","","",$inClass,$GLOBALS['wordOrder'],0,0,0,0,0,0); $GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']]=$wordinfo; $lcWord=mb_strtolower($word,'UTF-8'); if(mb_strlen($word,"UTF-8")>1 && isPaliWord($lcWord)) { //$GLOBALS['arrAllPaliWordsCount'][$lcWord][0]=1; if(isset($GLOBALS['arrAllPaliWordsCount'][$realWord])){ $GLOBALS['arrAllPaliWordsCount'][$realWord][1]++; } else{ $GLOBALS['arrAllPaliWordsCount'][$realWord][1]=1; 
$GLOBALS['arrAllPaliWordsCount'][$realWord][2]=mb_strlen($realWord,"UTF-8"); //测试是否有非法字符 if($lcWord!="’ti"){ if(testPaliWord($lcWord)===FALSE){ $errorFileLine = $GLOBALS['from']; $errorFileName = $GLOBALS['FileName']; $GLOBALS['log'].="$errorFileLine,$errorFileName,error,char error:,".$word."\r\n"; echo "char error:".$word."
"; } } } $GLOBALS['g_paliWordCounter']++; } } } } return; } $xmlfile = $dirHtml.$inputFileName; $xmlparser = xml_parser_create(); echo "doing:".$xmlfile."
";

// ---------------------------------------------------------------------------
// Main conversion pass.
//
// 1. Read the input file ($xmlfile) and parse it into a flat list of tag
//    records with xml_parse_into_struct().
// 2. Walk the records; BODY / P / A / SPAN records are dispatched on their
//    "type" (open, close, complete, cdata) and their text is handed to
//    splitWords(), which fills the global word / TOC / union tables.
// 3. Append the accumulated $log to the log file and export the tables as CSV.
// ---------------------------------------------------------------------------

// Open the file and read its data.
// file_get_contents() is used instead of fopen()+fread(): the original never
// checked or closed the read handle, and fread() rejects a zero length, which
// is what filesize() yields for an empty file.
$xmldata = file_get_contents($xmlfile);
if($xmldata === FALSE){
    xml_parser_free($xmlparser);
    die("can not read file: ".$xmlfile);
}

// Parse into $values. A parse failure is remembered and written to the log
// below; processing continues with whatever records were produced.
$values = array();
$xmlParseError = "";
if(!xml_parse_into_struct($xmlparser,$xmldata,$values)){
    $xmlParseError = xml_error_string(xml_get_error_code($xmlparser))
        ." at line ".xml_get_current_line_number($xmlparser);
}
xml_parser_free($xmlparser);

$begin = false;
$suttaCount=0;
$output="";
$suttaName="";
$log=$log."$from,$FileName,open\r\n";
if($xmlParseError!==""){
    echo "xml parse error:".$xmlParseError;
    $log=$log."$from,$FileName,error,xml parse error,".$xmlParseError."\r\n";
}

foreach ($values as $child)
{
    // false when the record carries no attributes.
    $attributes=getChildNodeValue($child,"attributes");
    switch ($child["tag"])
    {
        case "BODY":
            // Data outside paragraph blocks cannot be processed; the HTML
            // file has to be corrected by hand. Here it is only reported.
            $parText="";
            switch($child["type"]){
                case "open":
                    $parText=getChildNodeValue($child,"value");
                    break;
                case "close":
                    break;
                case "complete":
                    $parText=getChildNodeValue($child,"value");
                    break;
                case "cdata":
                    $parText=$child["value"];
                    break;
                default:
                    echo "无法处理的段落块之外的数据。原因：无法识别的type:";
                    $log=$log."$from,$FileName,error,无法处理的段落块之外的数据,原因：无法识别的type in body tag\r\n";
                    break;
            }
            if(strlen($parText)>1){
                echo "段落块之外的数据:"."size".strlen($parText).$parText;
                $log=$log. "$from,$FileName,error,无法处理的段落块之外的数据,".$parText."\r\n";
            }
            break;

        case "P":
            $class=getChildNodeValue($attributes,"CLASS");
            switch($child["type"]){
                case "open":
                    // A new paragraph starts: advance the paragraph number,
                    // restart the word order and open a TOC row for it.
                    $vriParNum++;
                    $wordOrder=1;
                    $g_TocCounter++;
                    $arrToc[$g_TocCounter]=array('NULL',$bookId,$vriParNum,"0",$class,"","");
                    splitWords(getChildNodeValue($child,"value"));
                    $parBegin=true;
                    break;
                case "close":
                    if($parBegin){
                        $parBegin=false;
                    }
                    break;
                case "complete":
                    // Paragraph without child tags: same bookkeeping as
                    // "open", but the paragraph is finished immediately.
                    $vriParNum++;
                    $wordOrder=1;
                    $parText=getChildNodeValue($child,"value");
                    $g_TocCounter++;
                    $arrToc[$g_TocCounter]=array('NULL',$bookId,$vriParNum,"0",$class,"","");
                    splitWords($parText);
                    $parBegin=false;
                    break;
                case "cdata":
                    splitWords($child["value"]);
                    break;
                default:
                    echo "无法处理的块P。原因：无法识别的type:";
                    $log=$log."$from,$FileName,error,无法处理的块P,原因：无法识别的type\r\n";
                    break;
            }
            break;

        case "A":
            switch($child["type"]){
                case "open":
                    echo "无法处理的块A。原因：内部有嵌套其他的块
";
                    $log=$log."$from,$FileName,error,无法处理的块A,原因：内部有嵌套其他的块\r\n";
                    break;
                case "close":
                    break;
                case "complete":
                    $aName=getChildNodeValue($attributes,"NAME");
                    // The third argument is passed only when the anchor sits
                    // outside an open paragraph.
                    if($parBegin===false){
                        splitWords($aName,"#a#",1);
                    }
                    else{
                        splitWords($aName,"#a#");
                    }
                    break;
                default:
                    echo "无法处理的块A。原因：无法识别的type:".$child["type"];
                    $log=$log."$from,$FileName,error,无法处理的块A,原因：无法识别的type:".$child["type"]."\r\n";
                    break;
            }
            break;

        case "SPAN":
            $className=getChildNodeValue($attributes,"CLASS");
            // Not every record has a "value" entry; read it through isset()
            // so a missing value is null without raising an
            // "undefined array key" warning.
            $rawSpanValue=isset($child["value"]) ? $child["value"] : null;
            if($className=="paranum"){
                $currParNum=$rawSpanValue;
            }
            $spanValue=getChildNodeValue($child,"value");
            switch($child["type"]){
                case "open":
                    splitWords($rawSpanValue,$className);
                    break;
                case "close":
                    break;
                case "complete":
                    if($parBegin){
                        if(strlen($spanValue)>0){
                            splitWords($child["value"],$className);
                        }
                    }
                    else{
                        echo "无法处理的块span。原因：该块在段落外
";
                        $log=$log."$from,$FileName,error,无法处理的块span,原因：该块在段落外\r\n";
                    }
                    break;
                case "cdata":
                    splitWords($child["value"]);
                    break;
                default:
                    echo "无法处理的块span。原因：无法识别的type:";
                    $log=$log. "$from,$FileName,error,无法处理的块span,原因：无法识别的type:\r\n";
                    break;
            }
            break;

        default:
            echo "无法处理的tag:".$child["tag"];
            $log=$log. "$from,$FileName,error,无法处理的tag,".$child["tag"]."\r\n";
            break;
    }
}

// Append this run's log entries to the shared log file. The handle is checked
// so an unwritable log directory is reported instead of failing in fwrite().
$myLogFile = fopen($dirLog."palicanoon.log", "a");
if($myLogFile !== FALSE){
    fwrite($myLogFile, $log);
    fclose($myLogFile);
}
else{
    echo "can not open log file. filename=".$dirLog."palicanoon.log";
}

// Toc
// Each TOC row is written to three files: <from+1>_title.csv, *_toc.csv and
// *_pali.csv. The paragraph class (column 4) is mapped to a numeric level
// (column 3); classes that are not headings get level 100. For heading rows
// the collected text (column 6) is copied into the title column (column 5) of
// the toc/title output, while the pali output keeps the row unchanged apart
// from the level.
$counter=0;
$tocLevelByClass=array(
    "book"=>1,
    "chapter"=>2,
    "title"=>3,
    "subhead"=>4,
    "subsubhead"=>5,
    "hangnum"=>8,
);
$fptitle=fopen($dirXmlBase.$dirXml."/".($from+1)."_title.csv", "w");
if($fptitle === FALSE){
    echo "error: can not open output file toc .";
}
if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv", "w"))!==FALSE){
    $fpPaliText=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_pali.csv", "w");
    if($fpPaliText === FALSE){
        echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_pali.csv";
    }
    foreach($arrToc as $xWord){
        $tocClass=$xWord[4];
        if(is_string($tocClass) && isset($tocLevelByClass[$tocClass])){
            $tocLevel=$tocLevelByClass[$tocClass];
        }
        else{
            $tocLevel=100;
        }
        $xPali=$xWord;
        $xPali[3]=$tocLevel;
        $xWord[3]=$tocLevel;
        if($xWord[3] < 100){
            $xWord[5] = $xWord[6];
        }
        // Only write to the handles that were actually opened.
        if($fpPaliText !== FALSE){
            fputcsv($fpPaliText,$xPali);
        }
        fputcsv($fp,$xWord);
        if($fptitle !== FALSE){
            fputcsv($fptitle,$xWord);
        }
        $counter++;
    }
    if($fpPaliText !== FALSE){
        fclose($fpPaliText);
    }
    fclose($fp);
    echo "TOC 表导出到：".$dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv
";
}
else{
    echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv";
}
// Close the title file on both paths; the original leaked it when the toc
// file could not be opened.
if($fptitle !== FALSE){
    fclose($fptitle);
}

/* Word table */
if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead.".csv", "w"))!==FALSE){
    foreach($arrAllWords as $xWord){
        fputcsv($fp,$xWord);
    }
    fclose($fp);
    echo "单词表导出到：".$dirXmlBase.$dirXml.$outputFileNameHead.".csv
";
}
else{
    echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead.".csv";
}

/* Union table */
if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_un.csv", "w"))!==FALSE){
    foreach($arrUnWords as $xWord){
        fputcsv($fp,$xWord);
    }
    fclose($fp);
    echo "union表导出到：".$dirXmlBase.$dirXml.$outputFileNameHead."_un.csv
";
}
else{
    echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_un.csv";
}

/* Union part table: one plain entry per line, not CSV-encoded */
if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_un_part.csv", "w"))!==FALSE){
    foreach($arrUnPart as $xWord){
        fwrite($fp,$xWord."\r\n");
    }
    fclose($fp);
    echo "union part 表导出到：".$dirXmlBase.$dirXml.$outputFileNameHead."_un_part.csv
";
}
else{
    echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_un_part.csv";
}

/* Pali word statistics table */
$countCsvFileName=$dirXmlBase.$dirXml.$outputFileNameHead."_analysis.csv";
if(($fp=fopen($countCsvFileName, "w"))!==FALSE){
    $wordCountCsvHead=array("编号","词","数量","百分比","长度");
    fputcsv($fp,$wordCountCsvHead);
    $i=0;
    foreach($arrAllPaliWordsCount as $x=>$x_value){
        $i++;
        // Columns: running number, word, count, count*10000/total, length.
        // $g_paliWordCounter is incremented once per counted word, so it is
        // non-zero whenever this loop has at least one entry.
        $csvWord=array(
            $i,
            $x,
            $x_value[1],
            $x_value[1]*10000/$g_paliWordCounter,
            $x_value[2],
        );
        fputcsv($fp,$csvWord);
    }
    fclose($fp);
    echo "Pali单词表统计导出到：".$countCsvFileName."
";
}
else{
    echo "can not open csv file. filename=".$countCsvFileName."
"; } ?> =$to){ echo "

齐活！功德无量！all done!

"; } else{ echo ""; echo "正在载入:".($from+1)."——".$filelist[$from+1][0]; } ?>