Home
Doing $from / $to Don't close this window";
// ---- Load the list of source files ---------------------------------------
// Rows of filelist.csv are used below as: [0] file id, [1] base file name
// (without extension), [2] book id.
$filelist=array();
$fileNums=0;
$log="";
if(($handle=fopen("filelist.csv",'r'))!==FALSE){
    // Read each row into a temporary first, so the FALSE that fgetcsv()
    // returns at end of file is not stored as a trailing $filelist entry.
    while(($row=fgetcsv($handle,0,','))!==FALSE){
        $filelist[$fileNums]=$row;
        $fileNums++;
    }
    fclose($handle);
}
else{
    die('can not open filelist.csv');
}
// $to==0 or an out-of-range $to means "run to the last file in the list".
if($to==0 || $to>=$fileNums) $to=$fileNums-1;
if(!isset($filelist[$from])){
    die('file index '.$from.' is not in filelist.csv');
}

// ---- Per-file settings ---------------------------------------------------
$FileName=$filelist[$from][1].".htm";
$fileId=$filelist[$from][0];
$dirLog=_DIR_LOG_."/";
$dirHtml=_DIR_PALI_HTML_."/";
$inputFileName=$FileName;
$outputFileNameHead=$filelist[$from][1];
$bookId=$filelist[$from][2];
$vriParNum=0;
$wordOrder=1;
$dirXmlBase=_DIR_PALI_CSV_."/";
// Output sub-directory, named after the input file's base name.
$dirXml=$outputFileNameHead."/";
$currChapter="";
$currParNum="";
$class="";

// ---- Output tables -------------------------------------------------------
// Row 0 of each table holds the column names; the tables are written out as
// CSV files at the end of the script.
$arrAllWords[0]=array("id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un","style","vri","sya","si","ka","pi","pa","kam");
$g_wordCounter=0;
$arrUnWords[0]=array("id","word","type","gramma","parent","mean","note","part","partmean","cf","state","delete","tag","len");
$g_unWordCounter=0;
$arrToc[0]=array("id","book","par_num","level","class","title","text");
$g_TocCounter=0;
$arrUnPart[0]="word";
// Starts at -1: the counter is incremented before each store, so the first
// stored part lands on index 0.
$g_unPartCounter=-1;
/* Statistics with punctuation marks excluded */
$arrAllPaliWordsCount=array();
$g_paliWordCounter=0;
$g_wordCounterInSutta=0;
$g_paliWordCountCounter=0;

// ---- Preconditions: input file and output directories --------------------
if(!file_exists($dirHtml.$inputFileName)){
    die('file "'.$dirHtml.$inputFileName.'" not exists...');
}
if(!is_dir(_DIR_PALI_CSV_)){
    if (!mkdir(_DIR_PALI_CSV_)) {
        die('Failed to create folders...');
    }
}
if(!is_dir($dirXmlBase.$dirXml)){
    if (!mkdir($dirXmlBase.$dirXml)) {
        die('Failed to create folders...');
    }
}
// TRUE while the parser is inside an opened <P> element.
$parBegin=false;
/**
 * Looks up a key in an array and returns the value stored under it.
 *
 * Keys are visited in iteration order and compared to $attName with loose
 * equality (==); the first match wins.
 *
 * @param mixed $array   Array to search. Any falsy value (false, null, an
 *                       empty array) yields false without iterating.
 * @param mixed $attName Key to look for.
 * @return mixed The value of the first matching key, or false if none matches.
 */
function getChildNodeValue($array,$attName){
    if(!$array){
        return false;
    }
    foreach($array as $key=>$value){
        if($key==$attName){
            return $value;
        }
    }
    return false;
}
//Scans the string inWord for illegal characters. Returns FALSE if one is found, TRUE if none is found.
function testPaliWord($inWord){
$paliletter="āīūṅñṭḍṇḷṃṁŋĀĪŪṄÑṬḌṆḶṂṀŊabcdefghijklmnoprstuvyABCDEFGHIJKLMNOPRSTUVY-";
for($i=0;$iout side tag:a $word insert next paragraph $thisParNum";
}
$wordId=$GLOBALS['bookId']."-".$thisParNum."-".$thisWordOrder;
$wordinfo=array($GLOBALS['g_wordCounter'],$wordId,$GLOBALS['bookId'],$thisParNum,$word,$realWord,".ctl.",".a.","?","?","?","?","","","NULL",$inClass,$thisWordOrder,0,0,0,0,0,0);
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']]=$wordinfo;
return;
}
//toc out put
$GLOBALS['arrToc'][$GLOBALS['g_TocCounter']][6] .= $inStr;
if($GLOBALS['tocOnly']=="on"){
return;
}
$paliletter="āīūṅñṭḍṇḷṃṁŋĀĪŪṄÑṬḌṆḶṂṀŊabcdefghijklmnoprstuvyABCDEFGHIJKLMNOPRSTUVY-";
$mStr=str_replace("‘"," ‘ ",$mStr);
$mStr=str_replace("’"," ’ ",$mStr);
$mStr=str_replace(","," , ",$mStr);
$mStr=str_replace("."," . ",$mStr);
$mStr=str_replace("?"," ? ",$mStr);
$mStr=str_replace("!"," ! ",$mStr);
$mStr=str_replace("["," [ ",$mStr);
$mStr=str_replace("]"," ] ",$mStr);
$mStr=str_replace("("," ( ",$mStr);
$mStr=str_replace(")"," ) ",$mStr);
$mStr=str_replace("…"," … ",$mStr);
$mStr=str_replace("="," = ",$mStr);
$mStr=str_replace("+"," + ",$mStr);
$mStr=str_replace(":"," : ",$mStr);
$mStr=str_replace(";"," ; ",$mStr);
$mStr=str_replace("§"," § ",$mStr);
$mStr=str_replace("`"," ` ",$mStr);
$mStr=str_replace(" "," ",$mStr);
$mStr=str_replace(" "," ",$mStr);
$mStr=str_replace(" "," ",$mStr);
$arrList = mb_split("\s",$mStr);
foreach ($arrList as $word){
if(strlen($word)>0){
$iLastWordIndex=$GLOBALS['g_wordCounter'];
$GLOBALS['g_wordCounter']++;
$GLOBALS['wordOrder']++;
/*"id","wid","book","paragraph","word","real","type","gramma","mean","note","part","partmean","bmc","bmt","un",style,"vri","sya","si","ka","pi","pa","kam"*/
$realWord=makeRealWord($word);
if((mb_substr($realWord,0,3,"UTF-8")=="nti" || mb_substr($realWord,0,5,"UTF-8")=="ntyād" || $realWord=="ntveva" || $realWord=="nteva" )&& $word!="Nti"){
$lastWord=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1];
if($lastWord[5]!=""/* && $lastWord[15]=="bld"*/)//前一个词不是标点符号,是黑体
{
$word=mb_substr($realWord,1);
$realWord="i".$word;
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."n";
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]."ṃ";
//
$GLOBALS['g_unPartCounter']++;
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5];
}
else{
$lastWordIndex=getLastWordIndex($GLOBALS['g_wordCounter']-1);
if($lastWordIndex>0){
$word=mb_substr($realWord,1);
$realWord="i".$word;
$GLOBALS['arrAllWords'][$lastWordIndex][4]=$GLOBALS['arrAllWords'][$lastWordIndex][4]."n";
$GLOBALS['arrAllWords'][$lastWordIndex][5]=$GLOBALS['arrAllWords'][$lastWordIndex][5]."ṃ";
$GLOBALS['g_unPartCounter']++;
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$GLOBALS['arrAllWords'][$lastWordIndex][5];
}
}
}
if($realWord=="ti" || mb_substr($realWord,0,4,"UTF-8")=="tiād"){
$lastWord=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1];
if($lastWord[5]!="")//前一个词不是标点符号,是黑体
{
$strEndofWord=mb_substr($lastWord[5],-1,1,"UTF-8");
if($strEndofWord=="ā" || $strEndofWord=="ī" || $strEndofWord=="ū" ){
switch($strEndofWord){
case 'ā':
$newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'a';
break;
case 'ī':
$newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'i';
break;
case 'ū':
$newUnWord=mb_substr($lastWord[5],0,-1,"UTF-8").'u';
break;
}
//加入连读词列表
$GLOBALS['g_unWordCounter']++;
$GLOBALS['arrUnWords'][$GLOBALS['g_unWordCounter']]=array("NULL",$lastWord[5].$realWord,".un.","","","","","$newUnWord+i".$realWord,"","","","","",mb_strlen($lastWord[5].$realWord,"UTF-8"));
//加入连读词零件列表
$GLOBALS['g_unPartCounter']++;
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$newUnWord;
}
//加入连读词列表
$GLOBALS['g_unWordCounter']++;
$GLOBALS['arrUnWords'][$GLOBALS['g_unWordCounter']]=array("NULL",$lastWord[5].$realWord,".un.","","","","",$lastWord[5]."+i".$realWord,"","","","","",mb_strlen($lastWord[5].$realWord,"UTF-8"));
//加入连读词零件列表
$GLOBALS['g_unPartCounter']++;
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$lastWord[5];
//添加到单词列表
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][10]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."+i".$realWord;
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]="{".$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][4]."}".$word;
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5]=$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][5].$realWord;
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']-1][6]=".un.";
$GLOBALS['g_wordCounter']--;
$word="";
$realWord="";
}
else{//前一个词是标点符号
$lastWordIndex=getLastWordIndex($GLOBALS['g_wordCounter']-1);
if($lastWordIndex>0){
//$word="ti";
$realWord="i{$realWord}";
$strEndofWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],-1,1,"UTF-8");
if($strEndofWord=="ā" || $strEndofWord=="ī" || $strEndofWord=="ū" ){
switch($strEndofWord){
case 'ā':
$newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'a';
break;
case 'ī':
$newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'i';
break;
case 'ū':
$newUnWord=mb_substr($GLOBALS['arrAllWords'][$lastWordIndex][5],0,-1,"UTF-8").'u';
break;
}
//加入连读词零件列表
$GLOBALS['g_unPartCounter']++;
$GLOBALS['arrUnPart'][$GLOBALS['g_unPartCounter']]=$newUnWord;
}
}
}
}
if($word!=""){
$wordId=$GLOBALS['class'];//$GLOBALS['bookId']."-".$GLOBALS['vriParNum']."-".$GLOBALS['wordOrder'];
$wordinfo=array($GLOBALS['g_wordCounter'],$wordId,$GLOBALS['bookId'],$GLOBALS['vriParNum'],$word,$realWord,"?","?","?","?","?","?","","","",$inClass,$GLOBALS['wordOrder'],0,0,0,0,0,0);
$GLOBALS['arrAllWords'][$GLOBALS['g_wordCounter']]=$wordinfo;
$lcWord=mb_strtolower($word,'UTF-8');
if(mb_strlen($word,"UTF-8")>1 && isPaliWord($lcWord))
{
//$GLOBALS['arrAllPaliWordsCount'][$lcWord][0]=1;
if(isset($GLOBALS['arrAllPaliWordsCount'][$realWord])){
$GLOBALS['arrAllPaliWordsCount'][$realWord][1]++;
}
else{
$GLOBALS['arrAllPaliWordsCount'][$realWord][1]=1;
$GLOBALS['arrAllPaliWordsCount'][$realWord][2]=mb_strlen($realWord,"UTF-8");
//测试是否有非法字符
if($lcWord!="’ti"){
if(testPaliWord($lcWord)===FALSE){
$errorFileLine = $GLOBALS['from'];
$errorFileName = $GLOBALS['FileName'];
$GLOBALS['log'].="$errorFileLine,$errorFileName,error,char error:,".$word."\r\n";
echo "char error:".$word."
";
}
}
}
$GLOBALS['g_paliWordCounter']++;
}
}
}
}
return;
}
// ---- Parse the source HTML file into a flat list of elements --------------
$xmlfile = $dirHtml.$inputFileName;
echo "doing:".$xmlfile."
";
// Read the whole file in one call. This handles an unreadable file (returns
// FALSE) and an empty file (returns "") without a separate fopen/fread pair,
// and leaves no file handle open.
$xmldata = file_get_contents($xmlfile);
if($xmldata===FALSE){
    die("can not read file. filename=".$xmlfile);
}
$xmlparser = xml_parser_create();
$values = array();
$xmlParseError = "";
// xml_parse_into_struct() returns 0 on failure; keep the parser's message so
// it can be logged. Whatever was parsed before the error is still processed.
if(!xml_parse_into_struct($xmlparser,$xmldata,$values)){
    $xmlParseError = xml_error_string(xml_get_error_code($xmlparser))." at line ".xml_get_current_line_number($xmlparser);
}
xml_parser_free($xmlparser);
$begin = false;
$suttaCount=0;
$output="";
$suttaName="";
$log=$log."$from,$FileName,open\r\n";
if($xmlParseError!==""){
    echo "xml parse error:".$xmlParseError;
    $log=$log."$from,$FileName,error,xml parse error,".$xmlParseError."\r\n";
}
// Walk the flat element list built by xml_parse_into_struct() and hand the
// text of BODY / P / A / SPAN elements to splitWords(). Anything that cannot
// be handled is echoed and appended to $log.
foreach ($values as $child)
{
    $attributes=getChildNodeValue($child,"attributes");
    // The "value" key may be absent on an element. Read it once, guarded
    // (null when absent), instead of indexing $child["value"] directly.
    $childValue=isset($child["value"]) ? $child["value"] : null;
    switch ($child["tag"])
    {
        case "BODY":
            // Data outside of any paragraph block cannot be processed; the
            // HTML file has to be fixed by hand.
            $parText="";
            switch($child["type"]){
                case "open":
                    $parText=getChildNodeValue($child,"value");
                    break;
                case "close":
                    break;
                case "complete":
                    $parText=getChildNodeValue($child,"value");
                    break;
                case "cdata":
                    $parText=$childValue;
                    break;
                default:
                    echo "无法处理的段落块之外的数据。原因:无法识别的type:";
                    $log=$log."$from,$FileName,error,无法处理的段落块之外的数据,原因:无法识别的type in body tag\r\n";
                    break;
            }
            // More than one byte of text directly under BODY is reported as an error.
            if(strlen($parText)>1){
                echo "段落块之外的数据:"."size".strlen($parText).$parText;
                $log=$log. "$from,$FileName,error,无法处理的段落块之外的数据,".$parText."\r\n";
            }
            break;
        case "P":
            $class=getChildNodeValue($attributes,"CLASS");
            switch($child["type"]){
                case "open":
                    // A new paragraph: advance the paragraph number, restart
                    // the word order and open a new TOC row for it.
                    $vriParNum++;
                    $wordOrder=1;
                    $g_TocCounter++;
                    $arrToc[$g_TocCounter]=array('NULL',$bookId,$vriParNum,"0",$class,"","");
                    splitWords(getChildNodeValue($child,"value"));
                    $parBegin=true;
                    break;
                case "close":
                    if($parBegin){
                        $parBegin=false;
                    }
                    break;
                case "complete":
                    // A paragraph with no child elements: same bookkeeping as
                    // "open", but the paragraph is already finished.
                    $vriParNum++;
                    $wordOrder=1;
                    $parText=getChildNodeValue($child,"value");
                    $g_TocCounter++;
                    $arrToc[$g_TocCounter]=array('NULL',$bookId,$vriParNum,"0",$class,"","");
                    splitWords($parText);
                    $parBegin=false;
                    break;
                case "cdata":
                    splitWords($childValue);
                    break;
                default:
                    echo "无法处理的块P。原因:无法识别的type:";
                    $log=$log."$from,$FileName,error,无法处理的块P,原因:无法识别的type\r\n";
                    break;
            }
            break;
        case "A":
            switch($child["type"]){
                case "open":
                    // An <A> element with nested content is not supported.
                    echo "无法处理的块A。原因:内部有嵌套其他的块
";
                    $log=$log."$from,$FileName,error,无法处理的块A,原因:内部有嵌套其他的块\r\n";
                    break;
                case "close":
                    break;
                case "complete":
                    $aName=getChildNodeValue($attributes,"NAME");
                    // Outside a paragraph the anchor is passed with an extra
                    // third argument of 1.
                    if($parBegin===false){
                        splitWords($aName,"#a#",1);
                    }
                    else{
                        splitWords($aName,"#a#");
                    }
                    break;
                default:
                    echo "无法处理的块A。原因:无法识别的type:".$child["type"];
                    $log=$log."$from,$FileName,error,无法处理的块A,原因:无法识别的type:".$child["type"]."\r\n";
                    break;
            }
            break;
        case "SPAN":
            $className=getChildNodeValue($attributes,"CLASS");
            if($className=="paranum"){
                $currParNum=$childValue;
            }
            $spanValue=getChildNodeValue($child,"value");
            switch($child["type"]){
                case "open":
                    splitWords($childValue,$className);
                    break;
                case "close":
                    break;
                case "complete":
                    if($parBegin){
                        if(strlen($spanValue)>0){
                            splitWords($childValue,$className);
                        }
                    }
                    else{
                        // A span outside of a paragraph is reported, not processed.
                        echo "无法处理的块span。原因:该块在段落外
";
                        $log=$log."$from,$FileName,error,无法处理的块span,原因:该块在段落外\r\n";
                    }
                    break;
                case "cdata":
                    splitWords($childValue);
                    break;
                default:
                    echo "无法处理的块span。原因:无法识别的type:";
                    $log=$log. "$from,$FileName,error,无法处理的块span,原因:无法识别的type:\r\n";
            }
            break;
        default:
            echo "无法处理的tag:".$child["tag"];
            $log=$log. "$from,$FileName,error,无法处理的tag,".$child["tag"]."\r\n";
    }
}
// Append the log lines collected during this run to the shared log file.
// A log file that cannot be opened is reported, but must not stop the CSV
// export that follows.
$myLogFile = fopen($dirLog."palicanoon.log", "a");
if($myLogFile!==FALSE){
    fwrite($myLogFile, $log);
    fclose($myLogFile);
}
else{
    echo "can not open log file. filename=".$dirLog."palicanoon.log";
}
// ---- TOC export ----------------------------------------------------------
// Every row of $arrToc (column-name row included) is written to three files:
//   <n>_title.csv     - same rows as the toc file, named by file index $from+1
//   <name>_toc.csv    - level set from the paragraph class; for heading rows
//                       the title column is filled from the text column
//   <name>_pali.csv   - level set, title column left as it was
// The title and pali files are optional: if one cannot be opened it is
// reported and skipped, instead of being written to through an invalid handle.
$counter=0;
$fptitle=fopen($dirXmlBase.$dirXml."/".($from+1)."_title.csv", "w");
if($fptitle === FALSE){
    echo "error: can not open output file toc .";
}
if(($fp=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv", "w"))!==FALSE){
    $fpPaliText=fopen($dirXmlBase.$dirXml.$outputFileNameHead."_pali.csv", "w");
    if($fpPaliText === FALSE){
        echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_pali.csv";
    }
    foreach($arrToc as $xWord){
        $xPali=$xWord;
        // Map the paragraph class (column 4) to a heading level (column 3).
        // Level 100 marks a row that is not a heading.
        switch($xWord[4]){
            case "book":
                $level=1;
                break;
            case "chapter":
                $level=2;
                break;
            case "title":
                $level=3;
                break;
            case "subhead":
                $level=4;
                break;
            case "subsubhead":
                $level=5;
                break;
            case "hangnum":
                $level=8;
                break;
            default:
                $level=100;
                break;
        }
        $xWord[3]=$level;
        $xPali[3]=$level;
        // Heading rows get their collected text (column 6) as title (column 5).
        if($xWord[3] < 100){
            $xWord[5] = $xWord[6];
        }
        if($fpPaliText !== FALSE){
            fputcsv($fpPaliText,$xPali);
        }
        fputcsv($fp,$xWord);
        if($fptitle !== FALSE){
            fputcsv($fptitle,$xWord);
        }
        $counter++;
    }
    if($fpPaliText !== FALSE){
        fclose($fpPaliText);
    }
    fclose($fp);
    echo "TOC 表导出到:".$dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv
";
}
else{
    echo "can not open csv file. filename=".$dirXmlBase.$dirXml.$outputFileNameHead."_toc.csv";
}
// Close the title file on every path, including when the toc file failed.
if($fptitle !== FALSE){
    fclose($fptitle);
}
/* Word table: one CSV row per entry of $arrAllWords (row 0 holds the column names). */
$wordCsvPath=$dirXmlBase.$dirXml.$outputFileNameHead.".csv";
$fp=fopen($wordCsvPath, "w");
if($fp===FALSE){
    echo "can not open csv file. filename=".$wordCsvPath;
}
else{
    foreach($arrAllWords as $wordRow){
        fputcsv($fp,$wordRow);
    }
    fclose($fp);
    echo "单词表导出到:".$wordCsvPath."
";
}
/* Union table: one CSV row per entry of $arrUnWords (row 0 holds the column names). */
$unionCsvPath=$dirXmlBase.$dirXml.$outputFileNameHead."_un.csv";
$fp=fopen($unionCsvPath, "w");
if($fp===FALSE){
    echo "can not open csv file. filename=".$unionCsvPath;
}
else{
    foreach($arrUnWords as $unionRow){
        fputcsv($fp,$unionRow);
    }
    fclose($fp);
    echo "union表导出到:".$unionCsvPath."
";
}
/* Union part table: each entry of $arrUnPart on its own CRLF-terminated line. */
$unionPartCsvPath=$dirXmlBase.$dirXml.$outputFileNameHead."_un_part.csv";
$fp=fopen($unionPartCsvPath, "w");
if($fp===FALSE){
    echo "can not open csv file. filename=".$unionPartCsvPath;
}
else{
    foreach($arrUnPart as $unionPart){
        fwrite($fp,$unionPart."\r\n");
    }
    fclose($fp);
    echo "union part 表导出到:".$unionPartCsvPath."
";
}
/* Pali word statistics table: one row per distinct word in $arrAllPaliWordsCount. */
$countCsvFileName=$dirXmlBase.$dirXml.$outputFileNameHead."_analysis.csv";
$fp=fopen($countCsvFileName, "w");
if($fp===FALSE){
    echo "can not open csv file. filename=".$countCsvFileName."
";
}
else{
    $wordCountCsvHead=array("编号","词","数量","百分比","长度");
    fputcsv($fp,$wordCountCsvHead);
    $i=0;
    foreach($arrAllPaliWordsCount as $paliWord=>$stats){
        $i++;
        // Columns: running number, word, occurrences ($stats[1]),
        // occurrences*10000/total counted words, word length ($stats[2]).
        $csvWord=array(
            $i,
            $paliWord,
            $stats[1],
            $stats[1]*10000/$g_paliWordCounter,
            $stats[2]
        );
        fputcsv($fp,$csvWord);
    }
    fclose($fp);
    echo "Pali单词表统计导出到:".$countCsvFileName."
";
}
?>
=$to){
echo "齐活!功德无量!all done!
";
}
else{
echo "";
echo "正在载入:".($from+1)."——".$filelist[$from+1][0];
}
?>