Browse Source

关键词权重预处理

Bhikkhu-Kosalla 5 years ago
parent
commit
5646fdc4b1

+ 140 - 0
app/admin/word_index_weight_refresh.php

@@ -0,0 +1,140 @@
+<?php
+/*
+ * Recomputes the `weight` column of the `word` table for a range of books.
+ *
+ * Usage:
+ *   CLI: php word_index_weight_refresh.php <from> <to>
+ *   Web: word_index_weight_refresh.php?from=<from>&to=<to>
+ *
+ * For every paragraph of every book in [from, to] each word gets a weight,
+ * stored as (int)(weight * 100):
+ *   - non-bold word:        min(1.9, 1.01^n) / n, where n is the number of
+ *                           occurrences of the same wordindex in the paragraph
+ *   - stand-alone bold word: 100 + $book_weight[book]
+ *   - word in a bold run:    10 + len(word) / (sum of len over the whole run)
+ */
+
+require_once '../path.php';
+require_once './word_index_weight_table.php';
+
+# Highest book id that is processed; larger "to" values are clamped to it.
+$max_book = 217;
+
+if(isset($_GET["from"])){
+	if(!isset($_GET["to"])){
+		echo "无效的参数 ";
+		exit;
+	}
+	# Cast to int: the values are interpolated into SQL statements below.
+	$from = (int)$_GET["from"];
+	$to = (int)$_GET["to"];
+}
+else{
+	if (!isset($argc) || $argc != 3){
+		echo "无效的参数 ";
+		exit;
+	}
+	$from = (int)$argv[1];
+	$to = (int)$argv[2];
+}
+# The clamp applies to both the web and the CLI entry point.
+if($to>$max_book){
+	$to = $max_book;
+}
+
+
+$dh_word = new PDO("sqlite:"._FILE_DB_WORD_INDEX_, "", "");
+$dh_word->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
+
+$dh_pali = new PDO("sqlite:"._FILE_DB_PALI_INDEX_, "", "");
+$dh_pali->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
+
+# The UPDATE statement is the same for every paragraph, so prepare it once.
+$stmt_weight = $dh_pali->prepare("UPDATE word set weight = ? where id=? ");
+if (!$stmt_weight) {
+	$error = $dh_pali->errorInfo();
+	echo "error - $error[2]";
+	exit;
+}
+
+# Cache of wordindex id => len, so each word length is queried only once.
+$len_cache = array();
+
+echo "from=$from to = $to \n";
+for ($i=$from; $i <=$to ; $i++) {
+	$time_start = microtime(true);
+	echo "正在处理 book= $i ";
+	$stmt = $dh_pali->query("SELECT max(paragraph) from word where book={$i}");
+	# With ERRMODE_WARNING a failed query returns false instead of throwing.
+	$row = $stmt ? $stmt->fetch(PDO::FETCH_NUM) : false;
+	# max() yields a single NULL row when the book has no words at all.
+	if ($row && $row[0] !== null) {
+		$max_para = (int)$row[0];
+		echo "段落数量:$max_para \n";
+		for ($j=0; $j <=$max_para ; $j++) {
+			$stmt = $dh_pali->query("SELECT id,book,wordindex,bold from word where book={$i} and paragraph={$j} order by id ASC");
+			if (!$stmt) {
+				$error = $dh_pali->errorInfo();
+				echo "error - $error[2]";
+				continue;
+			}
+			$fetch = $stmt->fetchAll(PDO::FETCH_ASSOC);
+			$word_count = count($fetch);
+			if ($word_count == 0) {
+				# Nothing to update for this paragraph.
+				continue;
+			}
+
+			# Occurrences of each wordindex within this paragraph. Counted from
+			# the rows already fetched instead of issuing a second GROUP BY query.
+			$vocabulary = array();
+			foreach ($fetch as $word) {
+				$key = $word["wordindex"];
+				$vocabulary[$key] = isset($vocabulary[$key]) ? $vocabulary[$key] + 1 : 1;
+			}
+
+			for ($iWord=0; $iWord <$word_count ; $iWord++) {
+				if($fetch[$iWord]["bold"]==0){
+					# Non-bold word: the paragraph-wide score (capped at 1.9) is
+					# shared equally between all occurrences of the word.
+					$count = $vocabulary[$fetch[$iWord]["wordindex"]];
+					$paraWeight = min(1.9, pow(1.01,$count));
+					$weight = $paraWeight/$count;
+				}
+				else{
+					# Bold word: find the run of adjacent bold words around it.
+					$begin = $iWord;
+					while ($begin>0 && $fetch[$begin-1]["bold"]!=0) {
+						$begin--;
+					}
+					$end = $iWord;
+					while ($end<$word_count-1 && $fetch[$end+1]["bold"]!=0) {
+						$end++;
+					}
+					$bold_count = $end-$begin+1;
+					if($bold_count==1){
+						# Stand-alone bold word.
+						# NOTE(review): the original looked up the word here and
+						# tested for a "ti" suffix, but both branches assigned this
+						# same weight, so the lookup was dropped. Re-add it if words
+						# ending in "ti" are meant to be weighted differently.
+						$bookId = (int)$fetch[$iWord]["book"];
+						$weight = 100+(isset($book_weight[$bookId]) ? $book_weight[$bookId] : 0);
+					}
+					else{
+						# Run of several bold words: each word gets a share
+						# proportional to its length within the run.
+						$len_sum = 0;
+						$len_curr = 0;
+						for ($iBold=$begin; $iBold <=$end ; $iBold++) {
+							$boldid = (int)$fetch[$iBold]["wordindex"];
+							if(!array_key_exists($boldid, $len_cache)){
+								$stmt_bold = $dh_word->query("SELECT len from wordindex where id=".$boldid);
+								$wordbold = $stmt_bold ? $stmt_bold->fetch(PDO::FETCH_ASSOC) : false;
+								# A missing row or NULL len counts as length 0.
+								$len_cache[$boldid] = ($wordbold && isset($wordbold["len"])) ? $wordbold["len"] : 0;
+							}
+							$len_sum += $len_cache[$boldid];
+							# Compare against the current word's position in the
+							# paragraph (the original compared against the book
+							# number, so $len_curr was practically never set).
+							if($iBold==$iWord){
+								$len_curr = $len_cache[$boldid];
+							}
+						}
+						# Guard against a run whose lengths are all unknown.
+						$weight = $len_sum>0 ? 10+$len_curr/$len_sum : 10;
+					}
+				}
+				$fetch[$iWord]["weight"] = (int)($weight*100);
+			}
+
+			# Write the weights of this paragraph in one transaction; roll back
+			# as soon as a single update fails.
+			$dh_pali->beginTransaction();
+			$failed = false;
+			foreach ($fetch as $value) {
+				if(!$stmt_weight->execute(array($value["weight"],$value["id"]))){
+					$failed = true;
+					break;
+				}
+			}
+			if ($failed) {
+				$error = $stmt_weight->errorInfo();
+				$dh_pali->rollBack();
+				echo "error - $error[2]";
+			}
+			else{
+				$dh_pali->commit();
+			}
+		}
+    } else {
+        echo "无法获取段落最大值";
+	}
+	echo "处理时间 :".( microtime(true)-$time_start);
+}
+?>

+ 220 - 0
app/admin/word_index_weight_table.php

@@ -0,0 +1,220 @@
+<?php
+# Per-book weight table, keyed by book id (1..217).
+# word_index_weight_refresh.php adds the entry of a word's book to the base
+# weight of a stand-alone bold word.
+# Entries are listed in descending value order, except books 64-68, which
+# are all set to 1.
+$book_weight = array(
+	93 => 0.995,
+	94 => 0.991,
+	95 => 0.986,
+	164 => 0.982,
+	165 => 0.977,
+	166 => 0.972,
+	167 => 0.968,
+	168 => 0.963,
+	169 => 0.959,
+	170 => 0.954,
+	171 => 0.95,
+	84 => 0.945,
+	85 => 0.94,
+	86 => 0.936,
+	87 => 0.931,
+	88 => 0.927,
+	89 => 0.922,
+	90 => 0.917,
+	91 => 0.913,
+	92 => 0.908,
+	82 => 0.904,
+	83 => 0.899,
+	154 => 0.894,
+	156 => 0.89,
+	157 => 0.885,
+	158 => 0.881,
+	159 => 0.876,
+	160 => 0.872,
+	161 => 0.867,
+	162 => 0.862,
+	163 => 0.858,
+	143 => 0.853,
+	144 => 0.849,
+	145 => 0.844,
+	146 => 0.839,
+	147 => 0.835,
+	148 => 0.83,
+	149 => 0.826,
+	150 => 0.821,
+	151 => 0.817,
+	152 => 0.812,
+	153 => 0.807,
+	155 => 0.803,
+	213 => 0.798,
+	214 => 0.794,
+	215 => 0.789,
+	216 => 0.784,
+	217 => 0.78,
+	73 => 0.775,
+	74 => 0.771,
+	75 => 0.766,
+	76 => 0.761,
+	77 => 0.757,
+	78 => 0.752,
+	79 => 0.748,
+	80 => 0.743,
+	81 => 0.739,
+	69 => 0.734,
+	70 => 0.729,
+	71 => 0.725,
+	72 => 0.72,
+	103 => 0.716,
+	104 => 0.711,
+	105 => 0.706,
+	130 => 0.702,
+	131 => 0.697,
+	132 => 0.693,
+	133 => 0.688,
+	134 => 0.683,
+	135 => 0.679,
+	136 => 0.674,
+	137 => 0.67,
+	99 => 0.665,
+	100 => 0.661,
+	101 => 0.656,
+	102 => 0.651,
+	116 => 0.647,
+	122 => 0.642,
+	123 => 0.638,
+	124 => 0.633,
+	125 => 0.628,
+	126 => 0.624,
+	127 => 0.619,
+	128 => 0.615,
+	129 => 0.61,
+	106 => 0.606,
+	107 => 0.601,
+	108 => 0.596,
+	109 => 0.592,
+	110 => 0.587,
+	111 => 0.583,
+	112 => 0.578,
+	113 => 0.573,
+	114 => 0.569,
+	115 => 0.564,
+	117 => 0.56,
+	118 => 0.555,
+	119 => 0.55,
+	120 => 0.546,
+	121 => 0.541,
+	138 => 0.537,
+	139 => 0.532,
+	140 => 0.528,
+	141 => 0.523,
+	142 => 0.518,
+	96 => 0.514,
+	97 => 0.509,
+	98 => 0.505,
+	185 => 0.5,
+	186 => 0.495,
+	187 => 0.491,
+	188 => 0.486,
+	189 => 0.482,
+	192 => 0.477,
+	193 => 0.472,
+	194 => 0.468,
+	195 => 0.463,
+	196 => 0.459,
+	197 => 0.454,
+	198 => 0.45,
+	199 => 0.445,
+	181 => 0.44,
+	182 => 0.436,
+	183 => 0.431,
+	184 => 0.427,
+	190 => 0.422,
+	191 => 0.417,
+	204 => 0.413,
+	205 => 0.408,
+	206 => 0.404,
+	207 => 0.399,
+	208 => 0.394,
+	209 => 0.39,
+	210 => 0.385,
+	211 => 0.381,
+	212 => 0.376,
+	200 => 0.372,
+	201 => 0.367,
+	202 => 0.362,
+	203 => 0.358,
+	172 => 0.353,
+	173 => 0.349,
+	174 => 0.344,
+	175 => 0.339,
+	176 => 0.335,
+	177 => 0.33,
+	178 => 0.326,
+	179 => 0.321,
+	180 => 0.317,
+	64 => 1,
+	65 => 1,
+	66 => 1,
+	67 => 1,
+	68 => 1,
+	39 => 0.289,
+	40 => 0.284,
+	41 => 0.28,
+	42 => 0.275,
+	43 => 0.271,
+	44 => 0.266,
+	45 => 0.261,
+	22 => 0.257,
+	23 => 0.252,
+	24 => 0.248,
+	25 => 0.243,
+	1 => 0.239,
+	2 => 0.234,
+	3 => 0.229,
+	4 => 0.225,
+	5 => 0.22,
+	6 => 0.216,
+	7 => 0.211,
+	8 => 0.206,
+	61 => 0.202,
+	62 => 0.197,
+	63 => 0.193,
+	13 => 0.188,
+	14 => 0.183,
+	15 => 0.179,
+	16 => 0.174,
+	17 => 0.17,
+	18 => 0.165,
+	19 => 0.161,
+	20 => 0.156,
+	21 => 0.151,
+	9 => 0.147,
+	10 => 0.142,
+	11 => 0.138,
+	12 => 0.133,
+	27 => 0.128,
+	28 => 0.124,
+	29 => 0.119,
+	30 => 0.115,
+	31 => 0.11,
+	32 => 0.106,
+	33 => 0.101,
+	34 => 0.096,
+	35 => 0.092,
+	26 => 0.087,
+	36 => 0.083,
+	37 => 0.078,
+	38 => 0.073,
+	52 => 0.069,
+	53 => 0.064,
+	54 => 0.06,
+	55 => 0.055,
+	56 => 0.05,
+	57 => 0.046,
+	58 => 0.041,
+	59 => 0.037,
+	60 => 0.032,
+	46 => 0.028,
+	47 => 0.023,
+	48 => 0.018,
+	49 => 0.014,
+	50 => 0.009,
+	51 => 0.005,
+);
+?>

BIN
app/studio/module/palicannon/index.xlsx