Просмотр исходного кода

:construction:编写了根据id号查找相似的句子的函数,编写了预计算并存入数据库的函数

vainPointer 5 лет назад
Родитель
Сommit
1d63fb9864
1 измененных файлов с 98 добавлено и 37 удалено
  1. 98 37
      app/pali_sent/pali_sent.php

+ 98 - 37
app/pali_sent/pali_sent.php

@@ -44,11 +44,9 @@ function words_of_sentence(string $sent) {
 
 // 采用 jaccard 相似度,考虑到圣典中的相似句单词、句式都是非常接近的
 function jaccard_similarity($words_of_sent1, $words_of_sent2) {
-	$intersect_array = array_intersect($words_of_sent1, $words_of_sent2);
-	$intersect = count($intersect_array);
-	$union_array = array_merge($words_of_sent1, $words_of_sent2);
-	$union = count($union_array) - $intersect;
-	if ($intersect) {
+	$intersect = count(array_intersect($words_of_sent1, $words_of_sent2));
+	$union = count($words_of_sent1)+count($words_of_sent2)-$intersect;
+	if ($union) {
 		return $intersect / $union;
 	} else {
 		return 0;
@@ -96,16 +94,25 @@ class sim_sent_list {
 		while ($prev->next != null && $prev->next->jaccard_score > $jaccard_score) {
 			$prev = $prev->next;
 		}
-		if ($prev->next == null) {
-			$prev->next = new sim_sent_node($id, $jaccard_score, null);
-		} else {
-			$insert_node = new sim_sent_node($id, $jaccard_score, $prev->next);
-			$prev->next = $insert_node;
-		}
+		$prev->next = new sim_sent_node($id, $jaccard_score, $prev->next);
 		$this->size++;
 	}
 
-	// 调试用打印本链表
+	public function get_text_list() { // Returns the ids of every node after $this->head as one comma-separated string, in list order.
+		$prev = $this->head;
+		if ($this->size == 0) {
+			return; // bare return: an empty list yields null rather than an empty string
+		}
+
+		$ids = "";
+		while ($prev->next != null) {
+			$ids = $ids.",".$prev->next->id;
+			$prev = $prev->next;
+		}
+		$ids = substr($ids, 1); // drop the leading comma added by the first iteration
+		return $ids;
+	}
+
 	public function print_list() {
 		$prev = $this->head;
 		if ($this->size == 0) {
@@ -118,8 +125,8 @@ class sim_sent_list {
 	}
 }
 
-// TODO: 将 current_id 的 similar_sent_list 存入数据库
-function insert_similar_sent_list_into_sqlite($current_id, $list) {
+// 将相似句列表存入数据库
+function insert_similar_sent_list_into_sqlite($current_id, $text_list) {
 	/* 使用这部分代码先为数据库添加一个 sim_sents 字段
 	$add_column = "ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT";
 	$Action_add = PDO_Execute($add_column);
@@ -127,49 +134,103 @@ function insert_similar_sent_list_into_sqlite($current_id, $list) {
 	$Fetch = PDO_FetchALL($query);
 	print_r($Fetch);
 	*/
+	global $PDO;
+	$Update = "UPDATE pali_sent SET sim_sents = ".$PDO->quote($text_list)." WHERE id = ".$current_id;
+	$Result = PDO_Execute($Update);
 	return;
 }
 
-// TODO: 考虑圣典中的相似句位置应该比较接近,可以减少比较量
+// 预计算,存入数据库
 function similar_sent_matrix() { // Precomputes similar-sentence id lists per sentence and stores them via insert_similar_sent_list_into_sqlite().
-	$query = "select id,text from pali_sent limit 40000,1000";
-	#$query = "select id,text from pali_sent where id = 10872 or id = 10716";
-	$Fetch = PDO_FetchAll($query);
+	// Author's planned schedule (translated): word counts 18, ..., 255 compared against the window (i-3,i+3),
+	//          word counts 17,16,...,7 against (i-2,i+2). The loop below only covers 17 down to 8 (7 is excluded).
+	for ($current_count=17; $current_count > 7; $current_count--) { 
+		print("单词数:".$current_count."\n");
+		$current_query = "select id,text from pali_sent where count=".$current_count;
+		$Current = PDO_FetchAll($current_query);
+		if (count($Current)) {
+			foreach($Current as $current_row) {
+				$current_id = $current_row['id'];
+				$current_sent = $current_row['text'];
+				$current_words = words_of_sentence($current_sent);
+				
+				// Fetch comparison candidates whose count is strictly between $current_count-2 and $current_count+2 (i.e. within +/-1). NOTE(review): this query depends only on $current_count, yet it is re-run for every row.
+				$compare_query = "select id,text from pali_sent where count>".($current_count-2)." and count<".($current_count+2);
+				$Compare = PDO_FetchALL($compare_query);
+
-	foreach($Fetch as $current_row) {
-		$current_id = $current_row['id'];
-		$current_sent = $current_row['text'];
-		$current_words = words_of_sentence($current_sent);
+				$current_sim_sent_list = new sim_sent_list(); // fresh linked list of similar sentences for this row
+				foreach($Compare as $compare_row) {
+					if ($current_row != $compare_row) { // skip the sentence itself (both rows carry id and text)
+						$compare_id = $compare_row['id'];
+						$compare_sent = $compare_row['text'];
+						$compare_words = words_of_sentence($compare_sent);
 
-		if (count($current_words) > 5) { // only compare sentences longer than 5 words
-			$current_sim_sent_list = new sim_sent_list(); // fresh linked list of similar sentences
+						$jaccard_score = jaccard_similarity($current_words, $compare_words);
+						if ($jaccard_score > 0.3) { // keep only candidates scoring above 0.3
+							$current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
+						}
+					}
+				} // end of foreach $compare_row
 
-			foreach($Fetch as $compare_row) {
+				if ($current_sim_sent_list->size != 0) { // nothing is written for rows without matches
+					print("update ".$current_id."\n");
+					$text_list = $current_sim_sent_list->get_text_list();
+					insert_similar_sent_list_into_sqlite($current_id, $text_list);
+				}
+			} // end of foreach $current_row
+		}
+	}
+	return;
+}
+
+// 实时计算相似句
+function sents_similar_to_id($id) { // Computes at call time, and prints, the sentences whose Jaccard score against sentence $id exceeds 0.3.
+	$query = "SELECT count,text FROM pali_sent WHERE id=".$id; // NOTE(review): $id is concatenated into the SQL unescaped
+	$Current = PDO_FetchALL($query);
+	if (count($Current)) {
+		foreach($Current as $current_row) {
+			$current_count = $current_row['count'];
+			$current_sent = $current_row['text'];
+			$current_words = words_of_sentence($current_sent);
+			print("current text: \n".$current_sent."\n");
+
+			if ($current_count <= 5) {
+				print("[-] too short.\n");
+				return; // leaves the whole function, not just this iteration
+			}
+
+			// compare only against sentences with more than 5 words
+			$compare_query = "SELECT id,text FROM pali_sent WHERE count>5";
+			$Compare = PDO_FetchALL($compare_query);
+
+			$current_sim_sent_list = new sim_sent_list(); // fresh linked list of similar sentences
+			foreach($Compare as $compare_row) {
 				if ($current_row != $compare_row) { // NOTE(review): $current_row has keys count/text, $compare_row has id/text, so this loose array comparison looks like it can never be equal and the sentence itself is probably not skipped - confirm
 					$compare_id = $compare_row['id'];
 					$compare_sent = $compare_row['text'];
 					$compare_words = words_of_sentence($compare_sent);
 
-					if(count($compare_words) > 5) {
-						$jaccard_score = jaccard_similarity($current_words, $compare_words);
-						if ($jaccard_score > 0.3) {
-							$current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
-							#print($current_sim_sent_list->next->id);
-						}
+					$jaccard_score = jaccard_similarity($current_words, $compare_words);
+					if ($jaccard_score > 0.3) {
+						print("Jaccard similarity: ".$jaccard_score."\tSentence id:".$compare_id."\n");
+						print("Text: \n". $compare_sent."\n");
+						$current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
 					}
 				}
 			} // end of foreach $compare_row
 
 			if ($current_sim_sent_list->size != 0) {
-				print("sim_sent_list of ".$current_id.":\n");
-				$current_sim_sent_list->print_list();
-				insert_similar_sent_list_into_sqlite($current_id, $current_sim_sent_list);
+				// $current_sim_sent_list->print_list();
+			} else {
+				print("[-]not found.\n");
 			}
-		}
-	} // end of foreach $current_row
+
+		} // end of foreach($Current)
+	} // end of if (count($Current))
 }
 
-similar_sent_matrix();
+$id = $argv[1]; // sentence id read from the first command-line argument; NOTE(review): used without validation
+sents_similar_to_id($id);
 
 if (!isset($_op)) {
 	exit(0);