sim_sent.php 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. <?php
  2. //计算句子相似度
  3. require_once "../path.php";
  4. require_once "../public/_pdo.php";
  // Connect to the SQLite database named by _FILE_DB_PALI_SENTENCE_, then run
  // the connection PRAGMAs one after another in the order listed below.
  global $PDO;
  PDO_Connect("sqlite:" . _FILE_DB_PALI_SENTENCE_);
  foreach ([
      "PRAGMA synchronous = OFF",
      "PRAGMA journal_mode = WAL",
      "PRAGMA foreign_keys = ON",
      "PRAGMA busy_timeout = 5000",
  ] as $pragma_statement) {
      PDO_Execute($pragma_statement);
  }
  unset($pragma_statement); // do not leave the loop variable in the global scope
  /**
   * Split a sentence into the list of words used for similarity scoring.
   *
   * Spaces, the listed punctuation marks and the ASCII digits act as
   * separators. The particles "ca", "vā" and "na" are removed from the result.
   *
   * @param string $sent Sentence text; expected to be UTF-8.
   * @return array Sequentially indexed (0..n-1) list of the remaining words,
   *               in their original order. Empty when nothing is left.
   */
  function words_of_sentence(string $sent): array {
      // The separator class contains multi-byte characters (‘ ’ –). Without the
      // "u" modifier PCRE matches their individual bytes, which cuts words such
      // as "Ānanda" apart: "Ā" is encoded as C4 80 and 0x80 is one of the bytes
      // of the curly quotes.
      $words = preg_split(
          '/[ \.\[\]\{\}\-,\';‘’–0123456789]+/u',
          $sent,
          -1,
          PREG_SPLIT_NO_EMPTY
      );
      if ($words === false) {
          // With "u", preg_split() fails on input that is not valid UTF-8.
          // Fall back to the single-byte separators only, which is byte-safe.
          $words = preg_split(
              '/[ \.\[\]\{\}\-,\';0123456789]+/',
              $sent,
              -1,
              PREG_SPLIT_NO_EMPTY
          );
          if ($words === false) {
              return [];
          }
      }

      $stop_words = ['ca', 'vā', 'na'];
      $kept = array_filter($words, static function ($word) use ($stop_words) {
          return !in_array($word, $stop_words, true);
      });

      // array_filter() keeps the original keys; re-index so callers get a plain
      // list (array equality on the result then ignores where words were dropped).
      return array_values($kept);
  }
  /**
   * Jaccard similarity of two word lists: |A ∩ B| / |A ∪ B|.
   *
   * Jaccard was chosen because similar sentences in the canon are very close
   * in both vocabulary and structure.
   *
   * Both inputs are reduced to sets first. Without that step a repeated word
   * is counted several times, which makes the score depend on argument order
   * and lets it exceed 1 (e.g. ['a', 'a'] against ['a']).
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Score between 0 and 1; 0 when both lists are empty.
   */
  function jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $set1 = array_unique($words_of_sent1);
      $set2 = array_unique($words_of_sent2);

      $intersect = count(array_intersect($set1, $set2));
      $union = count($set1) + count($set2) - $intersect;

      if ($union === 0) {
          // Two empty lists: nothing to compare, avoid dividing by zero.
          return 0;
      }
      return $intersect / $union;
  }
  /**
   * Order-aware variant of the Jaccard score.
   *
   * Averages jaccard_similarity() over the leading slices of both lists, for
   * every slice length from 1 up to the length of the shorter list.
   * The original author noted that results are only mediocre so far.
   * TODO: add to the score when the two slices are identical.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Average prefix score; 0 when either list is empty.
   */
  function ordered_jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $k = min(count($words_of_sent1), count($words_of_sent2));
      if ($k === 0) {
          // No prefix to compare; also avoids dividing by zero below.
          return 0;
      }

      $score = 0;
      // Include the prefix of length $k itself: the sum is divided by $k, so
      // stopping at $k - 1 would cap identical sentences below 1 and always
      // give 0 for one-word sentences.
      for ($i = 1; $i <= $k; $i++) {
          $score += jaccard_similarity(
              array_slice($words_of_sent1, 0, $i),
              array_slice($words_of_sent2, 0, $i)
          );
      }
      return $score / $k;
  }
  /**
   * Node of a singly linked list of similar sentences. Keeping the entries in
   * a linked list makes it easy to insert a sentence id at the position that
   * matches its similarity score.
   */
  class sim_sent_node {
      /** @var mixed Sentence id carried by this node. */
      public $id = null;

      /** @var mixed Similarity score stored with the id. */
      public $jaccard_score = null;

      /** @var sim_sent_node|null Following node, or null at the end of the chain. */
      public $next = null;

      /**
       * @param mixed              $id            Sentence id.
       * @param mixed              $jaccard_score Similarity score for that id.
       * @param sim_sent_node|null $next          Node that follows this one.
       */
      public function __construct($id = null, $jaccard_score = null, $next = null) {
          $this->next = $next;
          $this->jaccard_score = $jaccard_score;
          $this->id = $id;
      }
  }
  /**
   * Singly linked list of sentence ids, kept ordered by similarity score
   * (highest first).
   */
  class sim_sent_list {
      /** @var sim_sent_node Dummy head node; the first real entry is $head->next. */
      public $head;

      /** @var int Number of entries added through jaccard_add(). */
      public $size;

      public function __construct() {
          $this->head = new sim_sent_node();
          $this->size = 0;
      }

      /**
       * Insert an id at the position given by its score.
       *
       * The new node goes in front of the first entry whose score is not
       * greater than the new score, so scores stay in descending order and,
       * among equal scores, the most recently added id comes first.
       *
       * @param mixed     $id            Sentence id to store.
       * @param int|float $jaccard_score Similarity score of that sentence.
       * @return void
       */
      public function jaccard_add($id, $jaccard_score) {
          $prev = $this->head;
          while ($prev->next !== null && $prev->next->jaccard_score > $jaccard_score) {
              $prev = $prev->next;
          }
          $prev->next = new sim_sent_node($id, $jaccard_score, $prev->next);
          $this->size++;
      }

      /**
       * Return the stored ids as one comma-separated string, in list order.
       *
       * @return string Joined ids; an empty string when the list has no
       *                entries (previously null, which forced callers to deal
       *                with two return types).
       */
      public function get_text_list() {
          $ids = [];
          for ($node = $this->head->next; $node !== null; $node = $node->next) {
              $ids[] = $node->id;
          }
          return implode(",", $ids);
      }

      /**
       * Print one "id<TAB>score" line per entry, in list order.
       *
       * @return void
       */
      public function print_list() {
          for ($node = $this->head->next; $node !== null; $node = $node->next) {
              print($node->id."\t".$node->jaccard_score."\n");
          }
      }
  }
  /**
   * Write the similar-sentence list of one sentence into the database.
   *
   * The pali_sent table needs a sim_sents column first. The one-off snippet
   * kept here for adding it (and printing the resulting table layout) was:
   *
   *     $add_column = "ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT";
   *     $Action_add = PDO_Execute($add_column);
   *     $query = "PRAGMA TABLE_INFO (pali_sent)";
   *     $Fetch = PDO_FetchALL($query);
   *     print_r($Fetch);
   *
   * @param mixed  $current_id Id of the pali_sent row to update; placed into
   *                           the SQL text unchanged.
   * @param string $text_list  Value for sim_sents; escaped with PDO::quote().
   * @return void
   */
  function insert_similar_sent_list_into_sqlite($current_id, $text_list) {
      global $PDO;
      $quoted_list = $PDO->quote($text_list);
      PDO_Execute("UPDATE pali_sent SET sim_sents = {$quoted_list} WHERE id = {$current_id}");
  }
  /**
   * Precompute the similar sentences of every sentence whose word count lies
   * in [$begin, $end] and store each result in the database.
   *
   * For each word count the candidates are the sentences whose count lies in
   * an open window around it:
   *   - count 18 and above:  (count-3, count+3)
   *   - count 7..17:         (count-2, count+2)
   *   - count 6 and below:   same count only
   * Sentences with more than 3 words are scored with jaccard_similarity();
   * sentences with 2 or 3 words only match candidates with exactly the same
   * word list. Candidates scoring above 0.3 are kept, best first.
   *
   * Changes compared with the earlier version:
   *   - the score is computed fresh for every candidate; before, a 2- or
   *     3-word sentence kept the score of the previous candidate whenever the
   *     word lists differed;
   *   - counts of 3 and below used a window of 0, i.e. "count>c and count<c",
   *     which can never match, so short sentences were never compared;
   *   - the candidate query and its tokenisation depend only on the word
   *     count, so they now run once per count instead of once per sentence.
   *     This keeps the tokenised candidates of one window in memory.
   *
   * @param int $begin    First word count to process.
   * @param int $end      Last word count to process (inclusive).
   * @param int $begin_id When greater than 0, sentences of the first word
   *                      count ($begin) with an id below this value are
   *                      skipped.
   * @return void
   */
  function similar_sent_matrix($begin, $end, $begin_id = 0) {
      for ($current_count = (int)$begin; $current_count <= $end; $current_count++) {
          print("单词数:".$current_count."\n");
          $Current = PDO_FetchAll("select id,text from pali_sent where count=".$current_count);
          if (empty($Current)) {
              continue;
          }

          // Half-width of the open comparison window for this word count.
          if ($current_count > 17) {
              $section = 3;
          } elseif ($current_count > 6) {
              $section = 2;
          } else {
              // 1 means "same count only"; 0 would produce an empty window.
              $section = 1;
          }

          $compare_query = "select id,text from pali_sent where count>".($current_count - $section)
              ." and count<".($current_count + $section);
          $Compare = PDO_FetchAll($compare_query);
          if (empty($Compare)) {
              $Compare = [];
          }

          // Tokenise every candidate once. array_values() makes the exact
          // match below independent of the keys words_of_sentence() returns.
          $compare_words_of = [];
          foreach ($Compare as $index => $compare_row) {
              $compare_words_of[$index] = array_values(words_of_sentence($compare_row['text']));
          }

          foreach ($Current as $current_row) {
              $current_id = $current_row['id'];
              if ($begin_id > 0 && $current_count == $begin && $current_id < $begin_id) {
                  continue;
              }
              $current_words = array_values(words_of_sentence($current_row['text']));

              $current_sim_sent_list = new sim_sent_list();
              foreach ($Compare as $index => $compare_row) {
                  if ($current_row == $compare_row) {
                      continue; // do not compare a sentence with itself
                  }
                  $compare_words = $compare_words_of[$index];

                  if ($current_count > 3) {
                      $jaccard_score = jaccard_similarity($current_words, $compare_words);
                  } elseif ($current_count >= 2) {
                      // Very short sentences: only an identical word list counts.
                      $jaccard_score = ($current_words === $compare_words) ? 1 : 0;
                  } else {
                      $jaccard_score = 0;
                  }

                  if ($jaccard_score > 0.3) {
                      $current_sim_sent_list->jaccard_add($compare_row['id'], $jaccard_score);
                  }
              }

              if ($current_sim_sent_list->size != 0) {
                  print("update $current_count - ".$current_id."\n");
                  $text_list = $current_sim_sent_list->get_text_list();
                  insert_similar_sent_list_into_sqlite($current_id, $text_list);
              }
          }
      }
  }
  /**
   * Compute and print, on the fly, the sentences similar to one sentence.
   *
   * The sentence is compared with every sentence of more than 5 words; each
   * candidate whose Jaccard score is above 0.3 is printed with its score, id
   * and text. Sentences of 5 words or fewer are rejected as too short.
   *
   * Changes compared with the earlier version:
   *   - the sentence itself is skipped by id. Before, a (count, text) row was
   *     compared with (id, text) rows, which are never equal, so the sentence
   *     always reported itself as a match with score 1;
   *   - $id is no longer pasted into the SQL text unescaped: plain integers
   *     are used as they are, anything else goes through PDO::quote().
   *
   * @param int|string $id Id of the sentence to look up.
   * @return void
   */
  function sents_similar_to_id($id) {
      global $PDO;

      $id_text = (string)$id;
      if (is_int($id) || preg_match('/^[0-9]+$/D', $id_text) === 1) {
          $id_sql = $id_text;
      } else {
          $id_sql = $PDO->quote($id_text);
      }

      $Current = PDO_FetchAll("SELECT count,text FROM pali_sent WHERE id=".$id_sql);
      if (empty($Current)) {
          return;
      }

      foreach ($Current as $current_row) {
          $current_count = $current_row['count'];
          $current_sent = $current_row['text'];
          $current_words = words_of_sentence($current_sent);
          print("current text: \n".$current_sent."\n");
          if ($current_count <= 5) {
              print("[-] too short.\n");
              return;
          }

          // Only compare with sentences of more than 5 words.
          $Compare = PDO_FetchAll("SELECT id,text FROM pali_sent WHERE count>5");
          if (empty($Compare)) {
              $Compare = [];
          }

          $current_sim_sent_list = new sim_sent_list();
          foreach ($Compare as $compare_row) {
              $compare_id = $compare_row['id'];
              if ($compare_id == $id) {
                  continue; // the sentence itself is not a "similar" sentence
              }
              $compare_sent = $compare_row['text'];
              $jaccard_score = jaccard_similarity($current_words, words_of_sentence($compare_sent));
              if ($jaccard_score > 0.3) {
                  print("Jaccard similarity: ".$jaccard_score."\tSentence id:".$compare_id."\n");
                  print("Text: \n". $compare_sent."\n");
                  $current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
              }
          }

          if ($current_sim_sent_list->size == 0) {
              print("[-]not found.\n");
          }
          // A sorted dump is available via $current_sim_sent_list->print_list().
      }
  }
  // Command-line entry point.
  //   argv[1]  first word count to process (raised to 2 when smaller)
  //   argv[2]  last word count to process (lowered to 255 when larger)
  //   argv[3]  optional id passed on as the third argument of
  //            similar_sent_matrix(); 0 when absent
  //
  // Disabled alternative, looking up a single sentence instead:
  //   $id = $argv[1];
  //   sents_similar_to_id($id);
  if ($argc < 3) {
      echo "无效的参数 ";
      exit;
  }

  $from = max(2, (int)$argv[1]);
  $to = min(255, (int)$argv[2]);
  $from_id = ($argc > 3) ? (int)$argv[3] : 0;

  similar_sent_matrix($from, $to, $from_id);
  echo "\n all done";
  ?>