2
0

sim_sent.php 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. <?php
  2. //计算句子相似度
  3. require_once "../path.php";
  4. require_once "../public/_pdo.php";
  // Open the Pali sentence database and apply the SQLite session settings
  // used by this batch job. The statements are executed in the listed order.
  global $PDO;
  PDO_Connect("" . _FILE_DB_PALI_SENTENCE_);
  foreach ([
      "PRAGMA synchronous = OFF",    // do not wait for the OS to flush writes
      "PRAGMA journal_mode = WAL",   // write-ahead logging
      "PRAGMA foreign_keys = ON",    // enforce foreign key constraints
      "PRAGMA busy_timeout = 5000",  // wait up to 5000 ms on a locked database
  ] as $pragma_statement) {
      PDO_Execute($pragma_statement);
  }
  unset($pragma_statement);
  /**
   * Split a sentence into its list of words.
   *
   * Punctuation, brackets, quotes, dashes and ASCII digits act as separators,
   * empty fragments are discarded, and the particles "ca", "vā" and "na" are
   * removed.
   *
   * The pattern is applied in UTF-8 mode (`u` modifier). Without it the
   * multibyte separators (‘ ’ –) are treated as individual bytes, which cuts
   * characters such as "Ā" in half and leaves stray bytes in the words.
   *
   * @param string $sent Sentence text.
   * @return array Zero-based list of the remaining words, in sentence order.
   */
  function words_of_sentence(string $sent)
  {
      $separators = "/[ \.\[\]\{\}\-,';‘’–0123456789]+/";

      $words = preg_split($separators . 'u', $sent, -1, PREG_SPLIT_NO_EMPTY);
      if ($words === false) {
          // The input is not valid UTF-8, so the `u` pattern cannot run.
          // Fall back to the byte-wise split instead of failing.
          $words = preg_split($separators, $sent, -1, PREG_SPLIT_NO_EMPTY);
          if ($words === false) {
              return [];
          }
      }

      $stop_words = ['ca', 'vā', 'na'];
      $kept = [];
      foreach ($words as $word) {
          // "0" cannot occur (digits are separators); the check mirrors the
          // usual empty-string filter.
          if ($word === '' || in_array($word, $stop_words, true)) {
              continue;
          }
          $kept[] = $word;
      }

      // $kept is a plain 0..n-1 list, so two word lists compare equal
      // regardless of where separators or dropped particles were located.
      return $kept;
  }
  /**
   * Jaccard similarity of two word lists: |A ∩ B| / |A ∪ B|.
   *
   * Jaccard is used because similar sentences in the canon are very close in
   * both vocabulary and structure.
   *
   * Both lists are reduced to sets of distinct words first. Counting repeated
   * words made the result depend on argument order and allowed scores above 1
   * (for example ['a','a','a'] against ['a']).
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Score in the range 0..1; 0 when both lists are empty.
   */
  function jaccard_similarity($words_of_sent1, $words_of_sent2)
  {
      $set1 = array_unique($words_of_sent1);
      $set2 = array_unique($words_of_sent2);

      $intersect = count(array_intersect($set1, $set2));
      $union = count($set1) + count($set2) - $intersect;

      // An empty union means both inputs were empty: define the score as 0.
      return $union ? $intersect / $union : 0;
  }
  /**
   * Order-aware Jaccard variant: the mean Jaccard score of the leading
   * prefixes (length 1..k) of both word lists, where k is the length of the
   * shorter list.
   *
   * Results are currently mediocre. TODO: add to the score when the two
   * slices are identical.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Mean prefix score; 0 when either list is empty.
   */
  function ordered_jaccard_similarity($words_of_sent1, $words_of_sent2)
  {
      $k = min(count($words_of_sent1), count($words_of_sent2));
      if ($k === 0) {
          // No prefix to compare; also avoids dividing by zero below.
          return 0;
      }

      $score = 0;
      // Include the prefix of length $k itself so that exactly $k terms are
      // averaged and identical sentences reach a score of 1.
      for ($i = 1; $i <= $k; $i++) {
          $score += jaccard_similarity(
              array_slice($words_of_sent1, 0, $i),
              array_slice($words_of_sent2, 0, $i)
          );
      }

      return $score / $k;
  }
  /**
   * Node of the singly linked list used to keep similar-sentence ids ordered
   * by similarity score.
   */
  class sim_sent_node
  {
      /** @var mixed Sentence id stored in this node (null for the dummy head). */
      public $id;

      /** @var mixed Jaccard score of that sentence. */
      public $jaccard_score;

      /** @var sim_sent_node|null Following node, or null at the end of the list. */
      public $next;

      /**
       * @param mixed              $id            Sentence id.
       * @param mixed              $jaccard_score Similarity score of the sentence.
       * @param sim_sent_node|null $next          Node that follows this one.
       */
      public function __construct($id = null, $jaccard_score = null, $next = null)
      {
          $this->next = $next;
          $this->jaccard_score = $jaccard_score;
          $this->id = $id;
      }
  }
  /**
   * Singly linked list of similar sentences, kept in descending order of
   * Jaccard score.
   */
  class sim_sent_list
  {
      /** @var sim_sent_node Dummy head node; real entries start at $head->next. */
      public $head;

      /** @var int Number of entries added so far. */
      public $size;

      public function __construct()
      {
          $this->size = 0;
          $this->head = new sim_sent_node();
      }

      /**
       * Insert a sentence id at the position given by its score.
       *
       * The new node goes in front of the first entry whose score is not
       * strictly greater, so among equal scores the newest entry comes first.
       *
       * @param mixed $id            Sentence id.
       * @param mixed $jaccard_score Similarity score used as the sort key.
       */
      public function jaccard_add($id, $jaccard_score)
      {
          $cursor = $this->head;
          for (; $cursor->next != null; $cursor = $cursor->next) {
              if (!($cursor->next->jaccard_score > $jaccard_score)) {
                  break;
              }
          }
          $cursor->next = new sim_sent_node($id, $jaccard_score, $cursor->next);
          $this->size++;
      }

      /**
       * Return the stored ids as a comma separated string, best match first.
       *
       * @return string|null The id list, or null when the list is empty.
       */
      public function get_text_list()
      {
          if ($this->size == 0) {
              return null;
          }
          $collected = [];
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              $collected[] = $node->id;
          }
          return implode(",", $collected);
      }

      /**
       * Print one "id<TAB>score" line per entry, best match first.
       * Prints nothing for an empty list.
       */
      public function print_list()
      {
          if ($this->size == 0) {
              return;
          }
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              print($node->id . "\t" . $node->jaccard_score . "\n");
          }
      }
  }
  /**
   * Store the similar-sentence id list of one sentence in the database.
   *
   * Writes $text_list into the sim_sents column of the pali_sent row whose id
   * is $current_id. The list is quoted through PDO; the id is appended to the
   * statement as given.
   *
   * The sim_sents column has to exist beforehand. It can be added once with
   *     ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT
   * and checked with
   *     PRAGMA TABLE_INFO (pali_sent)
   *
   * @param mixed  $current_id Id of the row to update.
   * @param string $text_list  Comma separated ids of the similar sentences.
   */
  function insert_similar_sent_list_into_sqlite($current_id, $text_list)
  {
      global $PDO;
      $quoted_list = $PDO->quote($text_list);
      PDO_Execute("UPDATE pali_sent SET sim_sents = " . $quoted_list . " WHERE id = " . $current_id);
  }
  /**
   * Half-width of the word-count window searched for a sentence with
   * $count words (inclusive on both sides).
   *
   *   count > 17  -> count-2 .. count+2
   *   count > 6   -> count-1 .. count+1
   *   otherwise   -> exactly the same count
   */
  function _similar_sent_window_radius($count)
  {
      if ($count > 17) {
          return 2;
      }
      if ($count > 6) {
          return 1;
      }
      return 0;
  }

  /**
   * Precompute the similar sentences of every sentence whose word count lies
   * in $begin..$end and store each result in the database.
   *
   * For each sentence the candidates are the other sentences inside its
   * word-count window. Sentences with more than 3 words are scored with
   * jaccard_similarity(); sentences with 2 or 3 words only match candidates
   * with an identical word list (score 1). Candidates scoring above 0.3 are
   * saved, best match first, via insert_similar_sent_list_into_sqlite().
   *
   * Differences from the earlier implementation:
   *  - Sentences with 2 or 3 words are now compared against sentences of the
   *    same word count. The previous exclusive window
   *    (count > c and count < c) was empty, so they never got any result.
   *  - The score is recomputed for every candidate instead of carrying over
   *    the value left by the previous candidate.
   *  - The candidate query and the tokenisation of the candidates run once
   *    per word count rather than once per sentence.
   *
   * @param int $begin    First word count to process.
   * @param int $end      Last word count to process (inclusive).
   * @param int $begin_id When > 0, sentences of word count $begin with a
   *                      smaller id are skipped (resume of an interrupted run).
   */
  function similar_sent_matrix($begin, $end, $begin_id = 0)
  {
      for ($current_count = $begin; $current_count <= $end; $current_count++) {
          print("单词数:" . $current_count . "\n");

          $Current = PDO_FetchAll("select id,text from pali_sent where count=" . $current_count);
          if (empty($Current)) {
              continue;
          }

          // The candidate set only depends on the word count: load and
          // tokenise it once for all sentences of this count.
          $radius = _similar_sent_window_radius($current_count);
          $compare_query = "select id,text from pali_sent where count>=" . ($current_count - $radius)
              . " and count<=" . ($current_count + $radius);
          $Compare = PDO_FetchAll($compare_query);
          $candidates = [];
          if (!empty($Compare)) {
              foreach ($Compare as $compare_row) {
                  $candidates[] = [
                      'id' => $compare_row['id'],
                      // Re-index so that equality below ignores array keys.
                      'words' => array_values(words_of_sentence($compare_row['text'])),
                  ];
              }
          }

          foreach ($Current as $current_row) {
              $current_id = $current_row['id'];
              if ($begin_id > 0 && $current_count == $begin && $current_id < $begin_id) {
                  continue; // already handled by an earlier run
              }
              $current_words = array_values(words_of_sentence($current_row['text']));

              $current_sim_sent_list = new sim_sent_list();
              foreach ($candidates as $candidate) {
                  if ($candidate['id'] == $current_id) {
                      continue; // never compare a sentence with itself
                  }

                  if ($current_count > 3) {
                      $jaccard_score = jaccard_similarity($current_words, $candidate['words']);
                  } elseif ($current_count == 3 || $current_count == 2) {
                      // Very short sentences only count when identical.
                      $jaccard_score = ($current_words === $candidate['words']) ? 1 : 0;
                  } else {
                      $jaccard_score = 0;
                  }

                  if ($jaccard_score > 0.3) {
                      $current_sim_sent_list->jaccard_add($candidate['id'], $jaccard_score);
                  }
              }

              if ($current_sim_sent_list->size != 0) {
                  print("update $current_count - " . $current_id . "\n");
                  $text_list = $current_sim_sent_list->get_text_list();
                  insert_similar_sent_list_into_sqlite($current_id, $text_list);
              }
          }
      }
      return;
  }
  /**
   * Compute and print, on the fly, the sentences similar to one sentence.
   *
   * The sentence is compared with every other sentence of more than 5 words
   * and each candidate whose Jaccard score exceeds 0.3 is printed. Sentences
   * of 5 words or fewer are rejected as too short.
   *
   * The sentence itself is excluded by id. The earlier row comparison could
   * never be equal (the two rows carry different columns), so the sentence
   * was always reported as similar to itself.
   *
   * @param int|string $id Sentence id. It is cast to int before being placed
   *                       in the query (ids are treated as integers by the
   *                       rest of this script — confirm if the schema changes).
   */
  function sents_similar_to_id($id)
  {
      $id = (int) $id;

      $Current = PDO_FetchALL("SELECT count,text FROM pali_sent WHERE id=" . $id);
      if (empty($Current)) {
          return;
      }

      foreach ($Current as $current_row) {
          $current_count = $current_row['count'];
          $current_sent = $current_row['text'];
          print("current text: \n" . $current_sent . "\n");
          if ($current_count <= 5) {
              print("[-] too short.\n");
              return;
          }
          $current_words = words_of_sentence($current_sent);

          // Only compare against sentences with more than 5 words.
          $Compare = PDO_FetchALL("SELECT id,text FROM pali_sent WHERE count>5");
          if (empty($Compare)) {
              $Compare = [];
          }

          $current_sim_sent_list = new sim_sent_list();
          foreach ($Compare as $compare_row) {
              $compare_id = $compare_row['id'];
              if ($compare_id == $id) {
                  continue; // skip the sentence itself
              }
              $compare_sent = $compare_row['text'];
              $jaccard_score = jaccard_similarity($current_words, words_of_sentence($compare_sent));
              if ($jaccard_score > 0.3) {
                  print("Jaccard similarity: " . $jaccard_score . "\tSentence id:" . $compare_id . "\n");
                  print("Text: \n" . $compare_sent . "\n");
                  $current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
              }
          }

          if ($current_sim_sent_list->size == 0) {
              print("[-]not found.\n");
          }
      }
  }
  225. //$id = $argv[1];
  226. //sents_similar_to_id($id);
  // Command line entry point.
  //   argv[1]  first word count to process (raised to 2 when smaller)
  //   argv[2]  last word count to process (capped at 255)
  //   argv[3]  optional sentence id to resume from within the first word count
  if ($argc < 3) {
      echo "无效的参数 ";
      exit;
  }
  $from = max(2, (int) $argv[1]);
  $to = min(255, (int) $argv[2]);
  $from_id = ($argc > 3) ? (int) $argv[3] : 0;
  similar_sent_matrix($from, $to, $from_id);
  echo "\n all done";