pali_sent.php 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. <?php
  // Query endpoint backed by the pali_sent table (original note: "look up the term dictionary").
  require_once "../path.php";
  require_once "../public/_pdo.php";

  // Is the user logged in? Take the name from the cookie, or use "" when it is absent or empty.
  if (isset($_COOKIE["username"]) && !empty($_COOKIE["username"])) {
      $username = $_COOKIE["username"];
  } else {
      $username = "";
  }

  // Requested operation: GET takes precedence over POST.
  // $_op is deliberately left unset when neither supplies "op"; later code tests isset($_op).
  if (isset($_GET["op"])) {
      $_op = $_GET["op"];
  } else if (isset($_POST["op"])) {
      $_op = $_POST["op"];
  }

  // A parameter written as "word[]=x" arrives as an array, which mb_strtolower() rejects,
  // so only string values are accepted; otherwise $_word stays unset.
  if (isset($_GET["word"]) && is_string($_GET["word"])) {
      $_word = mb_strtolower($_GET["word"], 'UTF-8');
  }

  if (isset($_GET["id"])) {
      $_id = $_GET["id"];
  }

  // Connect to the sentence database; $PDO is used further down for quoting values.
  global $PDO;
  PDO_Connect("" . _FILE_DB_PALI_SENTENCE_);

  // Same string-only rule as for "word".
  if (isset($_GET["sent"]) && is_string($_GET["sent"])) {
      $_sent = mb_strtolower($_GET["sent"], 'UTF-8');
  }
  /**
   * Split a sentence into its words.
   *
   * Spaces, the listed punctuation marks and ASCII digits act as separators.
   * Empty fragments and the particles "ca", "vā" and "na" are dropped.
   *
   * @param string $sent Sentence text (expected to be UTF-8).
   * @return array Remaining words; keys are the positions produced by the split
   *               (not re-indexed), so the array may have gaps.
   */
  function words_of_sentence(string $sent)
  {
      $delimiters = "/[ \.\[\]\{\}\-,';‘’–0123456789]+/";

      // The class contains multibyte characters (‘ ’ –). Without the "u" modifier PCRE
      // treats each of their bytes as a separate delimiter and cuts other UTF-8
      // characters that share a byte (e.g. "Ā" = C4 80) in half.
      $words = preg_split($delimiters . 'u', $sent);
      if ($words === false) {
          // "u" fails on malformed UTF-8; fall back to the byte-wise split
          // rather than losing the sentence altogether.
          $words = preg_split($delimiters, $sent);
      }
      if ($words === false) {
          return array();
      }

      $stop_words = array('ca', 'vā', 'na');
      return array_filter($words, function ($item) use ($stop_words) {
          // Drop empty fragments and the stop words.
          return $item !== '' && !in_array($item, $stop_words, true);
      });
  }
  /**
   * Jaccard similarity of two word lists: |A ∩ B| / |A ∪ B|.
   *
   * Chosen because similar sentences in the canon share nearly the same words
   * and structure (per the original author's note).
   *
   * Both lists are reduced to sets of distinct words first. Without that step a
   * word repeated in the first list is counted once per repetition in the
   * intersection, which makes the score asymmetric and lets it exceed 1.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Score in [0, 1]; 0 when both lists are empty.
   */
  function jaccard_similarity($words_of_sent1, $words_of_sent2)
  {
      $set1 = array_unique($words_of_sent1);
      $set2 = array_unique($words_of_sent2);

      $intersect = count(array_intersect($set1, $set2));
      // |A ∪ B| = |A| + |B| - |A ∩ B|
      $union = count($set1) + count($set2) - $intersect;

      if ($union) {
          return $intersect / $union;
      }
      return 0;
  }
  /**
   * Order-aware Jaccard variant: averages the Jaccard score of the leading
   * 1..k words of both sentences, where k is the length of the shorter one.
   *
   * Original author's note: results are currently mediocre; TODO: add to the
   * score when the slices are identical.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Average prefix score; 0 when either list is empty.
   */
  function ordered_jaccard_similarity($words_of_sent1, $words_of_sent2)
  {
      $k = min(count($words_of_sent1), count($words_of_sent2));
      if ($k === 0) {
          // Nothing to compare; also avoids dividing by zero below.
          return 0;
      }

      $score = 0;
      // Include the prefix of length $k itself. Stopping at $k - 1 meant identical
      // sentences could never score 1 and one-word sentences always scored 0.
      for ($i = 1; $i <= $k; $i++) {
          $score += jaccard_similarity(
              array_slice($words_of_sent1, 0, $i),
              array_slice($words_of_sent2, 0, $i)
          );
      }
      return $score / $k;
  }
  /**
   * Node of a singly linked list of similar sentences, so that sentence ids
   * can be inserted in order of their similarity score.
   */
  class sim_sent_node
  {
      /** Sentence id stored in this node. */
      public $id;
      /** Jaccard score associated with that sentence. */
      public $jaccard_score;
      /** Following node, or null at the end of the list. */
      public $next;

      /**
       * All arguments default to null, which yields an empty placeholder node.
       */
      public function __construct($id = null, $jaccard_score = null, $next = null)
      {
          list($this->id, $this->jaccard_score, $this->next) = array($id, $jaccard_score, $next);
      }
  }
  /**
   * Singly linked list of sim_sent_node entries kept in descending score order.
   */
  class sim_sent_list
  {
      /** Dummy head node; the first real entry is $head->next. */
      public $head;
      /** Number of entries added through jaccard_add(). */
      public $size;

      public function __construct()
      {
          $this->size = 0;
          $this->head = new sim_sent_node();
      }

      /**
       * Insert an id at the position given by its Jaccard score.
       *
       * The walk stops at the first entry whose score is not greater than the
       * new one, so a new entry lands before existing entries with an equal score.
       */
      public function jaccard_add($id, $jaccard_score)
      {
          $cursor = $this->head;
          while (true) {
              $candidate = $cursor->next;
              if ($candidate == null) {
                  break;
              }
              if (!($candidate->jaccard_score > $jaccard_score)) {
                  break;
              }
              $cursor = $candidate;
          }
          $cursor->next = new sim_sent_node($id, $jaccard_score, $cursor->next);
          $this->size += 1;
      }

      /**
       * Return the stored ids as one comma-separated string, in list order.
       * Returns null when the list is empty.
       */
      public function get_text_list()
      {
          if ($this->size == 0) {
              return null;
          }
          $collected = array();
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              $collected[] = $node->id;
          }
          return implode(",", $collected);
      }

      /**
       * Print one "id<TAB>score" line per entry, in list order.
       * Prints nothing when the list is empty.
       */
      public function print_list()
      {
          if ($this->size == 0) {
              return;
          }
          $node = $this->head->next;
          while ($node != null) {
              print($node->id . "\t" . $node->jaccard_score . "\n");
              $node = $node->next;
          }
      }
  }
  /**
   * Store the list of similar sentence ids for one sentence in the database.
   *
   * Writes $text_list into the sim_sents column of the pali_sent row whose id
   * is $current_id.
   *
   * The sim_sents column has to exist first. It can be added once with:
   *   $add_column = "ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT";
   *   $Action_add = PDO_Execute($add_column);
   *   $query = "PRAGMA TABLE_INFO (pali_sent)";
   *   $Fetch = PDO_FetchALL($query);
   *   print_r($Fetch);
   *
   * NOTE(review): $current_id is placed into the statement without quoting;
   * callers in this file pass ids read back from the database.
   *
   * @param mixed  $current_id Id of the row to update.
   * @param string $text_list  Comma-separated ids of the similar sentences.
   */
  function insert_similar_sent_list_into_sqlite($current_id, $text_list)
  {
      global $PDO;

      $statement = sprintf(
          "UPDATE pali_sent SET sim_sents = %s WHERE id = %s",
          $PDO->quote($text_list),
          $current_id
      );
      PDO_Execute($statement);
  }
  /**
   * Precompute similar sentences and store them in the database.
   *
   * For every word count from 17 down to 8, each sentence with that count is
   * compared with all sentences whose count is within one word of it
   * (count - 1 .. count + 1). Candidates with a Jaccard score above 0.3 are
   * collected in descending score order and written to the sentence's
   * sim_sents column. Progress is printed to standard output.
   */
  function similar_sent_matrix()
  {
      for ($current_count = 17; $current_count > 7; $current_count--) {
          print("单词数:" . $current_count . "\n");
          $current_query = "select id,text from pali_sent where count=" . $current_count;
          $Current = PDO_FetchAll($current_query);
          if (!count($Current)) {
              continue;
          }

          // The candidate set depends only on $current_count, so fetch and tokenise
          // it once per count instead of once per current sentence. The updates made
          // below only touch sim_sents, which this query does not read.
          $compare_query = "select id,text from pali_sent where count>" . ($current_count - 2) . " and count<" . ($current_count + 2);
          $Compare = PDO_FetchALL($compare_query);
          $compare_words_by_index = array();
          foreach ($Compare as $index => $compare_row) {
              $compare_words_by_index[$index] = words_of_sentence($compare_row['text']);
          }

          foreach ($Current as $current_row) {
              $current_id = $current_row['id'];
              $current_words = words_of_sentence($current_row['text']);
              $current_sim_sent_list = new sim_sent_list(); // similar sentences for this row

              foreach ($Compare as $index => $compare_row) {
                  // Both result sets have the same columns, so this skips the row itself.
                  if ($current_row != $compare_row) {
                      $jaccard_score = jaccard_similarity($current_words, $compare_words_by_index[$index]);
                      if ($jaccard_score > 0.3) {
                          $current_sim_sent_list->jaccard_add($compare_row['id'], $jaccard_score);
                      }
                  }
              }

              if ($current_sim_sent_list->size != 0) {
                  print("update " . $current_id . "\n");
                  $text_list = $current_sim_sent_list->get_text_list();
                  insert_similar_sent_list_into_sqlite($current_id, $text_list);
              }
          }
      }
      return;
  }
  /**
   * Compute, on the fly, the sentences similar to the sentence with the given id
   * and print them to standard output.
   *
   * The sentence is compared with every sentence of more than 5 words; matches
   * with a Jaccard score above 0.3 are printed with their score, id and text.
   * Sentences of 5 words or fewer are reported as too short and not compared.
   *
   * NOTE(review): $id is concatenated into the SQL text unescaped; pass only a
   * trusted numeric id.
   *
   * @param mixed $id Id of the sentence to look up.
   */
  function sents_similar_to_id($id)
  {
      $query = "SELECT count,text FROM pali_sent WHERE id=" . $id;
      $Current = PDO_FetchALL($query);
      if (count($Current)) {
          foreach ($Current as $current_row) {
              $current_count = $current_row['count'];
              $current_sent = $current_row['text'];
              $current_words = words_of_sentence($current_sent);
              print("current text: \n" . $current_sent . "\n");
              if ($current_count <= 5) {
                  print("[-] too short.\n");
                  return;
              }

              // Only compare with sentences of more than 5 words.
              $compare_query = "SELECT id,text FROM pali_sent WHERE count>5";
              $Compare = PDO_FetchALL($compare_query);
              $current_sim_sent_list = new sim_sent_list(); // similar sentences found so far

              foreach ($Compare as $compare_row) {
                  $compare_id = $compare_row['id'];
                  // Skip the sentence itself. The two queries select different columns
                  // (count,text vs id,text), so comparing whole rows never matched and
                  // the sentence was always reported as similar to itself.
                  // Loose comparison on purpose: $id may be a numeric string.
                  if ($compare_id == $id) {
                      continue;
                  }
                  $compare_sent = $compare_row['text'];
                  $compare_words = words_of_sentence($compare_sent);
                  $jaccard_score = jaccard_similarity($current_words, $compare_words);
                  if ($jaccard_score > 0.3) {
                      print("Jaccard similarity: " . $jaccard_score . "\tSentence id:" . $compare_id . "\n");
                      print("Text: \n" . $compare_sent . "\n");
                      $current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
                  }
              }

              if ($current_sim_sent_list->size == 0) {
                  print("[-]not found.\n");
              }
              // else: $current_sim_sent_list->print_list();
          }
      }
  }
  // Sentence id from the command line. $argv is not defined under a web SAPI,
  // so read it defensively. Only the disabled call below uses it.
  $id = isset($argv[1]) ? $argv[1] : null;
  //sents_similar_to_id($id);

  // Nothing to do without an operation.
  if (!isset($_op)) {
      exit(0);
  }

  switch ($_op) {
      case "get":
          {
              // Emit up to 5 sentences whose "real" column contains the word (spaces removed),
              // topped up from "real_en" when fewer than 5 distinct texts were found.
              $Fetch = array();
              if (isset($_word)) {
                  $queryWord = str_replace(" ", "", $_word);
                  $query = "select book,paragraph,text from pali_sent where \"real\" like " . $PDO->quote("%" . $queryWord . '%') . " limit 0,5";
                  $Fetch = PDO_FetchAll($query);

                  // Remove rows that repeat an earlier row's text.
                  $newList = array();
                  foreach ($Fetch as $onerow) {
                      $found = false;
                      foreach ($newList as $new) {
                          if ($onerow["text"] == $new["text"]) {
                              $found = true;
                              break;
                          }
                      }
                      if ($found == false) {
                          array_push($newList, $onerow);
                      }
                  }
                  $Fetch = $newList;

                  if (count($Fetch) < 5) {
                      // Select the same columns as the primary query so every emitted row has the same shape.
                      $query = "select book,paragraph,text from pali_sent where \"real_en\" like " . $PDO->quote('%' . $queryWord . '%') . " limit 0,5";
                      $Fetch2 = PDO_FetchAll($query);

                      // Append only texts not already present. The rows have no "word" column, so the
                      // previous comparison on "word" (null == null) treated every row as a duplicate
                      // as soon as $Fetch was non-empty.
                      foreach ($Fetch2 as $onerow) {
                          $found = false;
                          foreach ($Fetch as $oldArray) {
                              if ($onerow["text"] == $oldArray["text"]) {
                                  $found = true;
                                  break;
                              }
                          }
                          if ($found == false) {
                              array_push($Fetch, $onerow);
                          }
                      }
                  }
              } else if (isset($_id)) {
                  // Lookup by id is not implemented; an empty list is returned.
              }
              echo json_encode($Fetch, JSON_UNESCAPED_UNICODE);
              break;
          }
  }