pali_sent.php 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. <?php
  2. //查询term字典
  3. require_once "../path.php";
  4. require_once "../public/_pdo.php";
  // Logged-in user name from the "username" cookie; "" when the cookie is missing or empty.
  $username = empty($_COOKIE["username"]) ? "" : $_COOKIE["username"];

  // Requested operation: the query string takes precedence over the POST body.
  // $_op is deliberately left unset when neither of them carries "op".
  if (isset($_GET["op"])) {
      $_op = $_GET["op"];
  } elseif (isset($_POST["op"])) {
      $_op = $_POST["op"];
  }

  // Optional search word, lower-cased; stays unset when the parameter is absent.
  if (isset($_GET["word"])) {
      $_word = mb_strtolower($_GET["word"], 'UTF-8');
  }

  // Optional sentence id, taken verbatim from the query string.
  if (isset($_GET["id"])) {
      $_id = $_GET["id"];
  }

  // Open the Pali sentence SQLite database.
  // NOTE(review): PDO_Connect presumably stores the connection in the global $PDO — confirm in ../public/_pdo.php.
  global $PDO;
  PDO_Connect("sqlite:" . _FILE_DB_PALI_SENTENCE_);

  // Optional sentence text, lower-cased; stays unset when the parameter is absent.
  if (isset($_GET["sent"])) {
      $_sent = mb_strtolower($_GET["sent"], 'UTF-8');
  }
  /**
   * Split a sentence into its content words.
   *
   * The sentence is cut at runs of spaces, ASCII digits and the punctuation
   * characters . [ ] { } - , ' ; ‘ ’ –. Empty pieces and the particles
   * "ca", "vā" and "na" are dropped.
   *
   * @param string $sent Sentence text (expected to be UTF-8).
   * @return array Sequentially indexed list of the remaining words, in sentence order.
   */
  function words_of_sentence(string $sent) {
      $delimiters = "/[ \.\[\]\{\}\-,';‘’–0123456789]+/";

      // The "u" modifier makes PCRE treat the pattern and subject as UTF-8.
      // Without it the multibyte quotes/dash in the class are matched byte by
      // byte, which tears apart words sharing one of those bytes (e.g. "Ā").
      $pieces = preg_split($delimiters . 'u', $sent, -1, PREG_SPLIT_NO_EMPTY);
      if ($pieces === false) {
          // The subject is not valid UTF-8: fall back to byte-wise splitting
          // rather than failing.
          $pieces = preg_split($delimiters, $sent, -1, PREG_SPLIT_NO_EMPTY);
      }
      if ($pieces === false) {
          return array();
      }

      $stop_words = array('ca', 'vā', 'na');
      $words = array_filter($pieces, function ($word) use ($stop_words) {
          return !in_array($word, $stop_words, true);
      });

      // array_filter keeps the original keys; hand back a plain list.
      return array_values($words);
  }
  /**
   * Jaccard similarity of two word lists: |A ∩ B| / |A ∪ B|.
   *
   * Chosen because similar sentences in the canon use nearly the same words
   * and sentence patterns. Both lists are reduced to sets first, so a word
   * repeated inside one sentence is counted once; this keeps the score
   * symmetric and within [0, 1].
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Similarity in [0, 1]; 0 when both lists are empty.
   */
  function jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $set1 = array_unique($words_of_sent1);
      $set2 = array_unique($words_of_sent2);

      $intersect = count(array_intersect($set1, $set2));
      $union = count($set1) + count($set2) - $intersect;

      // Two empty sets have no union; define their similarity as 0.
      if ($union === 0) {
          return 0;
      }
      return $intersect / $union;
  }
  /**
   * Order-aware Jaccard similarity.
   *
   * Averages the Jaccard similarity of the leading $i words of both
   * sentences for $i = 1 .. $k, where $k is the length of the shorter
   * sentence. Results are currently mediocre.
   * TODO: add to the score when the slices are identical.
   *
   * @param array $words_of_sent1 Words of the first sentence, in order.
   * @param array $words_of_sent2 Words of the second sentence, in order.
   * @return int|float Average prefix similarity; 0 when either sentence has no words.
   */
  function ordered_jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $k = min(count($words_of_sent1), count($words_of_sent2));

      // No prefixes to compare; also avoids dividing by zero below.
      if ($k === 0) {
          return 0;
      }

      $score = 0;
      // Include the full-length prefix ($i == $k): the sum is divided by $k,
      // so $k terms are needed for identical sentences to reach 1.
      for ($i = 1; $i <= $k; $i++) {
          $score += jaccard_similarity(
              array_slice($words_of_sent1, 0, $i),
              array_slice($words_of_sent2, 0, $i)
          );
      }
      return $score / $k;
  }
  /**
   * Node of the singly linked list that keeps similar sentences ordered by
   * their similarity score.
   */
  class sim_sent_node {
      /** @var mixed Sentence id (null for a node created without arguments). */
      public $id = null;

      /** @var mixed Jaccard score attached to this sentence. */
      public $jaccard_score = null;

      /** @var sim_sent_node|null Following node, or null at the end of the list. */
      public $next = null;

      /**
       * @param mixed              $id            Sentence id.
       * @param mixed              $jaccard_score Similarity score of the sentence.
       * @param sim_sent_node|null $next          Node that follows this one.
       */
      public function __construct($id = null, $jaccard_score = null, $next = null) {
          $this->next = $next;
          $this->jaccard_score = $jaccard_score;
          $this->id = $id;
      }
  }
  /**
   * Singly linked list of similar sentences, kept in descending order of
   * Jaccard score.
   */
  class sim_sent_list {
      /** @var sim_sent_node Dummy head node; the first real entry is $head->next. */
      public $head;

      /** @var int Number of entries added so far. */
      public $size;

      public function __construct() {
          $this->size = 0;
          $this->head = new sim_sent_node();
      }

      /**
       * Insert a sentence at the position given by its score.
       *
       * The new node goes after every node whose score is strictly greater,
       * i.e. in front of existing nodes with an equal score.
       *
       * @param mixed $id            Sentence id.
       * @param mixed $jaccard_score Similarity score used as the sort key.
       */
      public function jaccard_add($id, $jaccard_score) {
          $cursor = $this->head;
          while (true) {
              $candidate = $cursor->next;
              if ($candidate == null || !($candidate->jaccard_score > $jaccard_score)) {
                  break;
              }
              $cursor = $candidate;
          }
          $cursor->next = new sim_sent_node($id, $jaccard_score, $cursor->next);
          $this->size++;
      }

      /**
       * Comma-separated sentence ids in list order.
       *
       * @return string|null The joined ids, or null when the list is empty.
       */
      public function get_text_list() {
          if ($this->size == 0) {
              return;
          }
          $joined = "";
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              $joined .= "," . $node->id;
          }
          // Drop the leading comma.
          return substr($joined, 1);
      }

      /**
       * Print one "id<TAB>score" line per entry, in list order.
       * Prints nothing when the list is empty.
       */
      public function print_list() {
          if ($this->size == 0) {
              return;
          }
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              print($node->id . "\t" . $node->jaccard_score . "\n");
          }
      }
  }
  /**
   * Store the list of similar sentences for one sentence in the database.
   *
   * Writes $text_list into the sim_sents column of the pali_sent row whose id
   * is $current_id.
   *
   * The sim_sents column has to exist first. It can be added once with:
   *     $add_column = "ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT";
   *     $Action_add = PDO_Execute($add_column);
   *     $query = "PRAGMA TABLE_INFO (pali_sent)";
   *     $Fetch = PDO_FetchALL($query);
   *     print_r($Fetch);
   *
   * @param mixed  $current_id Id of the row to update.
   *                           NOTE(review): inserted into the SQL unquoted — callers must pass a trusted numeric id.
   * @param string $text_list  Value to store (quoted through PDO before use).
   */
  function insert_similar_sent_list_into_sqlite($current_id, $text_list) {
      global $PDO;

      $statement = sprintf(
          "UPDATE pali_sent SET sim_sents = %s WHERE id = %s",
          $PDO->quote($text_list),
          $current_id
      );
      PDO_Execute($statement);
  }
  /**
   * Pre-compute similar sentences and store them in the database.
   *
   * For every word count from 17 down to 8, each sentence with that count is
   * compared with all sentences whose count lies strictly between count-2
   * and count+2. Sentences scoring above 0.3 are collected in descending
   * score order and their ids are written to the sentence's sim_sents column.
   * Progress is printed while running.
   *
   * NOTE(review): the earlier comments mentioned other ranges (counts down to
   * 7 and up to 255, windows of ±3); only 17..8 with the window above is
   * implemented — confirm whether the remaining ranges are still wanted.
   */
  function similar_sent_matrix() {
      for ($current_count = 17; $current_count > 7; $current_count--) {
          print("单词数:".$current_count."\n");
          $current_query = "select id,text from pali_sent where count=".$current_count;
          $Current = PDO_FetchAll($current_query);
          if (!count($Current)) {
              continue;
          }

          // The comparison set depends only on the word count, so fetch and
          // tokenise it once per count instead of once per sentence.
          $compare_query = "select id,text from pali_sent where count>".($current_count-2)." and count<".($current_count+2);
          $Compare = PDO_FetchALL($compare_query);
          $compare_words_by_index = array();
          foreach ($Compare as $index => $compare_row) {
              $compare_words_by_index[$index] = words_of_sentence($compare_row['text']);
          }

          foreach ($Current as $current_row) {
              $current_id = $current_row['id'];
              $current_words = words_of_sentence($current_row['text']);
              $current_sim_sent_list = new sim_sent_list(); // similar sentences, best first

              foreach ($Compare as $index => $compare_row) {
                  // Both rows carry id and text, so equal rows mean the sentence itself.
                  if ($current_row == $compare_row) {
                      continue;
                  }
                  $jaccard_score = jaccard_similarity($current_words, $compare_words_by_index[$index]);
                  if ($jaccard_score > 0.3) {
                      $current_sim_sent_list->jaccard_add($compare_row['id'], $jaccard_score);
                  }
              }

              if ($current_sim_sent_list->size != 0) {
                  print("update ".$current_id."\n");
                  $text_list = $current_sim_sent_list->get_text_list();
                  insert_similar_sent_list_into_sqlite($current_id, $text_list);
              }
          }
      }
      return;
  }
  /**
   * Compute and print, on the fly, the sentences similar to a given sentence.
   *
   * Prints the sentence text, then every other sentence with more than 5
   * words whose Jaccard similarity to it exceeds 0.3 (score, id and text).
   * Prints "[-] too short." when the sentence has 5 words or fewer,
   * "[-]not found." when nothing is similar enough, and "[-] invalid id."
   * when $id is not numeric.
   *
   * @param mixed $id Numeric id of the sentence in pali_sent.
   */
  function sents_similar_to_id($id) {
      // The id is concatenated into the SQL below, so accept numeric values
      // only; anything else could inject SQL.
      if (!is_numeric($id)) {
          print("[-] invalid id.\n");
          return;
      }

      $query = "SELECT count,text FROM pali_sent WHERE id=".$id;
      $Current = PDO_FetchALL($query);
      if (!count($Current)) {
          return;
      }

      foreach ($Current as $current_row) {
          $current_count = $current_row['count'];
          $current_sent = $current_row['text'];
          $current_words = words_of_sentence($current_sent);
          print("current text: \n".$current_sent."\n");
          if ($current_count <= 5) {
              print("[-] too short.\n");
              return;
          }

          // Compare only against sentences with more than 5 words.
          $compare_query = "SELECT id,text FROM pali_sent WHERE count>5";
          $Compare = PDO_FetchALL($compare_query);
          $current_sim_sent_list = new sim_sent_list(); // similar sentences, best first

          foreach ($Compare as $compare_row) {
              $compare_id = $compare_row['id'];
              // Skip the sentence itself. The two result sets have different
              // columns (count,text vs id,text), so comparing whole rows can
              // never detect it; compare ids instead.
              if ($compare_id == $id) {
                  continue;
              }
              $compare_sent = $compare_row['text'];
              $compare_words = words_of_sentence($compare_sent);
              $jaccard_score = jaccard_similarity($current_words, $compare_words);
              if ($jaccard_score > 0.3) {
                  print("Jaccard similarity: ".$jaccard_score."\tSentence id:".$compare_id."\n");
                  print("Text: \n". $compare_sent."\n");
                  $current_sim_sent_list->jaccard_add($compare_id, $jaccard_score);
              }
          }

          if ($current_sim_sent_list->size == 0) {
              print("[-]not found.\n");
          }
          // Otherwise the matches were already printed above;
          // $current_sim_sent_list->print_list() would print them sorted by score.
      }
  }
  // Command-line debugging aid: `php pali_sent.php <sentence id>` prints the
  // sentences similar to the given one. Run it only from the CLI with an
  // argument; in a web request $argv[1] is not set and the printed text
  // would end up in front of the JSON response.
  if (PHP_SAPI === 'cli' && isset($argv[1])) {
      $id = $argv[1];
      sents_similar_to_id($id);
  }

  // Nothing else to do without a requested operation.
  if (!isset($_op)) {
      exit(0);
  }

  switch ($_op) {
      case "get":
      {
          // Look up sentences containing the search word and answer with JSON.
          $Fetch = array();
          if (isset($_word)) {
              $queryWord = str_replace(" ", "", $_word);
              $seenTexts = array(); // sentence texts already in $Fetch

              // First pass: match against the "real" column, keeping one row per text.
              $query = "select book,paragraph,text from pali_sent where \"real\" like ".$PDO->quote("%".$queryWord.'%')." limit 0,5";
              foreach (PDO_FetchAll($query) as $onerow) {
                  if (!in_array($onerow["text"], $seenTexts, true)) {
                      $seenTexts[] = $onerow["text"];
                      $Fetch[] = $onerow;
                  }
              }

              // Second pass, only when fewer than 5 rows were found: match
              // against the "real_en" column and append texts not present yet.
              // Rows are de-duplicated on "text", the only column this query
              // selects (the result sets have no "word" column).
              // NOTE(review): these rows lack book/paragraph, unlike the first pass — confirm that clients cope.
              if (count($Fetch) < 5) {
                  $query = "select text from pali_sent where \"real_en\" like ".$PDO->quote('%'.$queryWord.'%')." limit 0,5";
                  foreach (PDO_FetchAll($query) as $onerow) {
                      if (!in_array($onerow["text"], $seenTexts, true)) {
                          $seenTexts[] = $onerow["text"];
                          $Fetch[] = $onerow;
                      }
                  }
              }
          }
          else if (isset($_id)) {
              // Lookup by id is not implemented; an empty list is returned.
          }
          echo json_encode($Fetch, JSON_UNESCAPED_UNICODE);
          break;
      }
  }
  253. ?>