pali_sent.php 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. <?php
  // Query the term dictionary.
  require_once "../path.php";
  require_once "../public/_pdo.php";

  // Login check: take the user name from the cookie, or fall back to an
  // empty string when the cookie is missing or empty.
  $username = empty($_COOKIE["username"]) ? "" : $_COOKIE["username"];

  // The operation may arrive via GET or POST; GET wins when both are present.
  // $_op is deliberately left undefined when neither carries an "op".
  $requested_op = $_GET["op"] ?? $_POST["op"] ?? null;
  if ($requested_op !== null) {
      $_op = $requested_op;
  }

  // Optional search word, lower-cased for matching.
  if (isset($_GET["word"])) {
      $_word = mb_strtolower($_GET["word"], 'UTF-8');
  }

  // Optional sentence id.
  if (isset($_GET["id"])) {
      $_id = $_GET["id"];
  }

  // Open the Pali sentence database through the shared PDO helper.
  global $PDO;
  PDO_Connect("sqlite:"._FILE_DB_PALI_SENTENCE_);

  // Optional sentence text, lower-cased.
  if (isset($_GET["sent"])) {
      $_sent = mb_strtolower($_GET["sent"], 'UTF-8');
  }
  /**
   * Splits a sentence into its words.
   *
   * Spaces, the listed punctuation marks and ASCII digits act as separators.
   * Empty fragments and the particles "ca", "vā" and "na" are dropped.
   * The original array keys produced by the split are preserved (the result
   * is not re-indexed).
   *
   * @param string $sent Sentence text, expected to be UTF-8.
   * @return array Words of the sentence.
   */
  function words_of_sentence(string $sent) {
      $separators = "[ \.\[\]\{\}\-,';‘’–0123456789]+";

      // The "u" modifier makes PCRE treat ‘ ’ – as whole characters. Without it
      // their individual bytes become separators and cut other multibyte
      // letters (e.g. "Ā" = C4 80) into invalid fragments.
      $words = preg_split("/" . $separators . "/u", $sent);
      if ($words === false) {
          // Input is not valid UTF-8: fall back to the byte-wise split.
          $words = preg_split("/" . $separators . "/", $sent);
      }
      if ($words === false) {
          return array();
      }

      $stop_words = array('ca', 'vā', 'na');
      return array_filter($words, function ($word) use ($stop_words) {
          // Drop empty fragments and the stop words ca, vā and na.
          return $word !== '' && !in_array($word, $stop_words, true);
      });
  }
  /**
   * Jaccard similarity of two word lists: |A ∩ B| / |A ∪ B|.
   *
   * Jaccard is used because similar sentences in the canon share very
   * similar words and sentence patterns.
   *
   * Both inputs are reduced to sets of distinct words first. Without that,
   * repeated words made the score depend on argument order and let it
   * exceed 1.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float 0 when nothing is shared, otherwise a value in (0, 1].
   */
  function jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $set1 = array_unique($words_of_sent1);
      $set2 = array_unique($words_of_sent2);

      $intersect = count(array_intersect($set1, $set2));
      if ($intersect === 0) {
          return 0;
      }

      // |A ∪ B| = |A| + |B| - |A ∩ B|
      $union = count($set1) + count($set2) - $intersect;
      return $intersect / $union;
  }
  /**
   * Order-aware Jaccard variant: sums the Jaccard similarity of the leading
   * slices of both word lists and divides by the shorter list's length.
   *
   * Current results are mediocre. TODO: add to the score when the slices
   * are identical.
   *
   * NOTE(review): the loop stops before the full-length prefix ($i < $k), so
   * identical sentences never reach 1 and one-word sentences always score 0.
   * Confirm whether "$i <= $k" was intended.
   *
   * @param array $words_of_sent1 Words of the first sentence.
   * @param array $words_of_sent2 Words of the second sentence.
   * @return int|float Accumulated prefix score divided by the shorter length;
   *                   0 when either list is empty.
   */
  function ordered_jaccard_similarity($words_of_sent1, $words_of_sent2) {
      $k = min(count($words_of_sent1), count($words_of_sent2));
      if ($k === 0) {
          // Nothing to compare; also avoids dividing by zero below.
          return 0;
      }

      $score = 0;
      for ($i = 1; $i < $k; $i++) {
          $score += jaccard_similarity(
              array_slice($words_of_sent1, 0, $i),
              array_slice($words_of_sent2, 0, $i)
          );
      }
      return $score / $k;
  }
  /**
   * Singly linked list node holding one similar sentence, so that sentence
   * ids can be kept ordered by similarity.
   */
  class sim_sent_node {
      /** Sentence id (null for a placeholder node). */
      public $id;

      /** Jaccard score associated with this sentence id. */
      public $jaccard_score;

      /** Following node, or null at the end of the list. */
      public $next;

      /**
       * @param mixed $id            Sentence id.
       * @param mixed $jaccard_score Similarity score for that sentence.
       * @param mixed $next          Node that follows this one, if any.
       */
      public function __construct($id = null, $jaccard_score = null, $next = null) {
          $this->next = $next;
          $this->jaccard_score = $jaccard_score;
          $this->id = $id;
      }
  }
  /**
   * Linked list of similar sentences, kept in descending jaccard_score order.
   */
  class sim_sent_list {
      /** Head node; a dummy (sentinel) node that carries no sentence. */
      public $head;

      /** Number of sentences stored in the list. */
      public $size;

      public function __construct() {
          $this->size = 0;
          $this->head = new sim_sent_node();
      }

      /**
       * Inserts a sentence id at the position given by its jaccard_score.
       *
       * The new node goes in front of the first node whose score is not
       * strictly greater, so among equal scores the newest entry comes first.
       */
      public function jaccard_add($id, $jaccard_score) {
          $cursor = $this->head;
          while ($cursor->next != null && $cursor->next->jaccard_score > $jaccard_score) {
              $cursor = $cursor->next;
          }
          // Splice in after $cursor; when $cursor is the tail, its next is
          // null and the new node simply becomes the new tail.
          $cursor->next = new sim_sent_node($id, $jaccard_score, $cursor->next);
          $this->size++;
      }

      /**
       * Debug helper: prints "id<TAB>score" for every entry, one per line.
       */
      public function print_list() {
          if ($this->size == 0) {
              return;
          }
          for ($node = $this->head->next; $node != null; $node = $node->next) {
              print($node->id."\t".$node->jaccard_score."\n");
          }
      }
  }
  /**
   * TODO: store the similar-sentence list of $current_id in the database.
   * Currently a stub that does nothing.
   *
   * @param mixed $current_id Id of the sentence the list belongs to.
   * @param mixed $list       Its list of similar sentences.
   * @return void
   */
  function insert_similar_sent_list_into_sqlite($current_id, $list) {
      /* Use this code first to add a sim_sents column to the database:
      $add_column = "ALTER TABLE pali_sent ADD COLUMN sim_sents TEXT";
      $Action_add = PDO_Execute($add_column);
      $query = "PRAGMA TABLE_INFO (pali_sent)";
      $Fetch = PDO_FetchALL($query);
      print_r($Fetch);
      */
  }
  /**
   * Compares every sentence in a window of the pali_sent table against every
   * other sentence in that window. For each sentence that has similar ones,
   * prints its similar-sentence list and passes it on to
   * insert_similar_sent_list_into_sqlite().
   *
   * TODO: similar sentences in the canon are probably located close to each
   * other, which could be used to reduce the number of comparisons.
   *
   * @param int       $offset    First row of the window (SQL "limit offset,count").
   * @param int       $limit     Number of rows in the window.
   * @param int       $min_words Only sentences with more words than this are compared.
   * @param int|float $min_score Only scores strictly above this are recorded.
   * @return void
   */
  function similar_sent_matrix($offset = 40000, $limit = 1000, $min_words = 5, $min_score = 0.3) {
      $query = "select id,text from pali_sent limit " . (int) $offset . "," . (int) $limit;
      // Debug alternative:
      // $query = "select id,text from pali_sent where id = 10872 or id = 10716";
      $Fetch = PDO_FetchAll($query);

      // Tokenize each row once up front, instead of re-splitting every
      // comparison row inside the inner loop.
      $words_by_row = array();
      foreach ($Fetch as $row_key => $row) {
          $words_by_row[$row_key] = words_of_sentence($row['text']);
      }

      foreach ($Fetch as $current_key => $current_row) {
          $current_words = $words_by_row[$current_key];
          if (count($current_words) <= $min_words) {
              continue; // sentence too short to compare
          }

          $current_id = $current_row['id'];
          $current_sim_sent_list = new sim_sent_list(); // new similar-sentence list

          foreach ($Fetch as $compare_key => $compare_row) {
              if ($current_row == $compare_row) {
                  continue; // do not compare a row with itself
              }
              $compare_words = $words_by_row[$compare_key];
              if (count($compare_words) <= $min_words) {
                  continue;
              }
              $jaccard_score = jaccard_similarity($current_words, $compare_words);
              if ($jaccard_score > $min_score) {
                  $current_sim_sent_list->jaccard_add($compare_row['id'], $jaccard_score);
              }
          }

          if ($current_sim_sent_list->size != 0) {
              print("sim_sent_list of ".$current_id.":\n");
              $current_sim_sent_list->print_list();
              insert_similar_sent_list_into_sqlite($current_id, $current_sim_sent_list);
          }
      }
  }
  // NOTE(review): this runs the whole similarity scan, including its debug
  // printing, on every request and before any JSON is emitted. Confirm that
  // it is meant to stay enabled outside of experiments.
  similar_sent_matrix();

  // Without an "op" parameter there is nothing further to serve.
  if (!isset($_op)) {
      exit(0);
  }

  switch ($_op) {
      case "get":
      {
          // "get": look up sentences containing the requested word and
          // answer with a JSON array of rows.
          $Fetch = array();
          if (isset($_word)) {
              $queryWord = str_replace(" ", "", $_word);

              // Sentence texts already collected; used to drop duplicates.
              $seenText = array();

              // First try the "real" column.
              $query = "select book,paragraph,text from pali_sent where \"real\" like ".$PDO->quote("%".$queryWord.'%')." limit 0,5";
              foreach (PDO_FetchAll($query) as $onerow) {
                  $textKey = (string) $onerow["text"];
                  if (!isset($seenText[$textKey])) {
                      $seenText[$textKey] = true;
                      $Fetch[] = $onerow;
                  }
              }

              // Fewer than five hits: top up from the "real_en" column.
              // The same columns are selected as above so every row has the
              // same shape, and duplicates are detected on "text". The rows
              // have no "word" key, so comparing on it treated every row as a
              // duplicate.
              if (count($Fetch) < 5) {
                  $query = "select book,paragraph,text from pali_sent where \"real_en\" like ".$PDO->quote('%'.$queryWord.'%')." limit 0,5";
                  foreach (PDO_FetchAll($query) as $onerow) {
                      $textKey = (string) $onerow["text"];
                      if (!isset($seenText[$textKey])) {
                          $seenText[$textKey] = true;
                          $Fetch[] = $onerow;
                      }
                  }
              }
          }
          else if (isset($_id)) {
              // Lookup by id is not implemented; an empty list is returned.
          }
          echo json_encode($Fetch, JSON_UNESCAPED_UNICODE);
          break;
      }
  }
  200. ?>