Просмотр исходного кода

Merge pull request #592 from VitoVan/master

全文检索:词形转换词典更新方式说明 + PHP 调用样例
visuddhinanda 4 лет назад
Родитель
Сommit
7ec1298249
4 измененных файлов с 173 добавлено и 4 удалено
  1. 66 1
      app/fts/README.md
  2. 103 0
      app/fts/example.php
  3. BIN
      app/fts/example.png
  4. 4 3
      app/fts/fts.sql

+ 66 - 1
app/fts/README.md

@@ -56,6 +56,58 @@ php -d memory_limit=1024M pali.syn.php
 
 将会在当前目录下生成 pali.syn 文件
 
+**注意 01**
+
+变更 pali.syn 词形转换词典后,有两种方式使其生效(任选其一):
+
+1. 在当前会话 (session) 内执行下面语句:
+
+关于使用下面语句的原因,请参见:[PostgreSQL / Dictionaries / Simple Dictionary - Caution ("dummy" update)](https://www.postgresql.org/docs/14/textsearch-dictionaries.html#TEXTSEARCH-SIMPLE-DICTIONARY)
+
+```sql
+-- Dummy update: re-issue the unchanged SYNONYMS option so the edited pali.syn takes effect.
+ALTER TEXT SEARCH DICTIONARY pali_stem (SYNONYMS = pali);
+```
+
+2. 断开当前会话 (session),重新连接。
+
+**注意 02**
+
+在 pali.syn 内添加词形转换时,请向文件末尾添加,实际测试发现在文件头部添加并未生效。
+
+**注意 03**
+
+更新 pali.syn 这个变更涉及到所有三藏文本中的词形转换,所以需要手动重建全文检索索引。
+
+两种方式(任选其一):
+
+1. 你知道该变更会影响哪些记录(比如先进行搜索得出结果)
+
+```sql
+-- Dummy update: each listed column is assigned its own current value.
+-- Replace the book and paragraph values with those of the target record.
+UPDATE fts
+SET
+    content = content,
+    bold_single = bold_single,
+    bold_double = bold_double,
+    bold_multiple = bold_multiple
+WHERE book = 'p180'
+    AND paragraph = 37;
+```
+
+请将 paragraph 和 book 替换为目标值。
+
+2. 这是一个普遍的变更,会影响到很多记录
+
+```sql
+-- Dummy update of the whole table: the WHERE clause is omitted on purpose,
+-- so every row in fts is rewritten with its own current values.
+UPDATE fts
+SET
+    content = content,
+    bold_single = bold_single,
+    bold_double = bold_double,
+    bold_multiple = bold_multiple;
+```
+
+移除掉 WHERE 条件,将会触发所有记录重新建立索引,会花上很长时间,执行之前请三思。
+
 #### pali.stop 停用词词典
 
 请熟悉巴利语的贤者依据巴利文法编辑 [pali.stop](./pali.stop) 文件,
@@ -136,7 +188,7 @@ done
 
 数据插入时会同步创建索引,耗时较久,请耐心等待。
 
-### 查询数据:
+### 使用 SQL 查询数据:
 
 权重设置:
 
@@ -758,3 +810,16 @@ SELECT
 	</tr>
 </table>
 </details>
+
+### 使用 PHP 查询数据:
+
+可参考 [example.php](./example.php),在当前目录下执行:
+
+```bash
+php -d memory_limit=1024M -S 127.0.0.1:8000
+```
+
+即可通过浏览器测试效果:
+
+![Example](./example.png "Example Screenshot")

+ 103 - 0
app/fts/example.php

@@ -0,0 +1,103 @@
+<html>
+  <head>
+    <title>Pali Full Text Search Example @ PostgreSQL</title>
+    <style>
+     * {
+         font-family: "Noto Sans", "Noto Sans SC", "Noto Sans TC", "Padauk", "ATaiThamKHNewV3-Normal", Arial, Verdana;
+     }
+     td {
+         border-right-style: solid;
+         border-top-style: solid;
+     }
+     table {
+         border-style: solid;
+     }
+     table span {
+         background-color: yellow;
+         font-size: 1.2em;
+     }
+     th {
+         font-weight: bold;
+     }
+     input[name="q"] {
+         width: 70%;
+     }
+    </style>
+  </head>
+  
+  <body>
+    <?php
+    // Search text submitted through the form below. It defaults to an empty
+    // string so that a plain GET request never reads an undefined variable.
+    $q = '';
+    if ($_SERVER["REQUEST_METHOD"] == "POST" && isset($_POST['q']) && is_string($_POST['q'])) {
+      // collect value of input field
+      $q = $_POST['q'];
+    }
+    ?>
+    <!-- Both echoed values are HTML-escaped: PHP_SELF and the query text are request-controlled. -->
+    <form method="post" action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
+      Name: <input type="text" name="q" value="<?php echo htmlspecialchars($q, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
+      <input type="submit">
+    </form>
+    <?php
+    // Strict comparison instead of empty(), so that a query such as "0" is still searched.
+    if ($q === '') {
+      echo "Query is empty";
+    } else {
+      // Connecting, selecting database.
+      // NOTE(review): credentials are hard-coded for this example only.
+      // pg_last_error() is not used here because it needs an open connection,
+      // which is exactly what is missing when pg_connect() fails.
+      $dbconn = pg_connect("host=localhost dbname=pali user=postgres password=123456")
+      or die('Could not connect to the database');
+
+      // Performing SQL query.
+      // The search text is bound as parameter $1 (escaped as \$1 inside this
+      // double-quoted PHP string) instead of being interpolated into the SQL,
+      // so user input can never change the structure of the statement.
+      $query = "SELECT
+                 ts_rank('{0.1, 0.2, 0.4, 1}',
+                     full_text_search_weighted,
+                     websearch_to_tsquery('pali', \$1)) +
+                 ts_rank('{0.1, 0.2, 0.4, 1}',
+                     full_text_search_weighted_unaccent,
+                     websearch_to_tsquery('pali_unaccent', \$1))
+                 AS rank,
+                 ts_headline('simple', content,
+                              websearch_to_tsquery('simple', \$1),
+                              'StartSel = <span>, StopSel = </span>')
+                 AS highlight,
+                 *
+                 FROM fts
+                 WHERE
+                     full_text_search_weighted
+                     @@ websearch_to_tsquery('pali', \$1) OR
+                     full_text_search_weighted_unaccent
+                     @@ websearch_to_tsquery('pali_unaccent', \$1)
+                 ORDER BY rank DESC
+                 LIMIT 20;";
+      $result = pg_query_params($dbconn, $query, array($q))
+      or die('Query failed: ' . htmlspecialchars(pg_last_error($dbconn), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'));
+
+      // Printing results in HTML
+      echo "<table>\n";
+      echo "<tr>
+                 <th>rank</th>
+                 <th>highlight</th>
+                 <th>paragraph</th>
+                 <th>book</th>
+                 <th>wid</th>
+                 <th>bold_single</th>
+                 <th>bold_double</th>
+                 <th>bold_multiple</th>
+                 <th>content</th>
+                 <th>TSVECTOR</th>
+                 <th>TSVECTOR (unaccent)</th>
+              </tr>";
+      while ($line = pg_fetch_array($result, null, PGSQL_ASSOC)) {
+        echo "\t<tr>\n";
+        foreach ($line as $col_name => $col_value) {
+          // Escape every value before it is written into the page; the cast
+          // turns SQL NULL (PHP null) into an empty string.
+          $cell = htmlspecialchars((string) $col_value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+          if ($col_name === 'highlight') {
+            // Restore only the <span> markers that ts_headline() inserted,
+            // so matches stay highlighted while any other markup stays escaped.
+            $cell = str_replace(
+              array('&lt;span&gt;', '&lt;/span&gt;'),
+              array('<span>', '</span>'),
+              $cell
+            );
+          }
+          echo "\t\t<td><div class='cell'>$cell</div></td>\n";
+        }
+        echo "\t</tr>\n";
+      }
+      echo "</table>\n";
+
+      // Free resultset
+      pg_free_result($result);
+
+      // Closing connection
+      pg_close($dbconn);
+    }
+    ?>
+  </body>
+</html>

BIN
app/fts/example.png


+ 4 - 3
app/fts/fts.sql

@@ -32,6 +32,7 @@ CREATE TEXT SEARCH DICTIONARY pali_stopwords (
 );
 
 -- Alter the full-text search configuration pali to use the dictionaries we created
+
 ALTER TEXT SEARCH CONFIGURATION pali
     ADD MAPPING FOR asciiword, word, hword_part, hword_asciipart
     WITH pali_stem, pali_stopwords;
@@ -70,12 +71,12 @@ ALTER TABLE fts
 -- GIN indexes on the full_text_search_weighted and
 -- full_text_search_weighted_unaccent columns of fts.
 CREATE INDEX full_text_search_weighted_idx
        ON fts USING GIN (full_text_search_weighted);
 
-CREATE INDEX full_text_search_weighted__unaccent_idx
+CREATE INDEX full_text_search_weighted_unaccent_idx
        ON fts USING GIN (full_text_search_weighted_unaccent);
 
 -- 创建查询函数
 
-CREATE OR REPLACE FUNCTION query_pali(query_str TEXT) 
+CREATE OR REPLACE FUNCTION query_pali(query_str TEXT)
   RETURNS TABLE(
           rank NUMERIC,
           paragraph INTEGER,
@@ -85,7 +86,7 @@ CREATE OR REPLACE FUNCTION query_pali(query_str TEXT)
           bold_multiple TEXT,
           content TEXT,
           full_text_search_weighted TSVECTOR,
-          full_text_search_weighted_unaccent TSVECTOR) 
+          full_text_search_weighted_unaccent TSVECTOR)
 AS $$
     SELECT
     ts_rank('{0.1, 0.2, 0.4, 1}',