Kaynağa Gözat

Merge branch 'master' of https://github.com/visuddhinanda/mint

visuddhinanda 4 yıl önce
ebeveyn
işleme
62bb670c05

+ 7 - 7
app/fts/fts.sql

@@ -1,8 +1,8 @@
 -- 创建表结构
 
-CREATE TABLE fts (
+CREATE TABLE fts_texts (
        paragraph        integer,
-       book             varchar(10),
+       book             integer,
        wid              varchar(50),
        -- 单个出现的黑体字,权重较大
        bold_single      text,
@@ -47,7 +47,7 @@ ALTER TEXT SEARCH CONFIGURATION pali_unaccent
 
 -- 添加自动更新的 TSVECTOR 字段
 
-ALTER TABLE fts
+ALTER TABLE fts_texts
       ADD COLUMN full_text_search_weighted TSVECTOR
       GENERATED ALWAYS AS (
          setweight(to_tsvector('pali', coalesce(content,'')), 'A')  || ' ' ||
@@ -56,7 +56,7 @@ ALTER TABLE fts
          setweight(to_tsvector('pali', coalesce(bold_multiple,'')), 'D')
       ) STORED;
 
-ALTER TABLE fts
+ALTER TABLE fts_texts
       ADD COLUMN full_text_search_weighted_unaccent TSVECTOR
       GENERATED ALWAYS AS (
          setweight(to_tsvector('pali_unaccent', coalesce(content,'')), 'A')  || ' ' ||
@@ -68,10 +68,10 @@ ALTER TABLE fts
 -- 为该字段创建索引
 
 CREATE INDEX full_text_search_weighted_idx
-       ON fts USING GIN (full_text_search_weighted);
+       ON fts_texts USING GIN (full_text_search_weighted);
 
 CREATE INDEX full_text_search_weighted__unaccent_idx
-       ON fts USING GIN (full_text_search_weighted_unaccent);
+       ON fts_texts USING GIN (full_text_search_weighted_unaccent);
 
 -- 创建查询函数
 
@@ -96,7 +96,7 @@ AS $$
         websearch_to_tsquery('pali_unaccent', query_str)), -- AS rank
         paragraph, wid, bold_single, bold_double, bold_multiple, content,
         full_text_search_weighted, full_text_search_weighted_unaccent
-    FROM fts
+    FROM fts_texts
     WHERE
         full_text_search_weighted @@ websearch_to_tsquery('pali', query_str) OR
         full_text_search_weighted_unaccent @@ websearch_to_tsquery('pali_unaccent', query_str);

+ 36 - 7
app/fts/sql.php

@@ -1,5 +1,5 @@
 <?php
-
+require_once __DIR__."/../path.php";
 /*
  * 该脚本用于生成 SQL 语句, 将三藏语料 CSV 数据 (如:abh01a.att.csv)
  * 转换为 SQL 语句插入到 PostgreSQL 内,数据表结构参见 fts.sql
@@ -74,18 +74,25 @@ function count_bld ($bld_array) {
 }
 
 
+$dns = _DB_ENGIN_.":host="._DB_HOST_.";port="._DB_PORT_.";dbname="._DB_NAME_.";user="._DB_USERNAME_.";password="._DB_PASSWORD_.";";
+$dbh_fts = new PDO($dns, _DB_USERNAME_, _DB_PASSWORD_, array(PDO::ATTR_PERSISTENT => true));
+$dbh_fts->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
+
 // 查找 tmp/palicsv/ 目录下的语料数据
-$palicsv_path = '../../tmp/palicsv/';
+$palicsv_path = __DIR__.'/../../tmp/palicsv/';
 $scan = scandir($palicsv_path);
+$fileCounter = 0;
 foreach($scan as $foldername) {
   if (is_dir("$palicsv_path/$foldername")) {
+
     $csv_file = "$palicsv_path/$foldername/$foldername.csv";
 
     // DEBUG
     // if ($foldername != 'abh01m.mul') continue;
 
     if (is_file($csv_file)) {
-      echo '正在处理文件: ' . PHP_EOL . $csv_file . PHP_EOL;
+      $fileCounter++;      
+      echo "正在处理文件: $fileCounter" . PHP_EOL . $csv_file . PHP_EOL;
       // 存放当前正在处理的 CSV 文件生成的所有 SQL
       $sql_from_csv = '';
       // 初始化段落为 0 (没有这种段落)
@@ -94,6 +101,21 @@ foreach($scan as $foldername) {
       $bold_text = [];
       
       if (($handle = fopen($csv_file, "r")) !== FALSE) {
+        # 获取book id
+        $data = fgetcsv($handle, 1000, ",");
+        $data = fgetcsv($handle, 1000, ",");
+        $bookId = (int)mb_substr($data[2],1);
+        #删除旧数据
+        $query = "DELETE FROM "._TABLE_FTS_." WHERE book=?";
+        $stmt = $dbh_fts->prepare($query);
+        $stmt->execute(array($bookId));
+
+        // 开始一个事务,关闭自动提交
+        $dbh_fts->beginTransaction();
+        $query = "INSERT INTO "._TABLE_FTS_." (book , paragraph , wid,bold_single,bold_double,bold_multiple,content) VALUES ( ? , ? , ? , ? , ? , ? , ?  )";
+        $stmt = $dbh_fts->prepare($query);
+
+        rewind($handle);
         $row=0;
         while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) {
           #忽略第一行
@@ -142,14 +164,13 @@ foreach($scan as $foldername) {
                   $bold_multiple = "";
                 }
                 
-                $sql_from_csv .=
-                  "INSERT INTO fts VALUES ($paragraph, '$book', '$wid', '$bold_single', '$bold_double', '$bold_multiple', '$content');" . PHP_EOL;
-                // 转换后,重置黑体字数据
+                $stmt->execute(array($book, $paragraph, $wid,$bold_single,$bold_double,$bold_multiple,$content));
+                  // 转换后,重置黑体字数据
                 $bold_text = [];
               }
               // 如果是不同段落,则赋新的值
               $content = $current_word;
-              $paragraph =  $data[3];
+              $paragraph =  (int)$data[3];
               $book = (int)mb_substr($data[2],1);
               $wid =  $data[1];
 
@@ -160,6 +181,14 @@ foreach($scan as $foldername) {
 
         }
         fclose($handle);
+        // 提交更改
+        $dbh_fts->commit();
+        if (!$stmt || ($stmt && $stmt->errorCode() != 0)) {
+            $error = $dbh_fts->errorInfo();
+            echo "error - $error[2]".PHP_EOL;
+        } else {
+            echo "updata $row recorders.".PHP_EOL;
+        }	
       }
 
       file_put_contents("./sql/$foldername.sql", $sql_from_csv);

+ 3 - 0
app/path.sample.php

@@ -340,3 +340,6 @@ define("_FILE_DB_USER_STATISTICS_", "sqlite:" . __DIR__ . "/../tmp/user/statisti
 #权限管理 casbin使用
 //define("_FILE_DB_USER_RBAC_", _DB_ENGIN_.":host="._DB_HOST_.";port="._DB_PORT_.";dbname="._DB_NAME_.";user="._DB_USERNAME_.";password="._DB_PASSWORD_.";");
 define("_FILE_DB_USER_RBAC_",  __DIR__ . "/../tmp/user/rbac.db3");
+
+# 全文搜索
+define("_TABLE_FTS_", "fts_texts");

+ 8 - 1
app/search/paliword_sc.php

@@ -44,12 +44,19 @@ if (isset($_GET["page"])) {
 if (count($arrWordList) > 1) {
 	# 查询多个词
 	$out_data = array();
+    /*
     PDO_Connect(_FILE_DB_PALITEXT_);
     # 首先精确匹配
     $words = implode(" ", $arrWordList);
     $query = "SELECT book,paragraph, text FROM "._TABLE_PALI_TEXT_." WHERE text like ?  LIMIT ? OFFSET ?";
     $Fetch1 = PDO_FetchAll($query, array("%{$words}%", $_pagesize, $_page * $_pagesize));
+    */
+    $dns = _DB_ENGIN_.":host="._DB_HOST_.";port="._DB_PORT_.";dbname="._DB_NAME_.";user="._DB_USERNAME_.";password="._DB_PASSWORD_.";";
+    PDO_Connect(_FILE_DB_PALITEXT_,_DB_USERNAME_,_DB_PASSWORD_);
 
+
+    $query = "SELECT ts_rank('{0.1, 0.2, 0.4, 1}', full_text_search_weighted, websearch_to_tsquery('pali', ?)) AS rank, book,paragraph,content as text FROM fts WHERE full_text_search_weighted @@ websearch_to_tsquery('pali', ?) ORDER BY rank DESC LIMIT 20";
+    $Fetch1 = PDO_FetchAll($query, array($word, $word));    
     foreach ($Fetch1 as $key => $value) {
         # code...
         $newRecode["title"] = $_dbPaliText->getTitle($value["book"], $value["paragraph"]);
@@ -61,7 +68,7 @@ if (count($arrWordList) > 1) {
         $newRecode["wt"] = 0;
         $out_data[] = $newRecode;
     }
-	$result["time"][] = array("event" => "精确匹配结束", "time" => microtime(true)-$_start);
+	$result["time"][] = array("event" => "fts精确匹配结束", "time" => microtime(true)-$_start);
 	/*
     #然后查分散的
     $strQuery = "";

+ 13 - 0
db/postgresql/migrations/2021-12-03-150805_fts_texts/down.sql

@@ -0,0 +1,13 @@
+-- This file should undo anything in `up.sql`
+-- NOTE(review): the "unaccent" extension that up.sql creates with IF NOT EXISTS is not dropped here; presumably intentional, since it may pre-date this migration. Confirm.
+DROP TABLE fts_texts ; -- also removes the table's generated tsvector columns and their indexes
+
+DROP TEXT SEARCH CONFIGURATION pali; -- configurations are dropped before the dictionaries they are mapped to
+
+DROP TEXT SEARCH CONFIGURATION pali_unaccent;
+
+DROP TEXT SEARCH DICTIONARY pali_stem;
+
+DROP TEXT SEARCH DICTIONARY pali_stopwords;
+
+DROP FUNCTION query_pali; -- query function created at the end of up.sql

+ 105 - 0
db/postgresql/migrations/2021-12-03-150805_fts_texts/up.sql

@@ -0,0 +1,105 @@
+-- Sets up Pali full-text search: the fts_texts table, the pali / pali_unaccent text search configurations and their dictionaries, generated tsvector columns, GIN indexes, and the query_pali function.
+
+-- Create the table structure
+
+CREATE TABLE fts_texts (
+       paragraph        integer,
+       book             integer,
+       wid              varchar(50),
+       -- Bold words that appear singly; higher weight
+       bold_single      text,
+       -- Bold words that appear in pairs; medium weight
+       bold_double      text,
+       -- Runs of three or more consecutive bold words; lower weight
+       bold_multiple    text,
+       content          text
+);
+
+-- Create the full-text search configuration pali
+CREATE TEXT SEARCH CONFIGURATION pali ( parser = pg_catalog.default );
+
+-- Create the full-text search configuration pali_unaccent (variant without diacritics)
+CREATE TEXT SEARCH CONFIGURATION pali_unaccent ( parser = pg_catalog.default );
+
+-- Create the Pali word-form conversion dictionary
+CREATE TEXT SEARCH DICTIONARY pali_stem (
+    TEMPLATE = synonym,
+    SYNONYMS = pali
+);
+
+-- Create the Pali stop-word dictionary
+CREATE TEXT SEARCH DICTIONARY pali_stopwords (
+    TEMPLATE = pg_catalog.simple, STOPWORDS = pali,
+    ACCEPT = true
+);
+
+-- Alter the pali configuration to use the dictionaries created above
+ALTER TEXT SEARCH CONFIGURATION pali
+    ADD MAPPING FOR asciiword, word, hword_part, hword_asciipart
+    WITH pali_stem, pali_stopwords;
+
+-- Alter the pali_unaccent configuration to use the dictionaries created above, preceded by unaccent
+
+CREATE EXTENSION IF NOT EXISTS "unaccent";
+
+ALTER TEXT SEARCH CONFIGURATION pali_unaccent
+    ADD MAPPING FOR asciiword, word, hword_part, hword_asciipart
+    WITH unaccent, pali_stem, pali_stopwords;
+
+
+-- Add automatically maintained (generated, stored) TSVECTOR columns
+
+ALTER TABLE fts_texts
+      ADD COLUMN full_text_search_weighted TSVECTOR
+      GENERATED ALWAYS AS (
+         setweight(to_tsvector('pali', coalesce(content,'')), 'A')  || ' ' ||
+         setweight(to_tsvector('pali', coalesce(bold_single,'')), 'B') || ' '  ||
+         setweight(to_tsvector('pali', coalesce(bold_double,'')), 'C') || ' ' ||
+         setweight(to_tsvector('pali', coalesce(bold_multiple,'')), 'D')
+      ) STORED;
+
+ALTER TABLE fts_texts
+      ADD COLUMN full_text_search_weighted_unaccent TSVECTOR
+      GENERATED ALWAYS AS (
+         setweight(to_tsvector('pali_unaccent', coalesce(content,'')), 'A')  || ' ' ||
+         setweight(to_tsvector('pali_unaccent', coalesce(bold_single,'')), 'B') || ' '  ||
+         setweight(to_tsvector('pali_unaccent', coalesce(bold_double,'')), 'C') || ' ' ||
+         setweight(to_tsvector('pali_unaccent', coalesce(bold_multiple,'')), 'D')
+      ) STORED;
+
+-- Create GIN indexes on the two generated columns
+
+CREATE INDEX full_text_search_weighted_idx
+       ON fts_texts USING GIN (full_text_search_weighted);
+
+CREATE INDEX full_text_search_weighted__unaccent_idx
+       ON fts_texts USING GIN (full_text_search_weighted_unaccent);
+
+-- Create the query function: returns rows matching query_str under either configuration, with rank = sum of both ts_rank scores
+
+CREATE OR REPLACE FUNCTION query_pali(query_str TEXT) 
+  RETURNS TABLE(
+          rank NUMERIC,
+          paragraph INTEGER,
+          wid VARCHAR,
+          bold_single TEXT,
+          bold_double TEXT,
+          bold_multiple TEXT,
+          content TEXT,
+          full_text_search_weighted TSVECTOR,
+          full_text_search_weighted_unaccent TSVECTOR) 
+AS $$
+    SELECT
+    ts_rank('{0.1, 0.2, 0.4, 1}',
+        full_text_search_weighted,
+        websearch_to_tsquery('pali', query_str)) +
+    ts_rank('{0.1, 0.2, 0.4, 1}',
+        full_text_search_weighted_unaccent,
+        websearch_to_tsquery('pali_unaccent', query_str)), -- AS rank
+        paragraph, wid, bold_single, bold_double, bold_multiple, content,
+        full_text_search_weighted, full_text_search_weighted_unaccent
+    FROM fts_texts
+    WHERE
+        full_text_search_weighted @@ websearch_to_tsquery('pali', query_str) OR
+        full_text_search_weighted_unaccent @@ websearch_to_tsquery('pali_unaccent', query_str);
+$$ LANGUAGE SQL;