| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795 |
- <?php
- // api-v8/app/Services/OpenSearchService.php
- namespace App\Services;
- use OpenSearch\GuzzleClientFactory;
- use Illuminate\Support\Facades\Log;
- use GuzzleHttp\Client;
- use Illuminate\Support\Facades\Cache;
- use Exception;
- class OpenSearchService
- {
/** OpenSearch client created by GuzzleClientFactory in the constructor. */
protected $client;

/** Guzzle HTTP client whose base URI is the OpenAI v1 API. */
protected $http;

/** OpenAI API key, read via env('OPENAI_API_KEY') in the constructor. */
protected $openaiApiKey;

/** Fields excluded from `_source` by search(). */
private $sourceExcludes = [
    'title.suggest',
    'content.suggest',
];

/**
 * Default weights per search mode.
 *
 * Field names map to the boost appended as `field^boost`; the `hybrid`
 * mode additionally carries `fuzzy_ratio` / `semantic_ratio`, which are
 * used as boosts for the lexical and the vector clauses respectively.
 * Values can be overridden at runtime through setWeights().
 */
private $weights = [
    'fuzzy' => [
        'bold_single' => 50,
        'bold_multi' => 10,
        'title.pali.text' => 3,
        'title.zh' => 3,
        'summary.text' => 2,
        'content.pali.text' => 1,
        'content.zh' => 1,
    ],
    'hybrid' => [
        'fuzzy_ratio' => 0.7,
        'semantic_ratio' => 0.3,
        'bold_single' => 50,
        'bold_multi' => 10,
        'title.pali.text' => 3,
        'title.zh' => 3,
        'summary.text' => 2,
        'content.pali.text' => 1,
        'content.zh' => 1,
    ],
];
/** k-NN vector mapping shared by the title, summary and content vectors. */
private const KNN_VECTOR_MAPPING = [
    'type' => 'knn_vector',
    'dimension' => 1536,
    'method' => [
        'name' => 'hnsw',
        'space_type' => 'cosinesimil',
        'engine' => 'nmslib',
    ],
];

/** Pali text mapping: `text` sub-field for fuzzy search, `exact` for exact search. */
private const PALI_TEXT_MAPPING = [
    'type' => 'text',
    'fields' => [
        // Fuzzy search: synonyms are applied at query time.
        'text' => [
            'type' => 'text',
            'analyzer' => 'pali_index_analyzer',
            'search_analyzer' => 'pali_query_analyzer',
        ],
        // Exact search: markdown stripped, lowercased, no synonyms.
        'exact' => [
            'type' => 'text',
            'analyzer' => 'markdown_clean',
        ],
    ],
];

/** Chinese text mapping (traditional characters are converted to simplified). */
private const ZH_TEXT_MAPPING = [
    'type' => 'text',
    'analyzer' => 'zh_index_analyzer',
    'search_analyzer' => 'zh_query_analyzer',
];

/** Completion field used for auto-suggest. */
private const SUGGEST_MAPPING = [
    'type' => 'completion',
    'analyzer' => 'pali_suggest_analyzer',
];

/** Mapping shared by the bold_single / bold_multi fields. */
private const BOLD_TEXT_MAPPING = [
    'type' => 'text',
    'analyzer' => 'standard',
    'search_analyzer' => 'pali_query_analyzer',
];

/**
 * Index body used by createIndex() and updateIndex(): analysis settings
 * followed by the field mappings.
 */
private $indexDefinition = [
    'settings' => [
        'index' => [
            'knn' => true,
        ],
        'analysis' => [
            'analyzer' => [
                'pali_query_analyzer' => [
                    'tokenizer' => 'standard',
                    'filter' => ['lowercase', 'pali_synonyms'],
                ],
                'pali_index_analyzer' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'char_filter' => ['markdown_strip'],
                    'filter' => ['lowercase'],
                ],
                'markdown_clean' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'char_filter' => ['markdown_strip'],
                    'filter' => ['lowercase'],
                ],
                // Suggest only: ignores case and diacritics.
                'pali_suggest_analyzer' => [
                    'tokenizer' => 'standard',
                    'filter' => ['lowercase', 'asciifolding'],
                ],
                // Unify Chinese scripts (traditional -> simplified).
                'zh_index_analyzer' => [
                    'tokenizer' => 'ik_max_word',
                    'char_filter' => ['tsconvert'],
                ],
                'zh_query_analyzer' => [
                    'tokenizer' => 'ik_smart',
                    'char_filter' => ['tsconvert'],
                ],
            ],
            'filter' => [
                'pali_synonyms' => [
                    'type' => 'synonym_graph',
                    'synonyms_path' => 'analysis/pali_synonyms.txt',
                ],
            ],
            'char_filter' => [
                'markdown_strip' => [
                    'type' => 'pattern_replace',
                    'pattern' => '\\*\\*|\\*|_|`|~',
                    'replacement' => '',
                ],
                'tsconvert' => [
                    'type' => 'stconvert',
                    'convert_type' => 't2s',
                ],
            ],
        ],
    ],
    'mappings' => [
        'properties' => [
            'id' => ['type' => 'keyword'],
            'resource_id' => ['type' => 'keyword'],
            'resource_type' => ['type' => 'keyword'],
            'title' => [
                'properties' => [
                    'pali' => self::PALI_TEXT_MAPPING,
                    'zh' => self::ZH_TEXT_MAPPING,
                    'vector' => self::KNN_VECTOR_MAPPING,
                    // Auto-suggest field.
                    'suggest' => self::SUGGEST_MAPPING,
                ],
            ],
            // Simplified Chinese summary, LLM-generated.
            'summary' => [
                'properties' => [
                    'text' => self::ZH_TEXT_MAPPING,
                    'vector' => self::KNN_VECTOR_MAPPING,
                ],
            ],
            'content' => [
                'properties' => [
                    'pali' => self::PALI_TEXT_MAPPING,
                    'zh' => self::ZH_TEXT_MAPPING,
                    'tokens' => [
                        'type' => 'nested',
                        'properties' => [
                            'surface' => ['type' => 'keyword'],
                            'lemma' => ['type' => 'keyword'],
                            'compound_parts' => ['type' => 'keyword'],
                            'case' => ['type' => 'keyword'],
                        ],
                    ],
                    'vector' => self::KNN_VECTOR_MAPPING,
                    'suggest' => self::SUGGEST_MAPPING,
                ],
            ],
            'related_id' => ['type' => 'keyword'],
            'bold_single' => self::BOLD_TEXT_MAPPING,
            'bold_multi' => self::BOLD_TEXT_MAPPING,
            'path' => ['type' => 'text', 'analyzer' => 'standard'],
            'page_refs' => [
                'type' => 'keyword',
            ],
            'tags' => ['type' => 'keyword'],
            'category' => ['type' => 'keyword'],
            'author' => ['type' => 'text'],
            'language' => ['type' => 'keyword'],
            'updated_at' => ['type' => 'date'],
            'granularity' => ['type' => 'keyword'],
            'metadata' => [
                'properties' => [
                    'APA' => ['type' => 'text', 'index' => false],
                    'MLA' => ['type' => 'text', 'index' => false],
                    'widget' => ['type' => 'text', 'index' => false],
                    'author' => ['type' => 'text'],
                    'channel' => ['type' => 'text'],
                ],
            ],
        ],
    ],
];
/**
 * Build the OpenSearch client from the `mint.opensearch.config` settings
 * and prepare the Guzzle client used for OpenAI embedding requests.
 */
public function __construct()
{
    $settings = config('mint.opensearch.config');
    $endpoint = $settings['scheme'] . '://' . $settings['host'] . ':' . $settings['port'];

    $factory = new GuzzleClientFactory();
    $this->client = $factory->create([
        'base_uri' => $endpoint,
        'auth' => [$settings['username'], $settings['password']],
        'verify' => $settings['ssl_verification'],
    ]);

    // NOTE(review): in Laravel, env() called outside config files returns
    // null once the configuration is cached - consider reading this key
    // through a config entry instead.
    $this->openaiApiKey = env('OPENAI_API_KEY');

    $openAiOptions = [
        'base_uri' => 'https://api.openai.com/v1/',
        'timeout' => 15,
    ];
    $this->http = new Client($openAiOptions);
}
/**
 * Override some of the default weights of a search mode.
 *
 * Unknown modes are ignored; for a known mode the given entries are
 * merged over the current ones.
 *
 * @param string $mode    Key of the weight set ('fuzzy' or 'hybrid' by default)
 * @param array  $weights Entries to merge into that weight set
 */
public function setWeights(string $mode, array $weights)
{
    if (!isset($this->weights[$mode])) {
        return;
    }

    $current = $this->weights[$mode];
    $this->weights[$mode] = array_merge($current, $weights);
}
/**
 * Check that the OpenSearch cluster is reachable.
 *
 * The outcome is logged and returned instead of being thrown.
 *
 * @return array Pair of [bool success, string message]
 */
public function testConnection()
{
    try {
        $serverInfo = $this->client->info();
        $versionJson = json_encode($serverInfo['version']['number']);
        $message = "OpenSearch 连接成功: {$versionJson}";
        Log::info($message);

        return [true, $message];
    } catch (\Exception $error) {
        $message = "OpenSearch 连接失败: {$error->getMessage()}";
        Log::error($message);

        return [false, $message];
    }
}
/**
 * Create the index named by `mint.opensearch.index` using the class's
 * index definition (settings and mappings).
 *
 * @return mixed Response of the indices create call
 * @throws \Exception When the index already exists
 */
public function createIndex()
{
    $index = config('mint.opensearch.index');

    $alreadyThere = $this->client->indices()->exists(['index' => $index]);
    if ($alreadyThere) {
        throw new \Exception("Index [$index] already exists.");
    }

    $request = [
        'index' => $index,
        'body' => $this->indexDefinition,
    ];

    return $this->client->indices()->create($request);
}
/**
 * Push the class's index definition to the existing index.
 *
 * Settings are applied while the index is closed; the index is reopened
 * even when the settings update fails, so a rejected update does not
 * leave the index closed. Mappings are applied afterwards on the open
 * index.
 *
 * @return array Responses keyed by 'settings' and/or 'mappings'
 */
public function updateIndex()
{
    $index = config('mint.opensearch.index');
    $settings = $this->indexDefinition['settings'] ?? [];
    $mappings = $this->indexDefinition['mappings'] ?? [];
    $response = [];

    if (!empty($settings)) {
        $this->client->indices()->close(['index' => $index]);
        try {
            $response['settings'] = $this->client->indices()->putSettings([
                'index' => $index,
                'body' => ['settings' => $settings],
            ]);
        } finally {
            // Reopen unconditionally: without this, an exception from
            // putSettings() would leave the index closed.
            $this->client->indices()->open(['index' => $index]);
        }
    }

    if (!empty($mappings)) {
        $response['mappings'] = $this->client->indices()->putMapping([
            'index' => $index,
            'body' => $mappings,
        ]);
    }

    return $response;
}
/**
 * Delete the index named by `mint.opensearch.index`.
 *
 * @return mixed Response of the indices delete call
 */
public function deleteIndex()
{
    $target = ['index' => config('mint.opensearch.index')];

    return $this->client->indices()->delete($target);
}
/**
 * Index a document under the given id in the configured index.
 *
 * @param string $id   Document id
 * @param array  $body Document body
 * @return mixed Response of the index call
 */
public function create(string $id, array $body)
{
    $request = [
        'index' => config('mint.opensearch.index'),
        'id' => $id,
        'body' => $body,
    ];

    return $this->client->index($request);
}
/**
 * Delete the document with the given id from the configured index.
 *
 * @param mixed $id Document id
 * @return mixed Response of the delete call
 */
public function delete($id)
{
    $request = [
        'index' => config('mint.opensearch.index'),
        'id' => $id,
    ];

    return $this->client->delete($request);
}
/**
 * Run a search in one of four modes: fuzzy, exact, semantic or hybrid.
 *
 * @param array $params Search parameters:
 *   - query: search text (required)
 *   - searchMode: fuzzy|exact|semantic|hybrid, defaults to fuzzy
 *   - page: 1-based page number, defaults to 1 (values below 1 are treated as 1)
 *   - pageSize: hits per page, defaults to 20 (negative values are treated as 0)
 *   - resourceType / resourceId / granularity / language / category /
 *     tags / pageRefs / relatedId / author / channel: optional filters
 * @return mixed Search response returned by the OpenSearch client
 *
 * @throws \InvalidArgumentException When `query` is missing or not a scalar
 * @throws \Exception
 */
public function search(array $params)
{
    // Fail with a clear message instead of a TypeError raised by the
    // string-typed query builders further down.
    if (!isset($params['query']) || !is_scalar($params['query'])) {
        throw new \InvalidArgumentException('Search parameter "query" is required and must be a string.');
    }
    $queryText = (string) $params['query'];

    // Pagination: request input is often a string, and page <= 0 would
    // yield a negative `from`, which OpenSearch rejects.
    $page = max(1, (int) ($params['page'] ?? 1));
    $pageSize = max(0, (int) ($params['pageSize'] ?? 20));
    $from = ($page - 1) * $pageSize;

    $filters = $this->buildFilters($params);

    // Unknown modes fall back to fuzzy.
    $mode = $params['searchMode'] ?? 'fuzzy';
    switch ($mode) {
        case 'exact':
            $query = $this->buildExactQuery($queryText);
            break;
        case 'semantic':
            $query = $this->buildSemanticQuery($queryText);
            break;
        case 'hybrid':
            $query = $this->buildHybridQuery($queryText);
            break;
        case 'fuzzy':
        default:
            $query = $this->buildFuzzyQuery($queryText);
    }

    $dsl = [
        'from' => $from,
        'size' => $pageSize,
        '_source' => [
            'excludes' => $this->sourceExcludes,
        ],
        // Filters narrow the result set without contributing to the score.
        'query' => !empty($filters)
            ? ['bool' => ['must' => [$query], 'filter' => $filters]]
            : $query,
        'aggs' => [
            'resource_type' => ['terms' => ['field' => 'resource_type']],
            'language' => ['terms' => ['field' => 'language']],
            'category' => ['terms' => ['field' => 'category']],
            'granularity' => ['terms' => ['field' => 'granularity']],
        ],
        'highlight' => [
            'fields' => [
                // stdClass encodes as `{}` (an empty array would encode as `[]`).
                'title.pali.text' => new \stdClass(),
                'title.zh' => new \stdClass(),
                'summary.text' => new \stdClass(),
                'content.pali.text' => new \stdClass(),
                'content.zh' => new \stdClass(),
            ],
            "fragmenter" => "sentence",
            "fragment_size" => 200,
            "number_of_fragments" => 1,
            'pre_tags' => ['_'],
            'post_tags' => ['_'],
        ],
    ];
    Log::debug('search', ['dsl' => json_encode($dsl, JSON_UNESCAPED_UNICODE)]);

    return $this->client->search([
        'index' => config('mint.opensearch.index'),
        'body' => $dsl,
    ]);
}

/**
 * Translate the optional filter parameters of search() into filter clauses.
 *
 * @param array $params Same parameter array that search() receives
 * @return array List of query clauses for the bool `filter` section
 */
private function buildFilters(array $params): array
{
    $filters = [];

    // Single-value filters on keyword fields.
    $termFields = [
        'resourceType' => 'resource_type',
        'resourceId' => 'resource_id',
        'granularity' => 'granularity',
        'language' => 'language',
        'category' => 'category',
        'relatedId' => 'related_id',
    ];
    foreach ($termFields as $param => $field) {
        if (!empty($params[$param])) {
            $filters[] = ['term' => [$field => $params[$param]]];
        }
    }

    // Multi-value filters; `terms` needs a JSON list, so a single scalar
    // is wrapped and array keys are dropped.
    $termsFields = [
        'tags' => 'tags',
        'pageRefs' => 'page_refs',
    ];
    foreach ($termsFields as $param => $field) {
        if (!empty($params[$param])) {
            $filters[] = ['terms' => [$field => array_values((array) $params[$param])]];
        }
    }

    // metadata.author and metadata.channel are mapped as analysed `text`,
    // so they need full-text queries. A `term` query compares the raw
    // input against analysed tokens and misses any value that the
    // analyser lowercases or splits; `match_phrase` requires every token
    // of the channel value, in order.
    if (!empty($params['author'])) {
        $filters[] = ['match' => ['metadata.author' => $params['author']]];
    }
    if (!empty($params['channel'])) {
        $filters[] = ['match_phrase' => ['metadata.channel' => $params['channel']]];
    }

    return $filters;
}
/**
 * Build the query for `exact` mode.
 *
 * Searches title.pali.exact, content.pali.exact and summary.text with a
 * `best_fields` multi_match.
 * NOTE(review): `best_fields` matches individual terms, not the whole
 * phrase - confirm this is the intended meaning of "exact".
 *
 * @param string $query Search text
 * @return array Query clause
 */
protected function buildExactQuery(string $query): array
{
    $exactFields = [
        'title.pali.exact',
        'content.pali.exact',
        'summary.text',
    ];

    $clause = [
        'query' => $query,
        'fields' => $exactFields,
        'type' => 'best_fields',
    ];

    return ['multi_match' => $clause];
}
/**
 * Build the query for `semantic` mode.
 *
 * The text is embedded once via embedText() and the resulting vector is
 * searched against the three vector fields; the knn clauses are combined
 * in a bool `should` with at least one required to match.
 *
 * @param string $query Search text
 * @return array Query clause
 */
protected function buildSemanticQuery(string $query): array
{
    $embedding = $this->embedText($query);

    // Vector field => number of neighbours (k) requested for it.
    $neighbours = [
        'content.vector' => 20,
        'summary.vector' => 10,
        'title.vector' => 5,
    ];

    $clauses = [];
    foreach ($neighbours as $field => $k) {
        $clauses[] = [
            'knn' => [
                $field => [
                    'vector' => $embedding,
                    'k' => $k,
                ],
            ],
        ];
    }

    return [
        'bool' => [
            'should' => $clauses,
            'minimum_should_match' => 1,
        ],
    ];
}
/**
 * Build the query for `fuzzy` mode: a `best_fields` multi_match over the
 * fields of the 'fuzzy' weight set, each boosted as `field^weight`.
 *
 * @param string $query Search text
 * @return array Query clause
 */
protected function buildFuzzyQuery(string $query)
{
    $weights = $this->weights['fuzzy'];

    $boostedFields = array_map(
        function ($field, $weight) {
            return $field . "^" . $weight;
        },
        array_keys($weights),
        $weights
    );

    $clause = [
        'query' => $query,
        'fields' => $boostedFields,
        'type' => 'best_fields',
    ];

    return ['multi_match' => $clause];
}
/**
 * Build the query for `hybrid` mode (lexical + vector).
 *
 * A multi_match over the 'hybrid' weight fields is combined in a bool
 * `should` with knn clauses on the content, summary and title vectors.
 * NOTE(review): the multi_match is wrapped in `constant_score`, so every
 * lexical hit scores exactly `fuzzy_ratio` and the per-field boosts do
 * not influence ranking - confirm that this is intended.
 *
 * @param string $query Search text
 * @return array Query clause
 */
protected function buildHybridQuery(string $query)
{
    $hybrid = $this->weights['hybrid'];

    // The two ratio entries are not field names; leave them out of the
    // multi_match field list.
    $boostedFields = [];
    foreach ($hybrid as $name => $boost) {
        if (!in_array($name, ['fuzzy_ratio', 'semantic_ratio'])) {
            $boostedFields[] = $name . "^" . $boost;
        }
    }

    $lexical = [
        'multi_match' => [
            'query' => $query,
            'fields' => $boostedFields,
            'type' => 'best_fields',
        ],
    ];

    $embedding = $this->embedText($query);
    $lexicalBoost = $this->weights['hybrid']['fuzzy_ratio'];
    $vectorBoost = $this->weights['hybrid']['semantic_ratio'];

    // Lexical part first, contributing a fixed score.
    $clauses = [
        [
            'constant_score' => [
                'filter' => $lexical,
                'boost' => $lexicalBoost,
            ],
        ],
    ];

    // Vector field => [k, multiplier applied to the semantic ratio].
    // Content carries the base weight, summary slightly less, title
    // slightly more.
    $vectorFields = [
        'content.vector' => [20, 1.0],
        'summary.vector' => [10, 0.8],
        'title.vector' => [5, 1.2],
    ];
    foreach ($vectorFields as $field => $spec) {
        $clauses[] = [
            'knn' => [
                $field => [
                    'vector' => $embedding,
                    'k' => $spec[0],
                    'boost' => $vectorBoost * $spec[1],
                ],
            ],
        ];
    }

    return [
        'bool' => [
            'should' => $clauses,
        ],
    ];
}
/**
 * Get the embedding of a text from the OpenAI embeddings endpoint.
 *
 * Results are stored through the Cache facade for 7 days under
 * "embedding:" + md5(text), so a repeated text does not trigger a new
 * API call.
 *
 * @param string $text Input text
 * @return array Embedding vector
 * @throws \Exception When no API key is configured or the response has no embedding
 */
protected function embedText(string $text): array
{
    if (!$this->openaiApiKey) {
        throw new Exception("请在 .env 设置 OPENAI_API_KEY");
    }

    $cacheKey = "embedding:" . md5($text);
    $expiresAt = now()->addDays(7);

    // Only runs on a cache miss.
    $fetchEmbedding = function () use ($text) {
        $headers = [
            'Authorization' => 'Bearer ' . $this->openaiApiKey,
            'Content-Type' => 'application/json',
        ];
        $payload = [
            'model' => 'text-embedding-3-small',
            'input' => $text,
        ];

        $response = $this->http->post('embeddings', [
            'headers' => $headers,
            'json' => $payload,
        ]);

        $decoded = json_decode((string)$response->getBody(), true);
        $embedding = $decoded['data'][0]['embedding'] ?? null;
        if (empty($embedding)) {
            throw new Exception("OpenAI embedding 返回异常: " . json_encode($decoded));
        }

        return $embedding;
    };

    return Cache::remember($cacheKey, $expiresAt, $fetchEmbedding);
}
/**
 * Remove the cached embedding of one text.
 *
 * Usage:
 *   $service = app(App\Services\OpenSearchService::class);
 *   // drop the cache entry of a single text
 *   $service->clearEmbeddingCache("sabbe dhammā anattā");
 *   // drop every embedding cache entry
 *   $count = $service->clearAllEmbeddingCache();
 *   echo "已清理缓存 {$count} 条";
 *
 * @param string $text Text whose cached embedding should be removed
 * @return bool Result of Cache::forget()
 */
public function clearEmbeddingCache(string $text): bool
{
    $digest = md5($text);

    return Cache::forget("embedding:" . $digest);
}
/**
 * Remove every cached embedding ("embedding:*" cache entries).
 *
 * Keys are looked up on the Redis connection that the cache store itself
 * uses and with the store's key prefix, then removed through
 * Cache::forget() so the same prefixes are applied on deletion. Looking
 * them up on the default Redis connection with the bare "embedding:*"
 * pattern misses the entries whenever the cache store uses its own
 * connection or a key prefix.
 *
 * NOTE(review): KEYS scans the whole keyspace; acceptable for an
 * occasional maintenance call, consider SCAN for large databases.
 *
 * @return int Number of cache entries removed
 * @throws \Exception When the default cache store is not Redis-backed
 */
public function clearAllEmbeddingCache(): int
{
    $store = Cache::getStore();
    if (!method_exists($store, 'connection') || !method_exists($store, 'getPrefix')) {
        throw new Exception('clearAllEmbeddingCache() requires a Redis cache store.');
    }

    $marker = 'embedding:';
    $storedKeys = $store->connection()->keys($store->getPrefix() . $marker . '*');

    $removed = 0;
    foreach ((array) $storedKeys as $storedKey) {
        // Returned keys may carry the connection and cache prefixes; the
        // part from the last "embedding:" onwards is the key that
        // Cache::forget() expects (the md5 suffix cannot contain the marker).
        $offset = strrpos((string) $storedKey, $marker);
        if ($offset === false) {
            continue;
        }
        if (Cache::forget(substr((string) $storedKey, $offset))) {
            $removed++;
        }
    }

    return $removed;
}
/**
 * Auto-suggest based on the completion fields of the index.
 *
 * @param string $query Prefix typed by the user
 * @param array|string|null $fields Which suggesters to query:
 *   - null: all of them (['title', 'content'])
 *   - 'title': title.suggest only
 *   - 'content': content.suggest only
 *   - ['title', 'content']: several at once
 * @param string|null $language Optional language; sent as a `term` query.
 *   NOTE(review): a completion suggester is not restricted by the
 *   request's query, so this most likely does not filter the
 *   suggestions - a context-enabled completion mapping would be needed.
 * @param int $limit Number of suggestions per field (values below 1 are treated as 1)
 * @return array List of ['text', 'source', 'score', 'doc_id', 'doc_source'],
 *   sorted by score in descending order
 * @throws \InvalidArgumentException When none of the requested fields is known
 */
public function suggest(
    string $query,
    $fields = null,
    ?string $language = null,
    int $limit = 10
): array {
    // Public field name => completion field in the index.
    $fieldMap = [
        'title' => 'title.suggest',
        'content' => 'content.suggest',
    ];

    // Accept null (all fields), a single name, or a list of names.
    $requested = $fields === null ? array_keys($fieldMap) : (array) $fields;

    // Keep only known names; the is_string() guard keeps non-string
    // entries away from the array lookup.
    $searchFields = array_values(array_unique(array_filter(
        $requested,
        function ($field) use ($fieldMap) {
            return is_string($field) && isset($fieldMap[$field]);
        }
    )));
    if (empty($searchFields)) {
        throw new \InvalidArgumentException('Invalid fields specified for suggestion');
    }

    $size = max(1, $limit);
    $suggests = [];
    foreach ($searchFields as $field) {
        $suggests[$field . '_suggest'] = [
            'prefix' => $query,
            'completion' => [
                'field' => $fieldMap[$field],
                'size' => $size,
                'skip_duplicates' => true,
            ],
        ];
    }

    // Only the suggest section of the response is read below, so no
    // regular hits are requested.
    $dsl = [
        'size' => 0,
        'suggest' => $suggests,
    ];
    if ($language) {
        $dsl['query'] = ['term' => ['language' => $language]];
    }

    $response = $this->client->search([
        'index' => config('mint.opensearch.index'),
        'body' => $dsl,
    ]);

    // Flatten the options of every suggester, remembering where each came from.
    $results = [];
    foreach ($searchFields as $field) {
        $options = $response['suggest'][$field . '_suggest'][0]['options'] ?? [];
        foreach ($options as $opt) {
            $results[] = [
                'text' => $opt['text'] ?? '',
                'source' => $field,
                'score' => $opt['_score'] ?? 0,
                'doc_id' => $opt['_id'] ?? null,
                'doc_source' => $opt['_source'] ?? null,
            ];
        }
    }

    // Highest score first.
    usort($results, function ($a, $b) {
        return $b['score'] <=> $a['score'];
    });

    return $results;
}
- }
|