OpenSearchService.php 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084
  1. <?php
  2. // api-v8/app/Services/OpenSearchService.php
  3. namespace App\Services;
  4. use OpenSearch\GuzzleClientFactory;
  5. use Illuminate\Support\Facades\Log;
  6. use GuzzleHttp\Client;
  7. use Illuminate\Support\Facades\Cache;
  8. use Exception;
  9. class OpenSearchService
  10. {
  /** @var mixed OpenSearch client created by GuzzleClientFactory in the constructor. */
  protected $client;
  /** @var \GuzzleHttp\Client HTTP client whose base URI is the OpenAI v1 API (set in the constructor). */
  protected $http;
  /** @var string|null Value of the OPENAI_API_KEY environment variable, read in the constructor. */
  protected $openaiApiKey;
  /**
   * Fields excluded from `_source` by default in search() responses.
   *
   * search() merges any caller-supplied `excludes` on top of this list.
   *
   * @var array
   */
  private $sourceExcludes = [
      'title.suggest.pali',
      'title.suggest.zh',
      'content.suggest.pali',
      'content.suggest.zh',
      'content.display', // added later: list pages do not return the HTML
  ];
  /**
   * Default weight configuration.
   *
   * Holds the per-field boosts for the 'fuzzy' and 'hybrid' search modes.
   * The 'hybrid' entry additionally carries fuzzy_ratio / semantic_ratio, which
   * control how the lexical and the vector scores are mixed.
   *
   * Field names follow the current mapping layout:
   *   title.text.pali   (formerly title.pali.text)
   *   title.text.zh     (formerly title.zh)
   *   content.text.pali (formerly content.pali.text)
   *   content.text.zh   (formerly content.zh)
   *
   * Individual entries can be overridden at runtime through setWeights().
   *
   * @var array
   */
  private $weights = [
      'fuzzy' => [
          'bold_single' => 50,
          'bold_multi' => 10,
          'title.text.pali' => 3,
          'title.text.zh' => 3,
          'summary.text' => 2,
          'content.text.pali' => 1,
          'content.text.zh' => 1,
      ],
      'hybrid' => [
          'fuzzy_ratio' => 0.7,
          'semantic_ratio' => 0.3,
          'bold_single' => 50,
          'bold_multi' => 10,
          'title.text.pali' => 3,
          'title.text.zh' => 3,
          'summary.text' => 2,
          'content.text.pali' => 1,
          'content.text.zh' => 1,
      ],
  ];
  /**
   * Mapping of a 1536-dimension k-NN vector field (HNSW, cosine similarity, nmslib engine).
   * Shared by title.vector, summary.vector and content.vector.
   */
  private const KNN_VECTOR_FIELD = [
      'type' => 'knn_vector',
      'dimension' => 1536,
      'method' => [
          'name' => 'hnsw',
          'space_type' => 'cosinesimil',
          'engine' => 'nmslib',
      ],
  ];

  /**
   * Mapping of a Chinese text field: zh_index_analyzer at index time,
   * zh_query_analyzer at search time. Used for *.text.zh and summary.text.
   */
  private const ZH_TEXT_FIELD = [
      'type' => 'text',
      'analyzer' => 'zh_index_analyzer',
      'search_analyzer' => 'zh_query_analyzer',
  ];

  /**
   * Mapping of the "text" object shared by title and content:
   * a Pali text field (with an "exact" sub-field) plus a Chinese text field.
   */
  private const BILINGUAL_TEXT_OBJECT = [
      'properties' => [
          'pali' => [
              'type' => 'text',
              'analyzer' => 'pali_index_analyzer',
              'search_analyzer' => 'pali_query_analyzer',
              'fields' => [
                  'exact' => [
                      'type' => 'text',
                      'analyzer' => 'markdown_clean',
                  ],
              ],
          ],
          'zh' => self::ZH_TEXT_FIELD,
      ],
  ];

  /**
   * Mapping of the "suggest" object shared by title and content:
   * one completion field per language.
   */
  private const BILINGUAL_SUGGEST_OBJECT = [
      'properties' => [
          'pali' => [
              'type' => 'completion',
              'analyzer' => 'pali_suggest_analyzer',
          ],
          'zh' => [
              'type' => 'completion',
              'analyzer' => 'zh_suggest_analyzer',
          ],
      ],
  ];

  /**
   * OpenSearch index definition (settings + mappings).
   *
   * Field layout:
   *
   *   title
   *     text.pali     (text)        fuzzy search, plus an "exact" sub-field
   *     text.zh       (text)        Chinese tokenised search
   *     vector        (knn_vector, dim=1536)
   *     suggest.pali  (completion)
   *     suggest.zh    (completion)
   *
   *   content (same layout as title, plus "tokens" and "display")
   *     text.pali / text.zh
   *     tokens        (nested)      lexical analysis results
   *     vector        (knn_vector, dim=1536)
   *     suggest.pali / suggest.zh
   *     display       (text, not indexed) raw HTML for the front end
   *
   *   summary (Chinese summary)
   *     text          (text)
   *     vector        (knn_vector, dim=1536)
   *
   * The repeated sub-mappings live in the private constants declared above.
   *
   * @var array
   */
  private $indexDefinition = [
      'settings' => [
          'index' => [
              'knn' => true,
          ],
          'analysis' => [
              'analyzer' => [
                  'pali_query_analyzer' => [
                      'tokenizer' => 'standard',
                      'filter' => ['lowercase', 'pali_synonyms'],
                  ],
                  'pali_index_analyzer' => [
                      'type' => 'custom',
                      'tokenizer' => 'standard',
                      'char_filter' => ['markdown_strip'],
                      'filter' => ['lowercase'],
                  ],
                  'markdown_clean' => [
                      'type' => 'custom',
                      'tokenizer' => 'standard',
                      'char_filter' => ['markdown_strip'],
                      'filter' => ['lowercase'],
                  ],
                  // Suggest only: case-insensitive and diacritic-insensitive.
                  'pali_suggest_analyzer' => [
                      'tokenizer' => 'standard',
                      'filter' => ['lowercase', 'asciifolding'],
                  ],
                  'zh_suggest_analyzer' => [
                      'tokenizer' => 'ik_max_word',
                      'char_filter' => ['tsconvert'],
                  ],
                  // Chinese: unify Traditional and Simplified (Traditional -> Simplified).
                  'zh_index_analyzer' => [
                      'tokenizer' => 'ik_max_word',
                      'char_filter' => ['tsconvert'],
                  ],
                  'zh_query_analyzer' => [
                      'tokenizer' => 'ik_smart',
                      'char_filter' => ['tsconvert'],
                  ],
              ],
              'filter' => [
                  'pali_synonyms' => [
                      'type' => 'synonym_graph',
                      'synonyms_path' => 'analysis/pali_synonyms.txt',
                      'updateable' => true,
                  ],
              ],
              'char_filter' => [
                  'markdown_strip' => [
                      'type' => 'pattern_replace',
                      'pattern' => '\\*\\*|\\*|_|`|~',
                      'replacement' => '',
                  ],
                  'tsconvert' => [
                      'type' => 'stconvert',
                      'convert_type' => 't2s',
                  ],
              ],
          ],
      ],
      'mappings' => [
          'properties' => [
              'id' => ['type' => 'keyword'],
              'resource_id' => ['type' => 'keyword'],
              'resource_type' => ['type' => 'keyword'],

              // title: text.pali / text.zh for lexical search, vector for semantic
              // search, suggest.pali / suggest.zh for auto-completion.
              'title' => [
                  'properties' => [
                      'text' => self::BILINGUAL_TEXT_OBJECT,
                      'vector' => self::KNN_VECTOR_FIELD,
                      'suggest' => self::BILINGUAL_SUGGEST_OBJECT,
                  ],
              ],

              // summary (LLM-generated Simplified Chinese summary):
              // text for Chinese search, vector for semantic search.
              'summary' => [
                  'properties' => [
                      'text' => self::ZH_TEXT_FIELD,
                      'vector' => self::KNN_VECTOR_FIELD,
                  ],
              ],

              // content: same layout as title, plus the nested "tokens" field
              // (lexical analysis results) and the non-indexed "display" field.
              'content' => [
                  'properties' => [
                      'text' => self::BILINGUAL_TEXT_OBJECT,
                      'tokens' => [
                          'type' => 'nested',
                          'properties' => [
                              'surface' => ['type' => 'keyword'],
                              'lemma' => ['type' => 'keyword'],
                              'compound_parts' => ['type' => 'keyword'],
                              'case' => ['type' => 'keyword'],
                          ],
                      ],
                      'vector' => self::KNN_VECTOR_FIELD,
                      'suggest' => self::BILINGUAL_SUGGEST_OBJECT,
                      // Raw HTML for front-end display; not indexed.
                      'display' => [
                          'type' => 'text',
                          'index' => false,
                      ],
                  ],
              ],

              'related_id' => ['type' => 'keyword'],
              'bold_single' => [
                  'type' => 'text',
                  'analyzer' => 'standard',
                  'search_analyzer' => 'pali_query_analyzer',
              ],
              'bold_multi' => [
                  'type' => 'text',
                  'analyzer' => 'standard',
                  'search_analyzer' => 'pali_query_analyzer',
              ],
              'path' => ['type' => 'text', 'analyzer' => 'standard'],
              'page_refs' => ['type' => 'keyword'],
              'tags' => ['type' => 'keyword'],
              'category' => ['type' => 'keyword'],
              'author' => ['type' => 'text'],
              'language' => ['type' => 'keyword'],
              'updated_at' => ['type' => 'date'],
              'granularity' => ['type' => 'keyword'],
              'metadata' => [
                  'properties' => [
                      'APA' => ['type' => 'text', 'index' => false],
                      'MLA' => ['type' => 'text', 'index' => false],
                      'widget' => ['type' => 'text', 'index' => false],
                      'author' => ['type' => 'text'],
                      'channel' => ['type' => 'text'],
                  ],
              ],
          ],
      ],
  ];
  /**
   * Build the service.
   *
   * Reads the connection settings from config('mint.opensearch.config') to create
   * the OpenSearch client, then prepares a Guzzle client for OpenAI embedding calls.
   *
   * NOTE(review): env() is called here at runtime. Laravel documents that env()
   * only returns values from the .env file while the configuration is not cached,
   * so OPENAI_API_KEY may be null after `php artisan config:cache` — consider
   * moving the key into a config file.
   */
  public function __construct()
  {
      $settings = config('mint.opensearch.config');

      $factory = new GuzzleClientFactory();
      $this->client = $factory->create([
          'base_uri' => "{$settings['scheme']}://{$settings['host']}:{$settings['port']}",
          'auth' => [$settings['username'], $settings['password']],
          'verify' => $settings['ssl_verification'],
      ]);

      $this->openaiApiKey = env('OPENAI_API_KEY');

      $openAiOptions = [
          'base_uri' => 'https://api.openai.com/v1/',
          'timeout' => 15,
      ];
      $this->http = new Client($openAiOptions);
  }
  /**
   * Override field weights of one search mode at runtime.
   *
   * The given pairs are merged over the current weights of that mode.
   * A mode that has no entry in $weights is ignored silently.
   *
   * @param string $mode    Search mode: 'fuzzy' or 'hybrid'.
   * @param array  $weights Weights to override, e.g. ['title.text.pali' => 5].
   * @return void
   */
  public function setWeights(string $mode, array $weights): void
  {
      if (!isset($this->weights[$mode])) {
          return;
      }

      $current = $this->weights[$mode];
      $this->weights[$mode] = array_merge($current, $weights);
  }
  /**
   * Check whether the OpenSearch cluster can be reached.
   *
   * Calls the cluster info endpoint and logs the outcome. Exceptions are caught
   * and reported through the returned tuple instead of being rethrown.
   *
   * @return array{0: bool, 1: string} [whether the call succeeded, human-readable message]
   */
  public function testConnection(): array
  {
      try {
          $version = $this->client->info()['version']['number'];
          $succeeded = true;
          $message = 'OpenSearch 连接成功: ' . json_encode($version);
          Log::info($message);
      } catch (\Exception $error) {
          $succeeded = false;
          $message = 'OpenSearch 连接失败: ' . $error->getMessage();
          Log::error($message);
      }

      return [$succeeded, $message];
  }
  /**
   * Tell whether the index named by config('mint.opensearch.index') exists.
   *
   * @return bool
   */
  public function indexExists(): bool
  {
      $request = ['index' => config('mint.opensearch.index')];

      return $this->client->indices()->exists($request);
  }
  /**
   * Create the OpenSearch index.
   *
   * The settings and mappings come from $indexDefinition. An existing index is
   * never overwritten: the method throws instead, to protect production data.
   *
   * @return array OpenSearch response.
   *
   * @throws \Exception When the index already exists.
   */
  public function createIndex(): array
  {
      $indexName = config('mint.opensearch.index');
      $indices = $this->client->indices();

      if ($indices->exists(['index' => $indexName])) {
          throw new \Exception("Index [$indexName] already exists.");
      }

      $request = [
          'index' => $indexName,
          'body' => $this->indexDefinition,
      ];

      return $this->client->indices()->create($request);
  }
  /**
   * Push the settings and mappings of $indexDefinition to the existing index.
   *
   * Settings are applied with a close -> putSettings -> open sequence. The index
   * is reopened in a `finally` block, so a rejected settings update no longer
   * leaves the index closed (and unavailable for search); the original exception
   * still propagates to the caller.
   *
   * Mappings are applied afterwards on the open index. This can add new fields;
   * the type of an existing field cannot be changed this way.
   *
   * @return array Responses keyed by 'settings' and/or 'mappings'.
   */
  public function updateIndex(): array
  {
      $index = config('mint.opensearch.index');
      $settings = $this->indexDefinition['settings'] ?? [];
      $mappings = $this->indexDefinition['mappings'] ?? [];
      $response = [];

      if (!empty($settings)) {
          $this->client->indices()->close(['index' => $index]);
          try {
              $response['settings'] = $this->client->indices()->putSettings([
                  'index' => $index,
                  'body' => ['settings' => $settings],
              ]);
          } finally {
              // Always reopen, even when putSettings() throws.
              $this->client->indices()->open(['index' => $index]);
          }
      }

      if (!empty($mappings)) {
          $response['mappings'] = $this->client->indices()->putMapping([
              'index' => $index,
              'body' => $mappings,
          ]);
      }

      return $response;
  }
  /**
   * Delete the index named by config('mint.opensearch.index').
   *
   * @return array OpenSearch response.
   */
  public function deleteIndex(): array
  {
      $request = ['index' => config('mint.opensearch.index')];

      return $this->client->indices()->delete($request);
  }
  /**
   * Count the documents in the index, optionally restricted by a query.
   *
   * @param array|null $query OpenSearch DSL query clause; null or an empty array counts everything.
   *                          Examples: ['term' => ['language' => 'zh']]
   *                                    ['exists' => ['field' => 'content.vector']]
   * @return int Number of matching documents (0 when the response has no 'count').
   *
   * @throws \Exception
   *
   * @example
   *   $service->count();
   *   $service->count(['exists' => ['field' => 'content.vector']]);
   */
  public function count(?array $query = null): int
  {
      $request = ['index' => config('mint.opensearch.index')];

      if ($query !== null && $query !== []) {
          $request['body'] = ['query' => $query];
      }

      $result = $this->client->count($request);

      return isset($result['count']) ? (int) $result['count'] : 0;
  }
  /**
   * Index a single document, replacing any document stored under the same ID.
   *
   * @param string $id   Document ID.
   * @param array  $body Document body; its fields should follow the index mappings.
   * @return array OpenSearch response.
   */
  public function create(string $id, array $body): array
  {
      $request = [
          'index' => config('mint.opensearch.index'),
          'id' => $id,
          'body' => $body,
      ];

      return $this->client->index($request);
  }
  /**
   * Delete a single document.
   *
   * @param string $id Document ID.
   * @return array OpenSearch response.
   */
  public function delete(string $id): array
  {
      $request = [
          'index' => config('mint.opensearch.index'),
          'id' => $id,
      ];

      return $this->client->delete($request);
  }
  /**
   * Run an advanced search.
   *
   * Search modes:
   *   - fuzzy    multi-field lexical query (default; also used for unknown modes)
   *   - exact    match against the "exact" sub-fields (markdown_clean analyzer)
   *   - semantic vector search; needs an OpenAI embedding
   *   - hybrid   fuzzy + semantic, mixed by fuzzy_ratio / semantic_ratio
   *
   * An empty (or whitespace-only) query runs match_all without highlighting.
   *
   * Supported filters: resourceType, resourceId, granularity, language, category,
   * tags, pageRefs, relatedId, author, channel.
   *
   * @param array $params {
   *     @type string       $query               Search text.
   *     @type string       $searchMode          Search mode, default 'fuzzy'.
   *     @type int          $page                Page number, default 1; values below 1 are treated as 1.
   *     @type int          $pageSize            Page size, default 20; negative values are treated as 0.
   *     @type array        $excludes            Extra `_source` fields to exclude, added to the defaults.
   *     @type string       $resourceType        Filter by resource type.
   *     @type string       $resourceId          Filter by resource ID.
   *     @type string       $granularity         Filter by granularity.
   *     @type string       $language            Filter by language.
   *     @type string|array $category            Filter by category; with several values ALL must match.
   *     @type array        $tags                Filter by tags (any of).
   *     @type array        $pageRefs            Filter by page references (any of).
   *     @type string       $relatedId           Filter by related ID.
   *     @type string       $author              Filter by author (analyzed match on metadata.author).
   *     @type string       $channel             Filter by channel (analyzed match on metadata.channel).
   *     @type array        $highlight_pre_tags  Highlight opening tags, default ['<mark>'].
   *     @type array        $highlight_post_tags Highlight closing tags, default ['</mark>'].
   * }
   * @return array Raw OpenSearch response.
   *
   * @throws \Exception When the embedding call fails in semantic / hybrid mode.
   */
  public function search(array $params): array
  {
      // Clamp paging so that a page of 0 or a negative value cannot produce a negative "from".
      $page = max(1, (int) ($params['page'] ?? 1));
      $pageSize = max(0, (int) ($params['pageSize'] ?? 20));
      $from = ($page - 1) * $pageSize;
      $mode = $params['searchMode'] ?? 'fuzzy';

      // _source exclusions: defaults plus optional caller-supplied fields.
      $excludes = $this->sourceExcludes;
      if (!empty($params['excludes']) && is_array($params['excludes'])) {
          $excludes = array_merge($excludes, $params['excludes']);
      }

      $filters = $this->buildSearchFilters($params);

      // ---------- query ----------
      $queryText = trim($params['query'] ?? '');
      if ($queryText === '') {
          $query = ['match_all' => new \stdClass()];
      } else {
          switch ($mode) {
              case 'exact':
                  $query = $this->buildExactQuery($queryText);
                  break;
              case 'semantic':
                  $query = $this->buildSemanticQuery($queryText);
                  break;
              case 'hybrid':
                  $query = $this->buildHybridQuery($queryText);
                  break;
              case 'fuzzy':
              default:
                  $query = $this->buildFuzzyQuery($queryText);
                  break;
          }
      }

      $highlightPreTags = $params['highlight_pre_tags'] ?? ['<mark>'];
      $highlightPostTags = $params['highlight_post_tags'] ?? ['</mark>'];

      // ---------- final DSL ----------
      $dsl = [
          'from' => $from,
          'size' => $pageSize,
          '_source' => ['excludes' => $excludes],
          'query' => !empty($filters)
              ? [
                  'bool' => [
                      'must' => [$query],
                      'filter' => $filters,
                  ],
              ]
              : $query,
          'aggs' => [
              'resource_type' => [
                  'terms' => ['field' => 'resource_type'],
              ],
              'language' => [
                  'terms' => ['field' => 'language'],
              ],
              'category' => [
                  'terms' => ['field' => 'category'],
              ],
              'granularity' => [
                  'terms' => ['field' => 'granularity'],
              ],
          ],
      ];

      // Highlighting only makes sense when there is a search text.
      if ($queryText !== '') {
          $dsl['highlight'] = [
              'fields' => [
                  'title.text.pali' => new \stdClass(),
                  'title.text.zh' => new \stdClass(),
                  'summary.text' => new \stdClass(),
                  'content.text.pali' => new \stdClass(),
                  'content.text.zh' => new \stdClass(),
              ],
              'fragmenter' => 'sentence',
              'fragment_size' => 200,
              'number_of_fragments' => 1,
              'pre_tags' => $highlightPreTags,
              'post_tags' => $highlightPostTags,
          ];
      }

      Log::debug(
          'OpenSearchService::search',
          ['dsl' => json_encode($dsl, JSON_UNESCAPED_UNICODE)]
      );

      return $this->client->search([
          'index' => config('mint.opensearch.index'),
          'body' => $dsl,
      ]);
  }

  /**
   * Translate the filter parameters of search() into OpenSearch filter clauses.
   *
   * Keyword fields are filtered with term / terms. metadata.author and
   * metadata.channel are mapped as analyzed `text`, so they are filtered with
   * `match`: a `term` filter compares against the analyzed tokens and therefore
   * never matches values containing upper-case letters or hyphens.
   *
   * @param array $params The parameter array given to search().
   * @return array List of filter clauses; empty when no filter parameter is set.
   */
  private function buildSearchFilters(array $params): array
  {
      $filters = [];

      // Single-valued keyword filters: request parameter => index field.
      $termFilters = [
          'resourceType' => 'resource_type',
          'resourceId' => 'resource_id',
          'granularity' => 'granularity',
          'language' => 'language',
      ];
      foreach ($termFilters as $param => $field) {
          if (!empty($params[$param])) {
              $filters[] = ['term' => [$field => $params[$param]]];
          }
      }

      if (!empty($params['category'])) {
          // All given categories must match: one term clause per category.
          foreach ((array) $params['category'] as $category) {
              $filters[] = ['term' => ['category' => $category]];
          }
      }

      // `terms` needs a list; a single scalar value is wrapped, keys are dropped.
      if (!empty($params['tags'])) {
          $filters[] = ['terms' => ['tags' => array_values((array) $params['tags'])]];
      }
      if (!empty($params['pageRefs'])) {
          $filters[] = ['terms' => ['page_refs' => array_values((array) $params['pageRefs'])]];
      }

      if (!empty($params['relatedId'])) {
          $filters[] = ['term' => ['related_id' => $params['relatedId']]];
      }
      if (!empty($params['author'])) {
          $filters[] = ['match' => ['metadata.author' => $params['author']]];
      }
      if (!empty($params['channel'])) {
          // operator "and": every token of the channel value has to be present.
          $filters[] = [
              'match' => [
                  'metadata.channel' => [
                      'query' => $params['channel'],
                      'operator' => 'and',
                  ],
              ],
          ];
      }

      return $filters;
  }
  /**
   * Build the query for 'exact' mode.
   *
   * Targets the "exact" sub-fields of the Pali title and content (analyzed with
   * markdown_clean) together with summary.text, as a best_fields multi_match.
   *
   * @param string $query Search text.
   * @return array OpenSearch DSL query fragment.
   */
  protected function buildExactQuery(string $query): array
  {
      $targets = [
          'title.text.pali.exact',
          'content.text.pali.exact',
          'summary.text',
      ];

      $multiMatch = [
          'query' => $query,
          'fields' => $targets,
          'type' => 'best_fields',
      ];

      return ['multi_match' => $multiMatch];
  }
  /**
   * Build the query for 'semantic' mode (vector search only).
   *
   * The text is embedded once via embedText(); the same vector is then used in
   * three knn clauses (content, summary, title) combined with bool/should, of
   * which at least one has to match.
   *
   * @param string $query Search text.
   * @return array OpenSearch DSL query fragment.
   *
   * @throws \Exception When the embedding call fails.
   */
  protected function buildSemanticQuery(string $query): array
  {
      $embedding = $this->embedText($query);

      // knn field => number of neighbours (k) requested from it.
      $neighbours = [
          'content.vector' => 20,
          'summary.vector' => 10,
          'title.vector' => 5,
      ];

      $clauses = [];
      foreach ($neighbours as $field => $k) {
          $clauses[] = ['knn' => [$field => ['vector' => $embedding, 'k' => $k]]];
      }

      return [
          'bool' => [
              'should' => $clauses,
              'minimum_should_match' => 1,
          ],
      ];
  }
  /**
   * Build the query for 'fuzzy' mode (multi-field lexical search).
   *
   * Produces a best_fields multi_match whose fields are written as
   * "field^weight", using the entries of $weights['fuzzy'].
   *
   * @param string $query Search text.
   * @return array OpenSearch DSL query fragment.
   */
  protected function buildFuzzyQuery(string $query): array
  {
      $weights = $this->weights['fuzzy'];

      $boostedFields = array_map(
          static fn($name, $boost) => $name . '^' . $boost,
          array_keys($weights),
          $weights
      );

      return [
          'multi_match' => [
              'query' => $query,
              'fields' => $boostedFields,
              'type' => 'best_fields',
          ],
      ];
  }
  /**
   * Build the query for 'hybrid' mode (lexical + semantic).
   *
   * A bool/should combines:
   *   - the fuzzy multi_match wrapped in constant_score, boosted by fuzzy_ratio;
   *   - three knn clauses boosted by semantic_ratio times a per-field factor
   *     (content 1.0, summary 0.8, title 1.2 so that title matches rank a little higher).
   *
   * Both ratios and the field weights are read from $weights['hybrid'].
   *
   * @param string $query Search text.
   * @return array OpenSearch DSL query fragment.
   *
   * @throws \Exception When the embedding call fails.
   */
  protected function buildHybridQuery(string $query): array
  {
      $ratioKeys = ['fuzzy_ratio', 'semantic_ratio'];

      // Every entry except the two ratios is a "field => weight" pair.
      $boostedFields = [];
      foreach ($this->weights['hybrid'] as $name => $boost) {
          if (!in_array($name, $ratioKeys)) {
              $boostedFields[] = $name . '^' . $boost;
          }
      }

      $lexicalClause = [
          'constant_score' => [
              'filter' => [
                  'multi_match' => [
                      'query' => $query,
                      'fields' => $boostedFields,
                      'type' => 'best_fields',
                  ],
              ],
              'boost' => null, // filled in below, once the ratios have been read
          ],
      ];

      $embedding = $this->embedText($query);
      $lexicalShare = $this->weights['hybrid']['fuzzy_ratio'];
      $semanticShare = $this->weights['hybrid']['semantic_ratio'];

      $lexicalClause['constant_score']['boost'] = $lexicalShare;
      $should = [$lexicalClause];

      // [knn field, neighbours (k), factor applied to the semantic ratio]
      $knnPlan = [
          ['content.vector', 20, 1.0],
          ['summary.vector', 10, 0.8],
          ['title.vector', 5, 1.2],
      ];
      foreach ($knnPlan as [$field, $k, $factor]) {
          $should[] = [
              'knn' => [
                  $field => [
                      'vector' => $embedding,
                      'k' => $k,
                      'boost' => $semanticShare * $factor,
                  ],
              ],
          ];
      }

      return ['bool' => ['should' => $should]];
  }
  /**
   * Turn a text into an embedding vector through the OpenAI embeddings endpoint.
   *
   * Results are cached for 7 days under "embedding:{md5(text)}", so an identical
   * text does not trigger a second API request while the entry is alive.
   *
   * @param string $text Input text.
   * @return array Embedding returned for the 'text-embedding-3-small' model.
   *
   * @throws \Exception When OPENAI_API_KEY is not set or the response has no embedding.
   */
  protected function embedText(string $text): array
  {
      if (!$this->openaiApiKey) {
          throw new Exception('请在 .env 设置 OPENAI_API_KEY');
      }

      // Only executed on a cache miss.
      $requestEmbedding = function () use ($text) {
          $options = [
              'headers' => [
                  'Authorization' => 'Bearer ' . $this->openaiApiKey,
                  'Content-Type' => 'application/json',
              ],
              'json' => [
                  'model' => 'text-embedding-3-small',
                  'input' => $text,
              ],
          ];

          $reply = $this->http->post('embeddings', $options);
          $decoded = json_decode((string) $reply->getBody(), true);

          if (empty($decoded['data'][0]['embedding'])) {
              throw new Exception('OpenAI embedding 返回异常: ' . json_encode($decoded));
          }

          return $decoded['data'][0]['embedding'];
      };

      return Cache::remember('embedding:' . md5($text), now()->addDays(7), $requestEmbedding);
  }
  /**
   * Drop the cached embedding of one text.
   *
   * @param string $text The original text, exactly as it was passed to embedText().
   * @return bool Result of Cache::forget() for the entry.
   *
   * @example
   *   $service->clearEmbeddingCache('sabbe dhammā anattā');
   */
  public function clearEmbeddingCache(string $text): bool
  {
      return Cache::forget('embedding:' . md5($text));
  }
  /**
   * Remove every cached embedding from the Redis cache store.
   *
   * Requires the default cache store to be Redis. The lookup runs on the Redis
   * connection of the cache store and includes the cache key prefix; looking for
   * a bare "embedding:*" on the default Redis connection does not see the entries
   * written by Cache::remember() when the cache uses its own connection/prefix.
   *
   * Uses the Redis KEYS command, which scans the whole keyspace — call with care
   * in production.
   *
   * @return int Number of cache entries that were found and deleted.
   *
   * @example
   *   $count = $service->clearAllEmbeddingCache();
   *   echo "removed {$count} cached embeddings";
   */
  public function clearAllEmbeddingCache(): int
  {
      $store = Cache::getStore();
      $connection = $store->connection();

      $keys = $connection->keys($store->getPrefix() . 'embedding:*');
      if (empty($keys)) {
          return 0;
      }

      // The Redis client prepends the connection-level prefix to every command
      // argument, but KEYS returns names that already contain it. Strip it so
      // DEL does not receive doubly-prefixed names.
      // NOTE(review): assumes the prefix is configured under
      // database.redis.options.prefix (Laravel's default location) — confirm.
      $connectionPrefix = (string) config('database.redis.options.prefix', '');
      if ($connectionPrefix !== '') {
          $prefixLength = strlen($connectionPrefix);
          $keys = array_map(
              static fn($key) => strpos($key, $connectionPrefix) === 0
                  ? substr($key, $prefixLength)
                  : $key,
              $keys
          );
      }

      $connection->del($keys);

      return count($keys);
  }
  /**
   * Auto-completion based on the completion fields of the index.
   *
   * Runs one completion suggester per requested field and returns all options
   * sorted by score, highest first. Duplicates are skipped within a field
   * (skip_duplicates); the same text can still be returned once per field.
   *
   * Field identifiers accepted in $fields:
   *   - 'title_pali'   => title.suggest.pali
   *   - 'title_zh'     => title.suggest.zh
   *   - 'content_pali' => content.suggest.pali
   *   - 'content_zh'   => content.suggest.zh
   *
   * Unknown identifiers are dropped; an exception is raised only when none of
   * the requested identifiers is valid.
   *
   * Language filtering: a completion suggester does not take the request's
   * `query` clause into account, so $language is applied to the returned options
   * instead, using the `language` value of each option's document. As a result
   * fewer than $limit options per field may remain.
   *
   * @param string            $query    Prefix to complete.
   * @param array|string|null $fields   Field identifier(s); null means all fields.
   * @param string|null       $language Keep only options whose document has this language.
   * @param int               $limit    Options requested per field, default 10 (minimum 1).
   * @return array List of options, each with: text, source (field identifier),
   *               score, doc_id, doc_source.
   *
   * @throws \InvalidArgumentException When $fields contains no valid identifier.
   *
   * @example
   *   // all fields
   *   $service->suggest('nibb');
   *
   *   // Pali titles only
   *   $service->suggest('nibb', 'title_pali');
   *
   *   // several fields, restricted to one language
   *   $service->suggest('涅', ['title_zh', 'content_zh'], 'zh', 5);
   */
  public function suggest(
      string $query,
      $fields = null,
      ?string $language = null,
      int $limit = 10
  ): array {
      // Field identifier => completion field path.
      $fieldMap = [
          'title_pali' => 'title.suggest.pali',
          'title_zh' => 'title.suggest.zh',
          'content_pali' => 'content.suggest.pali',
          'content_zh' => 'content.suggest.zh',
      ];

      $requested = $fields === null ? array_keys($fieldMap) : (array) $fields;

      // Keep known identifiers only, each at most once: a repeated identifier
      // would otherwise make its options appear several times in the result.
      $searchFields = array_values(array_unique(array_filter(
          $requested,
          static fn($field) => is_string($field) && isset($fieldMap[$field])
      )));
      if (empty($searchFields)) {
          throw new \InvalidArgumentException('Invalid fields specified for suggestion');
      }

      $size = max(1, $limit);

      $suggests = [];
      foreach ($searchFields as $field) {
          $suggests[$field . '_suggest'] = [
              'prefix' => $query,
              'completion' => [
                  'field' => $fieldMap[$field],
                  'size' => $size,
                  'skip_duplicates' => true,
              ],
          ];
      }

      $response = $this->client->search([
          'index' => config('mint.opensearch.index'),
          'body' => [
              // Only the suggest section is read below; do not fetch search hits.
              'size' => 0,
              'suggest' => $suggests,
          ],
      ]);

      $filterByLanguage = $language !== null && $language !== '';

      // Flatten the options and tag each one with the field it came from.
      $results = [];
      foreach ($searchFields as $field) {
          $options = $response['suggest'][$field . '_suggest'][0]['options'] ?? [];
          foreach ($options as $opt) {
              if ($filterByLanguage) {
                  // `language` may hold one value or a list of values.
                  $docLanguages = (array) ($opt['_source']['language'] ?? []);
                  if (!in_array($language, $docLanguages, true)) {
                      continue;
                  }
              }

              $results[] = [
                  'text' => $opt['text'] ?? '',
                  'source' => $field,
                  'score' => $opt['_score'] ?? 0,
                  'doc_id' => $opt['_id'] ?? null,
                  'doc_source' => $opt['_source'] ?? null,
              ];
          }
      }

      // Highest score first.
      usort($results, static fn($a, $b) => $b['score'] <=> $a['score']);

      return $results;
  }
  /**
   * Fetch one document by ID.
   *
   * No `_source` exclusions are applied here, unlike in search().
   *
   * @param string $id Document ID, e.g. "term_{guid}".
   * @return array Raw OpenSearch response.
   */
  public function get(string $id): array
  {
      $request = [
          'index' => config('mint.opensearch.index'),
          'id' => $id,
      ];

      return $this->client->get($request);
  }
  997. }