OpenSearchService.php 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028
  1. <?php
  2. // api-v8/app/Services/OpenSearchService.php
  3. namespace App\Services;
  4. use OpenSearch\GuzzleClientFactory;
  5. use Illuminate\Support\Facades\Log;
  6. use GuzzleHttp\Client;
  7. use Illuminate\Support\Facades\Cache;
  8. use Exception;
  9. class OpenSearchService
  10. {
  11. protected $client;
  12. protected $http;
  13. protected $openaiApiKey;
  14. /**
  15. * 默认查询排除字段
  16. *
  17. * @var array
  18. */
  19. private $sourceExcludes = [
  20. 'title.suggest.pali',
  21. 'title.suggest.zh',
  22. 'content.suggest.pali',
  23. 'content.suggest.zh',
  24. 'content.display', // 新增,列表页不返回 HTML
  25. ];
  26. /**
  27. * 默认权重配置
  28. *
  29. * fuzzy / hybrid 两种模式各自的字段权重。
  30. * hybrid 额外包含 fuzzy_ratio / semantic_ratio 用于控制两路得分的混合比例。
  31. *
  32. * 字段名已按新映射结构调整:
  33. * title.text.pali → 原 title.pali.text
  34. * title.text.zh → 原 title.zh
  35. * content.text.pali → 原 content.pali.text
  36. * content.text.zh → 原 content.zh
  37. *
  38. * @var array
  39. */
  40. private $weights = [
  41. 'fuzzy' => [
  42. 'bold_single' => 50,
  43. 'bold_multi' => 10,
  44. 'title.text.pali' => 3,
  45. 'title.text.zh' => 3,
  46. 'summary.text' => 2,
  47. 'content.text.pali' => 1,
  48. 'content.text.zh' => 1,
  49. ],
  50. 'hybrid' => [
  51. 'fuzzy_ratio' => 0.7,
  52. 'semantic_ratio' => 0.3,
  53. 'bold_single' => 50,
  54. 'bold_multi' => 10,
  55. 'title.text.pali' => 3,
  56. 'title.text.zh' => 3,
  57. 'summary.text' => 2,
  58. 'content.text.pali' => 1,
  59. 'content.text.zh' => 1,
  60. ],
  61. ];
  62. /**
  63. * OpenSearch 索引定义(settings + mappings)
  64. *
  65. * 字段结构说明:
  66. *
  67. * title
  68. * ├── text
  69. * │ ├── pali (text) 模糊查询 + exact subfield 精确查询
  70. * │ └── zh (text) 中文分词查询
  71. * ├── vector (knn_vector, dim=1536)
  72. * └── suggest
  73. * ├── pali (completion)
  74. * └── zh (completion)
  75. *
  76. * content(结构与 title 一致,额外包含 tokens nested 字段)
  77. * ├── text
  78. * │ ├── pali (text)
  79. * │ └── zh (text)
  80. * ├── tokens (nested)
  81. * ├── vector (knn_vector, dim=1536)
  82. * └── suggest
  83. * ├── pali (completion)
  84. * └── zh (completion)
  85. *
  86. * summary(中文摘要,结构保持不变)
  87. * ├── text (text)
  88. * └── vector (knn_vector, dim=1536)
  89. *
  90. * @var array
  91. */
  92. private $indexDefinition = [
  93. 'settings' => [
  94. 'index' => [
  95. 'knn' => true,
  96. ],
  97. 'analysis' => [
  98. 'analyzer' => [
  99. 'pali_query_analyzer' => [
  100. 'tokenizer' => 'standard',
  101. 'filter' => ['lowercase', 'pali_synonyms'],
  102. ],
  103. 'pali_index_analyzer' => [
  104. 'type' => 'custom',
  105. 'tokenizer' => 'standard',
  106. 'char_filter' => ['markdown_strip'],
  107. 'filter' => ['lowercase'],
  108. ],
  109. 'markdown_clean' => [
  110. 'type' => 'custom',
  111. 'tokenizer' => 'standard',
  112. 'char_filter' => ['markdown_strip'],
  113. 'filter' => ['lowercase'],
  114. ],
  115. // Suggest 专用(忽略大小写 + 变音)
  116. 'pali_suggest_analyzer' => [
  117. 'tokenizer' => 'standard',
  118. 'filter' => ['lowercase', 'asciifolding'],
  119. ],
  120. 'zh_suggest_analyzer' => [
  121. 'tokenizer' => 'ik_max_word',
  122. 'char_filter' => ['tsconvert'],
  123. ],
  124. // 中文简繁统一 (繁 -> 简)
  125. 'zh_index_analyzer' => [
  126. 'tokenizer' => 'ik_max_word',
  127. 'char_filter' => ['tsconvert'],
  128. ],
  129. 'zh_query_analyzer' => [
  130. 'tokenizer' => 'ik_smart',
  131. 'char_filter' => ['tsconvert'],
  132. ],
  133. ],
  134. 'filter' => [
  135. 'pali_synonyms' => [
  136. 'type' => 'synonym_graph',
  137. 'synonyms_path' => 'analysis/pali_synonyms.txt',
  138. 'updateable' => true,
  139. ],
  140. ],
  141. 'char_filter' => [
  142. 'markdown_strip' => [
  143. 'type' => 'pattern_replace',
  144. 'pattern' => '\\*\\*|\\*|_|`|~',
  145. 'replacement' => '',
  146. ],
  147. 'tsconvert' => [
  148. 'type' => 'stconvert',
  149. 'convert_type' => 't2s',
  150. ],
  151. ],
  152. ],
  153. ],
  154. 'mappings' => [
  155. 'properties' => [
  156. 'id' => ['type' => 'keyword'],
  157. 'resource_id' => ['type' => 'keyword'],
  158. 'resource_type' => ['type' => 'keyword'],
  159. // ----------------------------------------------------------------
  160. // title
  161. // text.pali → 模糊查询(+ exact subfield 精确查询)
  162. // text.zh → 中文查询
  163. // vector → 语义向量
  164. // suggest.pali / suggest.zh → 自动建议
  165. // ----------------------------------------------------------------
  166. 'title' => [
  167. 'properties' => [
  168. 'text' => [
  169. 'properties' => [
  170. 'pali' => [
  171. 'type' => 'text',
  172. 'analyzer' => 'pali_index_analyzer',
  173. 'search_analyzer' => 'pali_query_analyzer',
  174. 'fields' => [
  175. 'exact' => [
  176. 'type' => 'text',
  177. 'analyzer' => 'markdown_clean',
  178. ],
  179. ],
  180. ],
  181. 'zh' => [
  182. 'type' => 'text',
  183. 'analyzer' => 'zh_index_analyzer',
  184. 'search_analyzer' => 'zh_query_analyzer',
  185. ],
  186. ],
  187. ],
  188. 'vector' => [
  189. 'type' => 'knn_vector',
  190. 'dimension' => 1536,
  191. 'method' => [
  192. 'name' => 'hnsw',
  193. 'space_type' => 'cosinesimil',
  194. 'engine' => 'nmslib',
  195. ],
  196. ],
  197. 'suggest' => [
  198. 'properties' => [
  199. 'pali' => [
  200. 'type' => 'completion',
  201. 'analyzer' => 'pali_suggest_analyzer',
  202. ],
  203. 'zh' => [
  204. 'type' => 'completion',
  205. 'analyzer' => 'zh_suggest_analyzer',
  206. ],
  207. ],
  208. ],
  209. ],
  210. ],
  211. // ----------------------------------------------------------------
  212. // summary(LLM 生成的简体中文摘要,结构保持不变)
  213. // text → 中文查询
  214. // vector → 语义向量
  215. // ----------------------------------------------------------------
  216. 'summary' => [
  217. 'properties' => [
  218. 'text' => [
  219. 'type' => 'text',
  220. 'analyzer' => 'zh_index_analyzer',
  221. 'search_analyzer' => 'zh_query_analyzer',
  222. ],
  223. 'vector' => [
  224. 'type' => 'knn_vector',
  225. 'dimension' => 1536,
  226. 'method' => [
  227. 'name' => 'hnsw',
  228. 'space_type' => 'cosinesimil',
  229. 'engine' => 'nmslib',
  230. ],
  231. ],
  232. ],
  233. ],
  234. // ----------------------------------------------------------------
  235. // content(结构与 title 对称,额外包含 tokens nested 字段)
  236. // text.pali → 模糊查询(+ exact subfield 精确查询)
  237. // text.zh → 中文查询
  238. // tokens → 词法分析结果(nested)
  239. // vector → 语义向量
  240. // suggest.pali / suggest.zh → 自动建议
  241. // ----------------------------------------------------------------
  242. 'content' => [
  243. 'properties' => [
  244. 'text' => [
  245. 'properties' => [
  246. 'pali' => [
  247. 'type' => 'text',
  248. 'analyzer' => 'pali_index_analyzer',
  249. 'search_analyzer' => 'pali_query_analyzer',
  250. 'fields' => [
  251. 'exact' => [
  252. 'type' => 'text',
  253. 'analyzer' => 'markdown_clean',
  254. ],
  255. ],
  256. ],
  257. 'zh' => [
  258. 'type' => 'text',
  259. 'analyzer' => 'zh_index_analyzer',
  260. 'search_analyzer' => 'zh_query_analyzer',
  261. ],
  262. ],
  263. ],
  264. 'tokens' => [
  265. 'type' => 'nested',
  266. 'properties' => [
  267. 'surface' => ['type' => 'keyword'],
  268. 'lemma' => ['type' => 'keyword'],
  269. 'compound_parts' => ['type' => 'keyword'],
  270. 'case' => ['type' => 'keyword'],
  271. ],
  272. ],
  273. 'vector' => [
  274. 'type' => 'knn_vector',
  275. 'dimension' => 1536,
  276. 'method' => [
  277. 'name' => 'hnsw',
  278. 'space_type' => 'cosinesimil',
  279. 'engine' => 'nmslib',
  280. ],
  281. ],
  282. 'suggest' => [
  283. 'properties' => [
  284. 'pali' => [
  285. 'type' => 'completion',
  286. 'analyzer' => 'pali_suggest_analyzer',
  287. ],
  288. 'zh' => [
  289. 'type' => 'completion',
  290. 'analyzer' => 'zh_suggest_analyzer',
  291. ],
  292. ],
  293. ],
  294. // 前端展示用,原始 HTML,不参与索引
  295. 'display' => [
  296. 'type' => 'text',
  297. 'index' => false,
  298. ],
  299. ],
  300. ],
  301. 'related_id' => ['type' => 'keyword'],
  302. 'bold_single' => [
  303. 'type' => 'text',
  304. 'analyzer' => 'standard',
  305. 'search_analyzer' => 'pali_query_analyzer',
  306. ],
  307. 'bold_multi' => [
  308. 'type' => 'text',
  309. 'analyzer' => 'standard',
  310. 'search_analyzer' => 'pali_query_analyzer',
  311. ],
  312. 'path' => ['type' => 'text', 'analyzer' => 'standard'],
  313. 'page_refs' => ['type' => 'keyword'],
  314. 'tags' => ['type' => 'keyword'],
  315. 'category' => ['type' => 'keyword'],
  316. 'author' => ['type' => 'text'],
  317. 'language' => ['type' => 'keyword'],
  318. 'updated_at' => ['type' => 'date'],
  319. 'granularity' => ['type' => 'keyword'],
  320. 'metadata' => [
  321. 'properties' => [
  322. 'APA' => ['type' => 'text', 'index' => false],
  323. 'MLA' => ['type' => 'text', 'index' => false],
  324. 'widget' => ['type' => 'text', 'index' => false],
  325. 'author' => ['type' => 'text'],
  326. 'channel' => ['type' => 'text'],
  327. ],
  328. ],
  329. ],
  330. ],
  331. ];
  332. /**
  333. * 创建 OpenSearchService 实例
  334. *
  335. * 从 config('mint.opensearch.config') 读取连接配置,
  336. * 同时初始化 OpenAI HTTP 客户端用于 embedding 调用。
  337. */
  338. public function __construct()
  339. {
  340. $config = config('mint.opensearch.config');
  341. $hostUrl = "{$config['scheme']}://{$config['host']}:{$config['port']}";
  342. $this->client = (new GuzzleClientFactory())->create([
  343. 'base_uri' => $hostUrl,
  344. 'auth' => [$config['username'], $config['password']],
  345. 'verify' => $config['ssl_verification'],
  346. ]);
  347. $this->openaiApiKey = env('OPENAI_API_KEY');
  348. $this->http = new Client([
  349. 'base_uri' => 'https://api.openai.com/v1/',
  350. 'timeout' => 15,
  351. ]);
  352. }
  353. /**
  354. * 动态覆盖指定搜索模式的字段权重
  355. *
  356. * @param string $mode 搜索模式,支持 'fuzzy' | 'hybrid'
  357. * @param array $weights 需要覆盖的权重键值对,例如:['title.text.pali' => 5]
  358. * @return void
  359. */
  360. public function setWeights(string $mode, array $weights): void
  361. {
  362. if (isset($this->weights[$mode])) {
  363. $this->weights[$mode] = array_merge($this->weights[$mode], $weights);
  364. }
  365. }
  366. /**
  367. * 测试与 OpenSearch 集群的连接状态
  368. *
  369. * @return array{0: bool, 1: string} [连接是否成功, 描述信息]
  370. */
  371. public function testConnection(): array
  372. {
  373. try {
  374. $info = $this->client->info();
  375. $message = 'OpenSearch 连接成功: ' . json_encode($info['version']['number']);
  376. Log::info($message);
  377. return [true, $message];
  378. } catch (\Exception $e) {
  379. $message = 'OpenSearch 连接失败: ' . $e->getMessage();
  380. Log::error($message);
  381. return [false, $message];
  382. }
  383. }
  384. /**
  385. * 检查当前索引是否已存在
  386. *
  387. * @return bool
  388. */
  389. public function indexExists(): bool
  390. {
  391. $index = config('mint.opensearch.index');
  392. return $this->client->indices()->exists(['index' => $index]);
  393. }
  394. /**
  395. * 创建 OpenSearch 索引
  396. *
  397. * 使用 $indexDefinition 中定义的 settings 和 mappings 创建索引。
  398. * 若索引已存在则抛出异常,避免覆盖生产数据。
  399. *
  400. * @return array OpenSearch 响应
  401. *
  402. * @throws \Exception 索引已存在时抛出
  403. */
  404. public function createIndex(): array
  405. {
  406. $index = config('mint.opensearch.index');
  407. $exists = $this->client->indices()->exists(['index' => $index]);
  408. if ($exists) {
  409. throw new \Exception("Index [$index] already exists.");
  410. }
  411. return $this->client->indices()->create([
  412. 'index' => $index,
  413. 'body' => $this->indexDefinition,
  414. ]);
  415. }
  416. /**
  417. * 更新已有索引的 settings 和 mappings
  418. *
  419. * 更新 settings 时会临时关闭索引(close → putSettings → open),
  420. * 更新 mappings 支持热更新(新增字段),不可修改已有字段类型。
  421. *
  422. * @return array 包含 'settings' 和/或 'mappings' 的响应数组
  423. */
  424. public function updateIndex(): array
  425. {
  426. $index = config('mint.opensearch.index');
  427. $settings = $this->indexDefinition['settings'] ?? [];
  428. $mappings = $this->indexDefinition['mappings'] ?? [];
  429. $response = [];
  430. if (!empty($settings)) {
  431. $this->client->indices()->close(['index' => $index]);
  432. $response['settings'] = $this->client->indices()->putSettings([
  433. 'index' => $index,
  434. 'body' => ['settings' => $settings],
  435. ]);
  436. $this->client->indices()->open(['index' => $index]);
  437. }
  438. if (!empty($mappings)) {
  439. $response['mappings'] = $this->client->indices()->putMapping([
  440. 'index' => $index,
  441. 'body' => $mappings,
  442. ]);
  443. }
  444. return $response;
  445. }
  446. /**
  447. * 删除当前索引
  448. *
  449. * @return array OpenSearch 响应
  450. */
  451. public function deleteIndex(): array
  452. {
  453. $index = config('mint.opensearch.index');
  454. return $this->client->indices()->delete(['index' => $index]);
  455. }
  456. /**
  457. * 统计索引文档数量(支持可选条件过滤)
  458. *
  459. * @param array|null $query OpenSearch DSL query 子句,为 null 时统计全部文档。
  460. * 示例:['term' => ['language' => 'zh']]
  461. * ['exists' => ['field' => 'content.vector']]
  462. * @return int 文档总数
  463. *
  464. * @throws \Exception
  465. *
  466. * @example
  467. * $service->count();
  468. * $service->count(['exists' => ['field' => 'content.vector']]);
  469. */
  470. public function count(?array $query = null): int
  471. {
  472. $index = config('mint.opensearch.index');
  473. $params = ['index' => $index];
  474. if (!empty($query)) {
  475. $params['body'] = ['query' => $query];
  476. }
  477. $response = $this->client->count($params);
  478. return (int) ($response['count'] ?? 0);
  479. }
  480. /**
  481. * 写入或覆盖单条文档
  482. *
  483. * @param string $id 文档 ID
  484. * @param array $body 文档内容,字段结构须与 mappings 一致
  485. * @return array OpenSearch 响应
  486. */
  487. public function create(string $id, array $body): array
  488. {
  489. return $this->client->index([
  490. 'index' => config('mint.opensearch.index'),
  491. 'id' => $id,
  492. 'body' => $body,
  493. ]);
  494. }
  495. /**
  496. * 删除单条文档
  497. *
  498. * @param string $id 文档 ID
  499. * @return array OpenSearch 响应
  500. */
  501. public function delete(string $id): array
  502. {
  503. return $this->client->delete([
  504. 'index' => config('mint.opensearch.index'),
  505. 'id' => $id,
  506. ]);
  507. }
  508. /**
  509. * 执行高级搜索
  510. *
  511. * 支持四种搜索模式:
  512. * - fuzzy 多字段模糊查询(默认),基于 BM25
  513. * - exact 精确匹配,使用 markdown_clean analyzer
  514. * - semantic 纯语义向量搜索,需要 OpenAI embedding
  515. * - hybrid fuzzy + semantic 混合,权重由 fuzzy_ratio / semantic_ratio 控制
  516. *
  517. * 支持的过滤参数:
  518. * resourceType, resourceId, granularity, language, category,
  519. * tags, pageRefs, relatedId, author, channel
  520. *
  521. * @param array $params {
  522. * @type string $query 搜索关键词(必填)
  523. * @type string $searchMode 搜索模式,默认 'fuzzy'
  524. * @type int $page 页码,默认 1
  525. * @type int $pageSize 每页条数,默认 20
  526. * @type string $resourceType 按资源类型过滤
  527. * @type string $resourceId 按资源 ID 过滤
  528. * @type string $granularity 按粒度过滤
  529. * @type string $language 按语言过滤
  530. * @type string $category 按分类过滤
  531. * @type array $tags 按标签过滤(terms)
  532. * @type array $pageRefs 按页码引用过滤(terms)
  533. * @type string $relatedId 按关联 ID 过滤
  534. * @type string $author 按作者过滤
  535. * @type string $channel 按频道过滤
  536. * @type array $highlight_pre_tags 高亮前置标签,默认 ['<mark>']
  537. * @type array $highlight_post_tags 高亮后置标签,默认 ['</mark>']
  538. * }
  539. * @return array OpenSearch 原始响应
  540. *
  541. * @throws \Exception semantic / hybrid 模式下 embedding 调用失败时抛出
  542. */
  543. public function search(array $params): array
  544. {
  545. $page = $params['page'] ?? 1;
  546. $pageSize = $params['pageSize'] ?? 20;
  547. $from = ($page - 1) * $pageSize;
  548. $mode = $params['searchMode'] ?? 'fuzzy';
  549. // ---------- 过滤条件 ----------
  550. $filters = [];
  551. if (!empty($params['resourceType'])) {
  552. $filters[] = ['term' => ['resource_type' => $params['resourceType']]];
  553. }
  554. if (!empty($params['resourceId'])) {
  555. $filters[] = ['term' => ['resource_id' => $params['resourceId']]];
  556. }
  557. if (!empty($params['granularity'])) {
  558. $filters[] = ['term' => ['granularity' => $params['granularity']]];
  559. }
  560. if (!empty($params['language'])) {
  561. $filters[] = ['term' => ['language' => $params['language']]];
  562. }
  563. if (!empty($params['category'])) {
  564. $filters[] = ['term' => ['category' => $params['category']]];
  565. }
  566. if (!empty($params['tags'])) {
  567. $filters[] = ['terms' => ['tags' => $params['tags']]];
  568. }
  569. if (!empty($params['pageRefs'])) {
  570. $filters[] = ['terms' => ['page_refs' => $params['pageRefs']]];
  571. }
  572. if (!empty($params['relatedId'])) {
  573. $filters[] = ['term' => ['related_id' => $params['relatedId']]];
  574. }
  575. if (!empty($params['author'])) {
  576. $filters[] = ['match' => ['metadata.author' => $params['author']]];
  577. }
  578. if (!empty($params['channel'])) {
  579. $filters[] = ['term' => ['metadata.channel' => $params['channel']]];
  580. }
  581. // ---------- 查询部分 ----------
  582. switch ($mode) {
  583. case 'exact':
  584. $query = $this->buildExactQuery($params['query']);
  585. break;
  586. case 'semantic':
  587. $query = $this->buildSemanticQuery($params['query']);
  588. break;
  589. case 'hybrid':
  590. $query = $this->buildHybridQuery($params['query']);
  591. break;
  592. case 'fuzzy':
  593. default:
  594. $query = $this->buildFuzzyQuery($params['query']);
  595. }
  596. $highlightPreTags = $params['highlight_pre_tags'] ?? ['<mark>'];
  597. $highlightPostTags = $params['highlight_post_tags'] ?? ['</mark>'];
  598. // ---------- 最终 DSL ----------
  599. $dsl = [
  600. 'from' => $from,
  601. 'size' => $pageSize,
  602. '_source' => ['excludes' => $this->sourceExcludes],
  603. 'query' => !empty($filters)
  604. ? ['bool' => ['must' => [$query], 'filter' => $filters]]
  605. : $query,
  606. 'aggs' => [
  607. 'resource_type' => ['terms' => ['field' => 'resource_type']],
  608. 'language' => ['terms' => ['field' => 'language']],
  609. 'category' => ['terms' => ['field' => 'category']],
  610. 'granularity' => ['terms' => ['field' => 'granularity']],
  611. ],
  612. 'highlight' => [
  613. 'fields' => [
  614. 'title.text.pali' => new \stdClass(),
  615. 'title.text.zh' => new \stdClass(),
  616. 'summary.text' => new \stdClass(),
  617. 'content.text.pali' => new \stdClass(),
  618. 'content.text.zh' => new \stdClass(),
  619. ],
  620. 'fragmenter' => 'sentence',
  621. 'fragment_size' => 200,
  622. 'number_of_fragments' => 1,
  623. 'pre_tags' => $highlightPreTags,
  624. 'post_tags' => $highlightPostTags,
  625. ],
  626. ];
  627. Log::debug('OpenSearchService::search', ['dsl' => json_encode($dsl, JSON_UNESCAPED_UNICODE)]);
  628. return $this->client->search([
  629. 'index' => config('mint.opensearch.index'),
  630. 'body' => $dsl,
  631. ]);
  632. }
  633. /**
  634. * 构建 exact(精确匹配)查询
  635. *
  636. * 使用 markdown_clean analyzer 的 exact subfield 进行匹配,
  637. * 适合巴利文词形精确检索场景。
  638. *
  639. * 查询字段:title.text.pali.exact, content.text.pali.exact, summary.text
  640. *
  641. * @param string $query 搜索关键词
  642. * @return array OpenSearch DSL query 片段
  643. */
  644. protected function buildExactQuery(string $query): array
  645. {
  646. return [
  647. 'multi_match' => [
  648. 'query' => $query,
  649. 'fields' => [
  650. 'title.text.pali.exact',
  651. 'content.text.pali.exact',
  652. 'summary.text',
  653. ],
  654. 'type' => 'best_fields',
  655. ],
  656. ];
  657. }
  658. /**
  659. * 构建 semantic(纯语义向量)查询
  660. *
  661. * 将查询文本通过 OpenAI embedding API 转为向量,
  662. * 同时对 content.vector、summary.vector、title.vector 三个 knn 字段检索,
  663. * 使用 bool should 合并结果。
  664. *
  665. * @param string $query 搜索关键词
  666. * @return array OpenSearch DSL query 片段
  667. *
  668. * @throws \Exception embedding 调用失败时抛出
  669. */
  670. protected function buildSemanticQuery(string $query): array
  671. {
  672. $vector = $this->embedText($query);
  673. return [
  674. 'bool' => [
  675. 'should' => [
  676. ['knn' => ['content.vector' => ['vector' => $vector, 'k' => 20]]],
  677. ['knn' => ['summary.vector' => ['vector' => $vector, 'k' => 10]]],
  678. ['knn' => ['title.vector' => ['vector' => $vector, 'k' => 5]]],
  679. ],
  680. 'minimum_should_match' => 1,
  681. ],
  682. ];
  683. }
  684. /**
  685. * 构建 fuzzy(多字段模糊)查询
  686. *
  687. * 基于 BM25 的 multi_match best_fields 查询,
  688. * 字段权重取自 $weights['fuzzy']。
  689. *
  690. * @param string $query 搜索关键词
  691. * @return array OpenSearch DSL query 片段
  692. */
  693. protected function buildFuzzyQuery(string $query): array
  694. {
  695. $fields = [];
  696. foreach ($this->weights['fuzzy'] as $field => $weight) {
  697. $fields[] = $field . '^' . $weight;
  698. }
  699. return [
  700. 'multi_match' => [
  701. 'query' => $query,
  702. 'fields' => $fields,
  703. 'type' => 'best_fields',
  704. ],
  705. ];
  706. }
  707. /**
  708. * 构建 hybrid(模糊 + 语义混合)查询
  709. *
  710. * 使用 bool should 将 fuzzy(constant_score 包裹)与三路 knn 向量查询合并,
  711. * 权重比例由 $weights['hybrid']['fuzzy_ratio'] 和 'semantic_ratio' 控制。
  712. * title.vector 的语义权重略高(×1.2),以提升标题匹配的排名。
  713. *
  714. * @param string $query 搜索关键词
  715. * @return array OpenSearch DSL query 片段
  716. *
  717. * @throws \Exception embedding 调用失败时抛出
  718. */
  719. protected function buildHybridQuery(string $query): array
  720. {
  721. $fuzzyFields = [];
  722. foreach ($this->weights['hybrid'] as $field => $weight) {
  723. if (in_array($field, ['fuzzy_ratio', 'semantic_ratio'])) {
  724. continue;
  725. }
  726. $fuzzyFields[] = $field . '^' . $weight;
  727. }
  728. $fuzzyPart = [
  729. 'multi_match' => [
  730. 'query' => $query,
  731. 'fields' => $fuzzyFields,
  732. 'type' => 'best_fields',
  733. ],
  734. ];
  735. $vector = $this->embedText($query);
  736. $fuzzyRatio = $this->weights['hybrid']['fuzzy_ratio'];
  737. $semanticRatio = $this->weights['hybrid']['semantic_ratio'];
  738. return [
  739. 'bool' => [
  740. 'should' => [
  741. [
  742. 'constant_score' => [
  743. 'filter' => $fuzzyPart,
  744. 'boost' => $fuzzyRatio,
  745. ],
  746. ],
  747. [
  748. 'knn' => [
  749. 'content.vector' => [
  750. 'vector' => $vector,
  751. 'k' => 20,
  752. 'boost' => $semanticRatio * 1.0,
  753. ],
  754. ],
  755. ],
  756. [
  757. 'knn' => [
  758. 'summary.vector' => [
  759. 'vector' => $vector,
  760. 'k' => 10,
  761. 'boost' => $semanticRatio * 0.8,
  762. ],
  763. ],
  764. ],
  765. [
  766. 'knn' => [
  767. 'title.vector' => [
  768. 'vector' => $vector,
  769. 'k' => 5,
  770. 'boost' => $semanticRatio * 1.2, // title 权重略高
  771. ],
  772. ],
  773. ],
  774. ],
  775. ],
  776. ];
  777. }
  778. /**
  779. * 调用 OpenAI Embedding API 将文本转为向量
  780. *
  781. * 使用 Redis 缓存(TTL 7 天),相同文本不会重复请求 API,
  782. * 缓存 key 格式为 "embedding:{md5(text)}"。
  783. *
  784. * @param string $text 输入文本
  785. * @return array 1536 维 float 向量
  786. *
  787. * @throws \Exception 未设置 OPENAI_API_KEY 或 API 返回异常时抛出
  788. */
  789. protected function embedText(string $text): array
  790. {
  791. if (!$this->openaiApiKey) {
  792. throw new Exception('请在 .env 设置 OPENAI_API_KEY');
  793. }
  794. $cacheKey = 'embedding:' . md5($text);
  795. return Cache::remember($cacheKey, now()->addDays(7), function () use ($text) {
  796. $response = $this->http->post('embeddings', [
  797. 'headers' => [
  798. 'Authorization' => 'Bearer ' . $this->openaiApiKey,
  799. 'Content-Type' => 'application/json',
  800. ],
  801. 'json' => [
  802. 'model' => 'text-embedding-3-small',
  803. 'input' => $text,
  804. ],
  805. ]);
  806. $json = json_decode((string) $response->getBody(), true);
  807. if (empty($json['data'][0]['embedding'])) {
  808. throw new Exception('OpenAI embedding 返回异常: ' . json_encode($json));
  809. }
  810. return $json['data'][0]['embedding'];
  811. });
  812. }
  813. /**
  814. * 清除指定文本的 embedding 缓存
  815. *
  816. * @param string $text 原始文本(与调用 embedText 时一致)
  817. * @return bool 缓存是否成功删除
  818. *
  819. * @example
  820. * $service->clearEmbeddingCache('sabbe dhammā anattā');
  821. */
  822. public function clearEmbeddingCache(string $text): bool
  823. {
  824. $cacheKey = 'embedding:' . md5($text);
  825. return Cache::forget($cacheKey);
  826. }
  827. /**
  828. * 清除 Redis 中所有 embedding 缓存
  829. *
  830. * 匹配 "embedding:*" 模式的全部键,生产环境请谨慎调用。
  831. *
  832. * @return int 已删除的缓存条数
  833. *
  834. * @example
  835. * $count = $service->clearAllEmbeddingCache();
  836. * echo "已清理缓存 {$count} 条";
  837. */
  838. public function clearAllEmbeddingCache(): int
  839. {
  840. $redis = Cache::getRedis();
  841. $keys = $redis->keys('embedding:*');
  842. if (!empty($keys)) {
  843. $redis->del($keys);
  844. }
  845. return count($keys);
  846. }
  847. /**
  848. * 自动建议(Completion Suggest)
  849. *
  850. * 基于 completion 字段实现前缀补全,支持同时查询多个语言字段。
  851. * 结果按 _score 降序排序,跨字段去重。
  852. *
  853. * 可用字段标识符($fields 参数):
  854. * - 'title_pali' → title.suggest.pali
  855. * - 'title_zh' → title.suggest.zh
  856. * - 'content_pali' → content.suggest.pali
  857. * - 'content_zh' → content.suggest.zh
  858. *
  859. * @param string $query 查询前缀文本
  860. * @param array|string|null $fields 要查询的字段标识符,null 表示全部字段
  861. * @param string|null $language 可选的语言过滤(term query)
  862. * @param int $limit 每个字段返回的建议数量,默认 10
  863. * @return array 建议结果列表,每项包含:
  864. * text, source(字段标识符), score, doc_id, doc_source
  865. *
  866. * @throws \InvalidArgumentException $fields 中含无效字段标识符时抛出
  867. *
  868. * @example
  869. * // 查询所有字段
  870. * $service->suggest('nibb');
  871. *
  872. * // 只查询巴利文标题建议
  873. * $service->suggest('nibb', 'title_pali');
  874. *
  875. * // 查询多个字段,限制语言
  876. * $service->suggest('涅', ['title_zh', 'content_zh'], 'zh', 5);
  877. */
  878. public function suggest(
  879. string $query,
  880. $fields = null,
  881. ?string $language = null,
  882. int $limit = 10
  883. ): array {
  884. // 字段标识符 → OpenSearch completion 字段路径
  885. $fieldMap = [
  886. 'title_pali' => 'title.suggest.pali',
  887. 'title_zh' => 'title.suggest.zh',
  888. 'content_pali' => 'content.suggest.pali',
  889. 'content_zh' => 'content.suggest.zh',
  890. ];
  891. // 处理字段参数
  892. if ($fields === null) {
  893. $searchFields = array_keys($fieldMap);
  894. } elseif (is_string($fields)) {
  895. $searchFields = [$fields];
  896. } else {
  897. $searchFields = $fields;
  898. }
  899. // 过滤无效字段
  900. $searchFields = array_values(array_filter(
  901. $searchFields,
  902. fn($field) => isset($fieldMap[$field])
  903. ));
  904. if (empty($searchFields)) {
  905. throw new \InvalidArgumentException('Invalid fields specified for suggestion');
  906. }
  907. // 构建 suggest DSL
  908. $suggests = [];
  909. foreach ($searchFields as $field) {
  910. $suggests[$field . '_suggest'] = [
  911. 'prefix' => $query,
  912. 'completion' => [
  913. 'field' => $fieldMap[$field],
  914. 'size' => $limit,
  915. 'skip_duplicates' => true,
  916. ],
  917. ];
  918. }
  919. $dsl = ['suggest' => $suggests];
  920. if ($language) {
  921. $dsl['query'] = ['term' => ['language' => $language]];
  922. }
  923. $response = $this->client->search([
  924. 'index' => config('mint.opensearch.index'),
  925. 'body' => $dsl,
  926. ]);
  927. // 整理结果,附加来源字段
  928. $results = [];
  929. foreach ($searchFields as $field) {
  930. $options = $response['suggest'][$field . '_suggest'][0]['options'] ?? [];
  931. foreach ($options as $opt) {
  932. $results[] = [
  933. 'text' => $opt['text'] ?? '',
  934. 'source' => $field,
  935. 'score' => $opt['_score'] ?? 0,
  936. 'doc_id' => $opt['_id'] ?? null,
  937. 'doc_source' => $opt['_source'] ?? null,
  938. ];
  939. }
  940. }
  941. // 按分数降序排序
  942. usort($results, fn($a, $b) => $b['score'] <=> $a['score']);
  943. return $results;
  944. }
  945. /**
  946. * 按文档 ID 获取单条完整文档(包含 content.display)
  947. *
  948. * @param string $id 文档 ID,例如 "term_{guid}"
  949. * @return array OpenSearch 原始响应
  950. */
  951. public function get(string $id): array
  952. {
  953. return $this->client->get([
  954. 'index' => config('mint.opensearch.index'),
  955. 'id' => $id,
  956. ]);
  957. }
  958. }