OpenSearchService.php 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866
  1. <?php
  2. // api-v8/app/Services/OpenSearchService.php
  3. namespace App\Services;
  4. use OpenSearch\GuzzleClientFactory;
  5. use Illuminate\Support\Facades\Log;
  6. use GuzzleHttp\Client;
  7. use Illuminate\Support\Facades\Cache;
  8. use Exception;
  9. class OpenSearchService
  10. {
  11. protected $client;
  12. protected $http;
  13. protected $openaiApiKey;
  14. /** 默认查询排除字段 **/
  15. private $sourceExcludes = [
  16. 'title.suggest',
  17. 'content.suggest',
  18. ];
  19. /** 默认权重配置 **/
  20. private $weights = [
  21. 'fuzzy' => [
  22. 'bold_single' => 50,
  23. 'bold_multi' => 10,
  24. 'title.pali.text' => 3,
  25. 'title.zh' => 3,
  26. 'summary.text' => 2,
  27. 'content.pali.text' => 1,
  28. 'content.zh' => 1,
  29. ],
  30. 'hybrid' => [
  31. 'fuzzy_ratio' => 0.7,
  32. 'semantic_ratio' => 0.3,
  33. 'bold_single' => 50,
  34. 'bold_multi' => 10,
  35. 'title.pali.text' => 3,
  36. 'title.zh' => 3,
  37. 'summary.text' => 2,
  38. 'content.pali.text' => 1,
  39. 'content.zh' => 1,
  40. ],
  41. ];
  42. private $indexDefinition = [
  43. 'settings' => [
  44. 'index' => [
  45. 'knn' => true,
  46. ],
  47. 'analysis' => [
  48. 'analyzer' => [
  49. /** */
  50. 'pali_query_analyzer' => [
  51. 'tokenizer' => 'standard',
  52. 'filter' => ['lowercase', 'pali_synonyms'],
  53. ],
  54. 'pali_index_analyzer' => [
  55. 'type' => 'custom',
  56. 'tokenizer' => 'standard',
  57. 'char_filter' => ['markdown_strip'],
  58. 'filter' => ['lowercase'],
  59. ],
  60. 'markdown_clean' => [
  61. 'type' => 'custom',
  62. 'tokenizer' => 'standard',
  63. 'char_filter' => ['markdown_strip'],
  64. 'filter' => ['lowercase'],
  65. ],
  66. // Suggest 专用(忽略大小写 + 变音)
  67. 'pali_suggest_analyzer' => [
  68. 'tokenizer' => 'standard',
  69. 'filter' => ['lowercase', 'asciifolding']
  70. ],
  71. 'zh_suggest_analyzer' => [
  72. 'tokenizer' => 'ik_smart',
  73. 'char_filter' => ['tsconvert'],
  74. ],
  75. // 中文简繁统一 (繁 -> 简)
  76. 'zh_index_analyzer' => [
  77. 'tokenizer' => 'ik_max_word',
  78. 'char_filter' => ['tsconvert'],
  79. ],
  80. 'zh_query_analyzer' => [
  81. 'tokenizer' => 'ik_smart',
  82. 'char_filter' => ['tsconvert'],
  83. ]
  84. ],
  85. 'filter' => [
  86. 'pali_synonyms' => [
  87. 'type' => 'synonym_graph',
  88. 'synonyms_path' => 'analysis/pali_synonyms.txt',
  89. 'updateable' => true,
  90. ],
  91. ],
  92. 'char_filter' => [
  93. 'markdown_strip' => [
  94. 'type' => 'pattern_replace',
  95. 'pattern' => '\\*\\*|\\*|_|`|~',
  96. 'replacement' => '',
  97. ],
  98. "tsconvert" => [
  99. "type" => "stconvert",
  100. "convert_type" => "t2s"
  101. ]
  102. ],
  103. ],
  104. ],
  105. 'mappings' => [
  106. 'properties' => [
  107. 'id' => ['type' => 'keyword'],
  108. 'resource_id' => ['type' => 'keyword'],
  109. 'resource_type' => ['type' => 'keyword'],
  110. 'title' => [
  111. 'properties' => [
  112. 'pali' => [
  113. 'type' => 'text',
  114. 'fields' => [
  115. /**模糊查询 */
  116. 'text' => [
  117. 'type' => 'text',
  118. 'analyzer' => 'pali_index_analyzer',
  119. 'search_analyzer' => 'pali_query_analyzer',
  120. ],
  121. /**准确查询 */
  122. 'exact' => [
  123. 'type' => 'text',
  124. 'analyzer' => 'markdown_clean',
  125. ],
  126. ],
  127. ],
  128. 'zh' => [
  129. 'type' => 'text',
  130. 'analyzer' => 'zh_index_analyzer',
  131. 'search_analyzer' => 'zh_query_analyzer',
  132. ],
  133. 'vector' => [
  134. 'type' => 'knn_vector',
  135. 'dimension' => 1536,
  136. 'method' => [
  137. 'name' => 'hnsw',
  138. 'space_type' => 'cosinesimil',
  139. 'engine' => 'nmslib',
  140. ],
  141. ],
  142. // 自动建议字段
  143. 'suggest_pali' => [
  144. 'type' => 'completion',
  145. 'analyzer' => 'pali_suggest_analyzer'
  146. ],
  147. 'suggest_zh' => [
  148. 'type' => 'completion',
  149. 'analyzer' => 'zh_suggest_analyzer'
  150. ],
  151. ],
  152. ],
  153. /** 简体中文 llm生成 */
  154. 'summary' => [
  155. 'properties' => [
  156. 'text' => [
  157. 'type' => 'text',
  158. 'analyzer' => 'zh_index_analyzer',
  159. 'search_analyzer' => 'zh_query_analyzer',
  160. ],
  161. 'vector' => [
  162. 'type' => 'knn_vector',
  163. 'dimension' => 1536,
  164. 'method' => [
  165. 'name' => 'hnsw',
  166. 'space_type' => 'cosinesimil',
  167. 'engine' => 'nmslib',
  168. ],
  169. ],
  170. ]
  171. ],
  172. 'content' => [
  173. 'properties' => [
  174. 'pali' => [
  175. 'type' => 'text',
  176. 'fields' => [
  177. /**模糊查询 */
  178. 'text' => [
  179. 'type' => 'text',
  180. 'analyzer' => 'pali_index_analyzer',
  181. 'search_analyzer' => 'pali_query_analyzer',
  182. ],
  183. /**准确查询 */
  184. 'exact' => [
  185. 'type' => 'text',
  186. 'analyzer' => 'markdown_clean',
  187. ],
  188. ],
  189. ],
  190. 'zh' => [
  191. 'type' => 'text',
  192. 'analyzer' => 'zh_index_analyzer',
  193. 'search_analyzer' => 'zh_query_analyzer',
  194. ],
  195. 'tokens' => [
  196. 'type' => 'nested',
  197. 'properties' => [
  198. 'surface' => ['type' => 'keyword'],
  199. 'lemma' => ['type' => 'keyword'],
  200. 'compound_parts' => ['type' => 'keyword'],
  201. 'case' => ['type' => 'keyword'],
  202. ],
  203. ],
  204. 'vector' => [
  205. 'type' => 'knn_vector',
  206. 'dimension' => 1536,
  207. 'method' => [
  208. 'name' => 'hnsw',
  209. 'space_type' => 'cosinesimil',
  210. 'engine' => 'nmslib',
  211. ],
  212. ],
  213. 'suggest_pali' => [
  214. 'type' => 'completion',
  215. 'analyzer' => 'pali_suggest_analyzer'
  216. ],
  217. 'suggest_zh' => [
  218. 'type' => 'completion',
  219. 'analyzer' => 'zh_suggest_analyzer'
  220. ],
  221. ],
  222. ],
  223. 'related_id' => ['type' => 'keyword'],
  224. 'bold_single' => [
  225. 'type' => 'text',
  226. 'analyzer' => 'standard',
  227. 'search_analyzer' => 'pali_query_analyzer',
  228. ],
  229. 'bold_multi' => [
  230. 'type' => 'text',
  231. 'analyzer' => 'standard',
  232. 'search_analyzer' => 'pali_query_analyzer',
  233. ],
  234. 'path' => ['type' => 'text', 'analyzer' => 'standard'],
  235. 'page_refs' => [
  236. 'type' => 'keyword',
  237. ],
  238. 'tags' => ['type' => 'keyword'],
  239. 'category' => ['type' => 'keyword'],
  240. 'author' => ['type' => 'text'],
  241. 'language' => ['type' => 'keyword'],
  242. 'updated_at' => ['type' => 'date'],
  243. 'granularity' => ['type' => 'keyword'],
  244. 'metadata' => [
  245. 'properties' => [
  246. 'APA' => ['type' => 'text', 'index' => false],
  247. 'MLA' => ['type' => 'text', 'index' => false],
  248. 'widget' => ['type' => 'text', 'index' => false],
  249. 'author' => ['type' => 'text'], //
  250. 'channel' => ['type' => 'text'], //
  251. ],
  252. ],
  253. ],
  254. ],
  255. ];
  256. public function __construct()
  257. {
  258. $config = config('mint.opensearch.config');
  259. $hostUrl = "{$config['scheme']}://{$config['host']}:{$config['port']}";
  260. $this->client = (new GuzzleClientFactory())->create([
  261. 'base_uri' => $hostUrl,
  262. 'auth' => [$config['username'], $config['password']],
  263. 'verify' => $config['ssl_verification'],
  264. ]);
  265. $this->openaiApiKey = env('OPENAI_API_KEY');
  266. $this->http = new Client([
  267. 'base_uri' => 'https://api.openai.com/v1/',
  268. 'timeout' => 15,
  269. ]);
  270. }
  271. public function setWeights(string $mode, array $weights)
  272. {
  273. if (isset($this->weights[$mode])) {
  274. $this->weights[$mode] = array_merge($this->weights[$mode], $weights);
  275. }
  276. }
  277. public function testConnection()
  278. {
  279. try {
  280. $info = $this->client->info();
  281. $message = 'OpenSearch 连接成功: ' . json_encode($info['version']['number']);
  282. Log::info($message);
  283. return [true, $message];
  284. } catch (\Exception $e) {
  285. $message = 'OpenSearch 连接失败: ' . $e->getMessage();
  286. Log::error($message);
  287. return [false, $message];
  288. }
  289. }
  290. public function indexExists()
  291. {
  292. $index = config('mint.opensearch.index');
  293. return $this->client->indices()->exists(['index' => $index]);
  294. }
  295. /** 索引管理方法保持不变... **/
  296. public function createIndex()
  297. {
  298. $index = config('mint.opensearch.index');
  299. $exists = $this->client->indices()->exists(['index' => $index]);
  300. if ($exists) {
  301. throw new \Exception("Index [$index] already exists.");
  302. }
  303. return $this->client->indices()->create([
  304. 'index' => $index,
  305. 'body' => $this->indexDefinition
  306. ]);
  307. }
  308. public function updateIndex()
  309. {
  310. $index = config('mint.opensearch.index');
  311. $settings = $this->indexDefinition['settings'] ?? [];
  312. $mappings = $this->indexDefinition['mappings'] ?? [];
  313. $response = [];
  314. if (!empty($settings)) {
  315. $this->client->indices()->close(['index' => $index]);
  316. $response['settings'] = $this->client->indices()->putSettings([
  317. 'index' => $index,
  318. 'body' => ['settings' => $settings]
  319. ]);
  320. $this->client->indices()->open(['index' => $index]);
  321. }
  322. if (!empty($mappings)) {
  323. $response['mappings'] = $this->client->indices()->putMapping([
  324. 'index' => $index,
  325. 'body' => $mappings
  326. ]);
  327. }
  328. return $response;
  329. }
  330. public function deleteIndex()
  331. {
  332. $index = config('mint.opensearch.index');
  333. return $this->client->indices()->delete(['index' => $index]);
  334. }
  335. /**
  336. * 获取索引文档数量(支持条件统计)
  337. *
  338. * @param array|null $query 可选的查询条件(OpenSearch DSL query 部分)
  339. * 例如:
  340. * [
  341. * 'term' => ['language' => 'zh']
  342. * ]
  343. *
  344. * @return int 文档总数
  345. *
  346. * @throws \Exception
  347. *
  348. * @example
  349. * // 获取索引全部文档数量
  350. * $count = $service->count();
  351. *
  352. * // 按条件统计(例如:只统计有 embedding 的文档)
  353. * $count = $service->count([
  354. * 'exists' => ['field' => 'content.vector']
  355. * ]);
  356. */
  357. public function count(?array $query = null): int
  358. {
  359. $index = config('mint.opensearch.index');
  360. $params = [
  361. 'index' => $index,
  362. ];
  363. // 如果传入 query,则按条件统计
  364. if (!empty($query)) {
  365. $params['body'] = [
  366. 'query' => $query
  367. ];
  368. }
  369. $response = $this->client->count($params);
  370. return (int) ($response['count'] ?? 0);
  371. }
  372. public function create(string $id, array $body)
  373. {
  374. return $this->client->index([
  375. 'index' => config('mint.opensearch.index'),
  376. 'id' => $id,
  377. 'body' => $body
  378. ]);
  379. }
  380. public function delete($id)
  381. {
  382. return $this->client->delete(['index' => config('mint.opensearch.index'), 'id' => $id]);
  383. }
  384. /**
  385. * 执行高级搜索(支持 fuzzy / exact / semantic / hybrid 四种模式)
  386. *
  387. * @param array $params 搜索参数数组
  388. * - query: 搜索关键词
  389. * - searchMode: 搜索模式 (fuzzy|exact|semantic|hybrid)
  390. * - page: 页码,默认 1
  391. * - pageSize: 每页条数,默认 20
  392. * - resourceType / language / category / tags / relatedId / pageRefs / author / channel 等过滤条件
  393. * @return array OpenSearch 返回的搜索结果
  394. *
  395. * @throws \Exception
  396. */
  397. public function search(array $params)
  398. {
  399. // 分页参数
  400. $page = $params['page'] ?? 1;
  401. $pageSize = $params['pageSize'] ?? 20;
  402. $from = ($page - 1) * $pageSize;
  403. // 搜索模式,默认 fuzzy
  404. $mode = $params['searchMode'] ?? 'fuzzy';
  405. // ---------- 过滤条件 ----------
  406. $filters = [];
  407. if (!empty($params['resourceType'])) {
  408. $filters[] = ['term' => ['resource_type' => $params['resourceType']]];
  409. }
  410. if (!empty($params['resourceId'])) {
  411. $filters[] = ['term' => ['resource_id' => $params['resourceId']]];
  412. }
  413. if (!empty($params['granularity'])) {
  414. $filters[] = ['term' => ['granularity' => $params['granularity']]];
  415. }
  416. if (!empty($params['language'])) {
  417. $filters[] = ['term' => ['language' => $params['language']]];
  418. }
  419. if (!empty($params['category'])) {
  420. $filters[] = ['term' => ['category' => $params['category']]];
  421. }
  422. if (!empty($params['tags'])) {
  423. $filters[] = ['terms' => ['tags' => $params['tags']]];
  424. }
  425. if (!empty($params['pageRefs'])) {
  426. $filters[] = ['terms' => ['page_refs' => $params['pageRefs']]];
  427. }
  428. if (!empty($params['relatedId'])) {
  429. $filters[] = ['term' => ['related_id' => $params['relatedId']]];
  430. }
  431. if (!empty($params['author'])) {
  432. $filters[] = ['match' => ['metadata.author' => $params['author']]];
  433. }
  434. if (!empty($params['channel'])) {
  435. $filters[] = ['term' => ['metadata.channel' => $params['channel']]];
  436. }
  437. // ---------- 查询部分 ----------
  438. switch ($mode) {
  439. case 'exact':
  440. $query = $this->buildExactQuery($params['query']);
  441. break;
  442. case 'semantic':
  443. $query = $this->buildSemanticQuery($params['query']);
  444. break;
  445. case 'hybrid':
  446. $query = $this->buildHybridQuery($params['query']);
  447. break;
  448. case 'fuzzy':
  449. default:
  450. $query = $this->buildFuzzyQuery($params['query']);
  451. }
  452. if (!empty($params['highlight_pre_tags'])) {
  453. $highlight_pre_tags = $params['highlight_pre_tags'];
  454. } else {
  455. $highlight_pre_tags = ['<mark>'];
  456. }
  457. if (!empty($params['highlight_post_tags'])) {
  458. $highlight_post_tags = $params['highlight_post_tags'];
  459. } else {
  460. $highlight_post_tags = ['</mark>'];
  461. }
  462. // ---------- 最终 DSL ----------
  463. $dsl = [
  464. 'from' => $from,
  465. 'size' => $pageSize,
  466. '_source' => [
  467. 'excludes' => $this->sourceExcludes
  468. ],
  469. 'query' => !empty($filters)
  470. ? ['bool' => ['must' => [$query], 'filter' => $filters]]
  471. : $query,
  472. 'aggs' => [
  473. 'resource_type' => ['terms' => ['field' => 'resource_type']],
  474. 'language' => ['terms' => ['field' => 'language']],
  475. 'category' => ['terms' => ['field' => 'category']],
  476. 'granularity' => ['terms' => ['field' => 'granularity']],
  477. ],
  478. 'highlight' => [
  479. 'fields' => [
  480. 'title.pali.text' => new \stdClass(),
  481. 'title.zh' => new \stdClass(),
  482. 'summary.text' => new \stdClass(),
  483. 'content.pali.text' => new \stdClass(),
  484. 'content.zh' => new \stdClass(),
  485. ],
  486. "fragmenter" => "sentence",
  487. "fragment_size" => 200,
  488. "number_of_fragments" => 1,
  489. 'pre_tags' => $highlight_pre_tags,
  490. 'post_tags' => $highlight_post_tags,
  491. ],
  492. ];
  493. Log::debug('search', ['dsl' => json_encode($dsl, JSON_UNESCAPED_UNICODE)]);
  494. // ---------- 执行查询 ----------
  495. $response = $this->client->search([
  496. 'index' => config('mint.opensearch.index'),
  497. 'body' => $dsl
  498. ]);
  499. return $response;
  500. }
  501. /**
  502. * 构建 exact 查询
  503. * 精确匹配 title.pali.exact, content.pali.exact, summary
  504. */
  505. protected function buildExactQuery(string $query): array
  506. {
  507. return [
  508. 'multi_match' => [
  509. 'query' => $query,
  510. 'fields' => [
  511. 'title.pali.exact',
  512. 'content.pali.exact',
  513. 'summary.text'
  514. ],
  515. 'type' => 'best_fields',
  516. ]
  517. ];
  518. }
  519. /**
  520. * 构建 semantic 查询
  521. * 使用 OpenAI embedding,同时查询三个向量字段
  522. */
  523. protected function buildSemanticQuery(string $query): array
  524. {
  525. $vector = $this->embedText($query);
  526. // OpenSearch 支持多个 knn 查询,使用 bool should
  527. return [
  528. 'bool' => [
  529. 'should' => [
  530. [
  531. 'knn' => [
  532. 'content.vector' => [
  533. 'vector' => $vector,
  534. 'k' => 20,
  535. ]
  536. ]
  537. ],
  538. [
  539. 'knn' => [
  540. 'summary.vector' => [
  541. 'vector' => $vector,
  542. 'k' => 10,
  543. ]
  544. ]
  545. ],
  546. [
  547. 'knn' => [
  548. 'title.vector' => [
  549. 'vector' => $vector,
  550. 'k' => 5,
  551. ]
  552. ]
  553. ]
  554. ],
  555. 'minimum_should_match' => 1
  556. ]
  557. ];
  558. }
  559. /**
  560. * 构建 fuzzy 查询
  561. */
  562. protected function buildFuzzyQuery(string $query)
  563. {
  564. $fields = [];
  565. foreach ($this->weights['fuzzy'] as $field => $weight) {
  566. $fields[] = $field . "^" . $weight;
  567. }
  568. return [
  569. 'multi_match' => [
  570. 'query' => $query,
  571. 'fields' => $fields,
  572. 'type' => 'best_fields'
  573. ]
  574. ];
  575. }
  576. /**
  577. * 构建 hybrid 查询 (fuzzy + semantic)
  578. */
  579. protected function buildHybridQuery(string $query)
  580. {
  581. $fuzzyFields = [];
  582. foreach ($this->weights['hybrid'] as $field => $weight) {
  583. if (in_array($field, ['fuzzy_ratio', 'semantic_ratio'])) {
  584. continue;
  585. }
  586. $fuzzyFields[] = $field . "^" . $weight;
  587. }
  588. $fuzzyPart = [
  589. 'multi_match' => [
  590. 'query' => $query,
  591. 'fields' => $fuzzyFields,
  592. 'type' => 'best_fields'
  593. ]
  594. ];
  595. $vector = $this->embedText($query);
  596. $fuzzyRatio = $this->weights['hybrid']['fuzzy_ratio'];
  597. $semanticRatio = $this->weights['hybrid']['semantic_ratio'];
  598. // 使用 bool should 组合 fuzzy 和 semantic 查询
  599. return [
  600. 'bool' => [
  601. 'should' => [
  602. // Fuzzy 部分,带权重
  603. [
  604. 'constant_score' => [
  605. 'filter' => $fuzzyPart,
  606. 'boost' => $fuzzyRatio
  607. ]
  608. ],
  609. // Semantic 部分 - content
  610. [
  611. 'knn' => [
  612. 'content.vector' => [
  613. 'vector' => $vector,
  614. 'k' => 20,
  615. 'boost' => $semanticRatio * 1.0 // 主要权重
  616. ]
  617. ]
  618. ],
  619. // Semantic 部分 - summary
  620. [
  621. 'knn' => [
  622. 'summary.vector' => [
  623. 'vector' => $vector,
  624. 'k' => 10,
  625. 'boost' => $semanticRatio * 0.8
  626. ]
  627. ]
  628. ],
  629. // Semantic 部分 - title
  630. [
  631. 'knn' => [
  632. 'title.vector' => [
  633. 'vector' => $vector,
  634. 'k' => 5,
  635. 'boost' => $semanticRatio * 1.2 // title 稍微高一点
  636. ]
  637. ]
  638. ]
  639. ]
  640. ]
  641. ];
  642. }
  643. /**
  644. * 调用 OpenAI Embedding API
  645. * 使用 Redis 缓存,避免重复调用
  646. *
  647. * @param string $text 输入文本
  648. * @return array 向量 embedding
  649. * @throws \Exception
  650. */
  651. protected function embedText(string $text): array
  652. {
  653. if (!$this->openaiApiKey) {
  654. throw new Exception("请在 .env 设置 OPENAI_API_KEY");
  655. }
  656. // 缓存 key,可以用 md5 保证唯一
  657. $cacheKey = "embedding:" . md5($text);
  658. // 先查缓存
  659. return Cache::remember($cacheKey, now()->addDays(7), function () use ($text) {
  660. $response = $this->http->post('embeddings', [
  661. 'headers' => [
  662. 'Authorization' => 'Bearer ' . $this->openaiApiKey,
  663. 'Content-Type' => 'application/json',
  664. ],
  665. 'json' => [
  666. 'model' => 'text-embedding-3-small',
  667. 'input' => $text,
  668. ],
  669. ]);
  670. $json = json_decode((string)$response->getBody(), true);
  671. if (empty($json['data'][0]['embedding'])) {
  672. throw new Exception("OpenAI embedding 返回异常: " . json_encode($json));
  673. }
  674. return $json['data'][0]['embedding'];
  675. });
  676. }
  677. /**
  678. * 清理指定文本的 embedding 缓存
  679. * $service = app(App\Services\OpenSearchService::class);
  680. // 清理某个文本的缓存
  681. $service->clearEmbeddingCache("sabbe dhammā anattā");
  682. // 清理所有 embedding 缓存
  683. $count = $service->clearAllEmbeddingCache();
  684. echo "已清理缓存 {$count} 条";
  685. *
  686. * @param string $text
  687. * @return bool
  688. */
  689. public function clearEmbeddingCache(string $text): bool
  690. {
  691. $cacheKey = "embedding:" . md5($text);
  692. return Cache::forget($cacheKey);
  693. }
  694. /**
  695. * 清理所有 embedding 缓存
  696. * 注意:这会删除 Redis 里所有 "embedding:*" 的缓存
  697. *
  698. * @return int 删除的条数
  699. */
  700. public function clearAllEmbeddingCache(): int
  701. {
  702. $redis = Cache::getRedis();
  703. $pattern = "embedding:*";
  704. $keys = $redis->keys($pattern);
  705. if (!empty($keys)) {
  706. $redis->del($keys);
  707. }
  708. return count($keys);
  709. }
  710. /**
  711. * 自动建议
  712. *
  713. * @param string $query 查询文本
  714. * @param array|string|null $fields 要查询的字段,可选值:
  715. * - null: 查询所有字段 ['title', 'content', 'page_refs']
  716. * - 'title': 只查询 title.suggest
  717. * - 'content': 只查询 content.pali.suggest
  718. * - 'page_refs': 只查询 page_refs.suggest
  719. * - ['title', 'content']: 查询多个字段
  720. * @param string|null $language 语言过滤(可选)
  721. * @param int $limit 每个字段返回的建议数量
  722. * @return array
  723. */
  724. public function suggest(
  725. string $query,
  726. $fields = null,
  727. ?string $language = null,
  728. int $limit = 10
  729. ): array {
  730. // 字段映射配置
  731. $fieldMap = [
  732. 'title_pali' => 'title.suggest_pali',
  733. 'title_zh' => 'title.suggest_zh',
  734. 'content_pali' => 'content.suggest_pali',
  735. 'content_zh' => 'content.suggest_zh',
  736. ];
  737. // 处理字段参数
  738. if ($fields === null) {
  739. // 默认查询所有字段
  740. $searchFields = array_keys($fieldMap);
  741. } elseif (is_string($fields)) {
  742. // 单个字段
  743. $searchFields = [$fields];
  744. } else {
  745. // 数组形式
  746. $searchFields = $fields;
  747. }
  748. // 验证字段有效性
  749. $searchFields = array_filter($searchFields, function ($field) use ($fieldMap) {
  750. return isset($fieldMap[$field]);
  751. });
  752. if (empty($searchFields)) {
  753. throw new \InvalidArgumentException('Invalid fields specified for suggestion');
  754. }
  755. // 构建 suggest 查询
  756. $suggests = [];
  757. foreach ($searchFields as $field) {
  758. $suggests[$field . '_suggest'] = [
  759. 'prefix' => $query,
  760. 'completion' => [
  761. 'field' => $fieldMap[$field],
  762. 'size' => $limit,
  763. 'skip_duplicates' => true,
  764. ]
  765. ];
  766. }
  767. $dsl = ['suggest' => $suggests];
  768. // 添加语言过滤
  769. if ($language) {
  770. $dsl['query'] = ['term' => ['language' => $language]];
  771. }
  772. $response = $this->client->search([
  773. 'index' => config('mint.opensearch.index'),
  774. 'body' => $dsl
  775. ]);
  776. // 处理返回结果,包含来源信息
  777. $results = [];
  778. foreach ($searchFields as $field) {
  779. $options = $response['suggest'][$field . '_suggest'][0]['options'] ?? [];
  780. foreach ($options as $opt) {
  781. $results[] = [
  782. 'text' => $opt['text'] ?? '',
  783. 'source' => $field, // 添加来源字段
  784. 'score' => $opt['_score'] ?? 0,
  785. // 可选:添加文档信息
  786. 'doc_id' => $opt['_id'] ?? null,
  787. 'doc_source' => $opt['_source'] ?? null,
  788. ];
  789. }
  790. }
  791. // 按分数排序
  792. usort($results, function ($a, $b) {
  793. return $b['score'] <=> $a['score'];
  794. });
  795. return $results;
  796. }
  797. }