IndexTerm.php 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. <?php
  2. namespace App\Console\Commands;
  3. use App\Models\DhammaTerm;
  4. use App\Services\OpenSearchService;
  5. use App\Services\TermService;
  6. use Illuminate\Console\Command;
  7. use Illuminate\Support\Facades\Cache;
  8. use Illuminate\Support\Facades\Log;
  /**
   * Artisan command that indexes DhammaTerm records into OpenSearch.
   *
   * The command is resumable: during a full (non --word, non --test) run it
   * periodically stores the id of the last successfully indexed term in the
   * cache, and a later run continues after that id. The stored cursor is
   * removed once a full run completes, or when --fresh is passed.
   */
  class IndexTerm extends Command
  {
      // Multi-line signature: Laravel ignores the whitespace between option definitions.
      protected $signature = 'opensearch:index-term
                              {--test}
                              {--word= : 指定单个词条进行索引,省略则索引全部}
                              {--fresh : 清除缓存断点,从头开始}';

      protected $description = 'Index Term data into OpenSearch(可重入:中断后重跑自动跳过已索引的词条)';

      /** Cache key holding the id of the last successfully indexed term. */
      private const CACHE_KEY = 'index-term:cursor';

      /** Lifetime of the stored cursor, in hours. */
      private const CURSOR_TTL_HOURS = 48;

      /** Report progress and persist the cursor once every this many terms. */
      private const CHECKPOINT_INTERVAL = 10;

      /** When true, documents are built and their title is printed, but nothing is sent to OpenSearch. */
      private bool $isTest = false;

      public function __construct(
          protected OpenSearchService $openSearchService,
          protected TermService $termService,
      ) {
          parent::__construct();
      }

      /**
       * Entry point of the command.
       *
       * @return int 0 on success, 1 when the connection test fails or an exception is raised
       */
      public function handle(): int
      {
          $word = $this->option('word');
          $hasWord = is_string($word) && $word !== '';

          if ($this->option('test')) {
              $this->isTest = true;
              $this->info('test mode');
          }

          if ($this->option('fresh')) {
              Cache::forget(self::CACHE_KEY);
              $this->info('Cleared cached cursor.');
          }

          // The cursor describes the progress of a real, full run only. A single-word
          // run or a dry run must neither overwrite nor clear it, otherwise an
          // interrupted full run would resume from the wrong place.
          $useCheckpoint = ! $hasWord && ! $this->isTest;

          try {
              [$connected, $message] = $this->openSearchService->testConnection();
              if (! $connected) {
                  $this->error($message);
                  Log::error($message);

                  return 1;
              }

              // Order by the auto-increment id so the cursor stays stable
              // (updated_at may change while the command is running).
              $terms = DhammaTerm::select(['id', 'guid', 'word'])->orderBy('id');
              if ($hasWord) {
                  $terms = $terms->where('word', $word);
              }

              // Resume from the cached cursor: skip records handled by a previous run.
              $lastId = Cache::get(self::CACHE_KEY);
              if ($lastId && ! $hasWord) {
                  $terms = $terms->where('id', '>', $lastId);
                  $this->info("Resuming after id={$lastId}");
              }

              $total = $terms->count();
              $this->info("terms to index: {$total}");

              $curr = 0;
              foreach ($terms->cursor() as $term) {
                  // Index first, checkpoint afterwards: the stored id must always
                  // refer to a term that has actually been processed.
                  $this->indexTerm($term->guid);
                  $curr++;

                  if ($curr % self::CHECKPOINT_INTERVAL === 0) {
                      // Rows may be inserted after count(); keep the figure within 0..100
                      // and never divide by zero.
                      $percent = $total > 0 ? min(100, intdiv($curr * 100, $total)) : 100;
                      $this->info("[{$percent}%]-{$curr}/{$total} {$term->word}");

                      if ($useCheckpoint) {
                          Cache::put(self::CACHE_KEY, $term->id, now()->addHours(self::CURSOR_TTL_HOURS));
                      }
                  }
              }

              // Full run completed: the cursor is no longer needed.
              if ($useCheckpoint) {
                  Cache::forget(self::CACHE_KEY);
              }
              $this->info("index-term finished. total: {$curr}");

              return 0;
          } catch (\Exception $e) {
              $this->error('Failed to index Term data: '.$e->getMessage());
              Log::error('Failed to index Term data', ['error' => $e]);

              return 1;
          }
      }

      /**
       * Build the document for a single term and write it to OpenSearch.
       *
       * Document layout:
       *   title.text.pali / title.text.zh       - full-text search
       *   title.suggest.pali / title.suggest.zh - suggestions
       *   content.text.zh                       - plain-text body
       *   content.display                       - body as returned by the term service
       *
       * In test mode the document is built but only its Pali title is printed.
       *
       * @param string $id guid of the DhammaTerm
       */
      protected function indexTerm(string $id): void
      {
          $termData = $this->termService->find($id, 'text');
          if (empty($termData)) {
              // NOTE(review): assumes find() yields an empty value for a missing term - confirm.
              $this->warn("term not found, skipped: {$id}");

              return;
          }

          $channelName = $termData['channel']['name'] ?? '';
          $studioName = $termData['studio']['realName'] ?? '';
          $content = $termData['html'] ?? $termData['meaning'] ?? '';
          $note = $termData['note'] ?? '';

          $tags = array_map(
              static fn (string $category): string => "category:{$category}",
              $this->extractCategories($note),
          );
          $quality = $this->extractFirstQuality($note);
          if ($quality !== '') {
              $tags[] = "quality:{$quality}";
          }

          $document = [
              'id' => "term_{$id}",
              'resource_id' => $id,
              'resource_type' => 'term',
              'title' => [
                  'text' => [
                      'pali' => $termData['word'],
                      'zh' => $termData['meaning'],
                  ],
                  'suggest' => [
                      'pali' => [$termData['word']],
                      'zh' => [$termData['meaning']],
                  ],
              ],
              'summary' => [
                  'text' => $termData['summary'] ?? '',
              ],
              'content' => [
                  // TODO: detect the content language and route the plain text to
                  // content.text.pali or content.text.zh; for now every language is
                  // stored under zh.
                  'text' => ['zh' => strip_tags((string) $content)],
                  // Unstripped body, kept for display.
                  'display' => $content,
              ],
              'bold_single' => [$termData['meaning'], $termData['word']],
              'related_id' => $termData['word'],
              'category' => null,
              'tags' => $tags,
              'language' => $termData['language'],
              'updated_at' => now()->toIso8601String(),
              'path' => $studioName."/{$channelName}",
              'metadata' => ['channel' => $termData['channel_id']],
          ];

          if ($this->isTest) {
              $this->info($document['title']['text']['pali']);

              return;
          }

          $this->openSearchService->create($document['id'], $document);
      }

      /**
       * Collect the values of every {{category|...}} tag found in the text.
       *
       * @return list<string> trimmed, non-empty category names in order of appearance
       */
      private function extractCategories(string $content): array
      {
          if ($content === '') {
              return [];
          }

          preg_match_all('/\{\{category\|([^}]+)\}\}/u', $content, $matches);

          // Filter on the empty string explicitly so that a category named "0" is kept.
          return array_values(array_filter(
              array_map(trim(...), $matches[1] ?? []),
              static fn (string $item): bool => $item !== '',
          ));
      }

      /**
       * Return the trimmed value of the first {{quality|...}} tag, or '' when there is none.
       */
      private function extractFirstQuality(string $content): string
      {
          if ($content === '') {
              return '';
          }

          preg_match('/\{\{quality\|([^}]+)\}\}/u', $content, $matches);

          return isset($matches[1]) ? trim($matches[1]) : '';
      }
  }