modelService = $model;
        $this->sentenceService = $sent;
        $this->openAIService = $openAI;
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Options read here:
     *  - list:  print every book_name with its row count and exit.
     *  - model: id of the AI model used for alignment (and as editor of saved rows).
     *  - fresh: drop the cached cursor before starting.
     *  - book / para: restrict the run to one book_name / one cs_para.
     *
     * Each (book_name, cs_para) pair is one processing unit. Finished units are
     * remembered in the cache so an interrupted run can be resumed.
     *
     * @return int 0 on success, 1 when the requested model cannot be found
     */
    public function handle()
    {
        if ($this->option('list')) {
            // Alias the aggregate explicitly so the column name read below
            // does not depend on how the database driver names `count(*)`.
            $result = RelatedParagraph::whereNotNull('book_name')
                ->groupBy('book_name')
                ->selectRaw('book_name, count(*) as count')
                ->get();
            foreach ($result as $value) {
                $this->info($value['book_name'].'['.$value['count'].']');
            }

            return 0;
        }

        if ($this->option('model')) {
            $this->model = $this->modelService->getModelById($this->option('model'));
            // Per the original author's note: getModelById always returns an
            // AiModelResource; when nothing is found its underlying resource
            // is null, so that is what has to be checked.
            if (empty($this->model->resource)) {
                $this->error('no model found id='.$this->option('model'));

                return 1;
            }
            $this->info("model:{$this->model['model']}");
        }

        if ($this->option('fresh')) {
            Cache::forget(self::CACHE_KEY);
            $this->info('Cleared cached cursor.');
        }

        // A full run is one without book/para filters; only then is the
        // checkpoint cache cleared at the end.
        $isFullRun = ! $this->option('book') && ! $this->option('para');

        // Set of finished "book_name|cs_para" units restored from the cache.
        $done = Cache::get(self::CACHE_KEY, []);

        $channel = ChannelApi::getChannelByName('_System_commentary_');

        if ($this->option('book')) {
            $books = [['book_name' => $this->option('book')]];
        } else {
            // orderBy keeps the traversal order identical between runs so the cursor is stable.
            $books = RelatedParagraph::whereNotNull('book_name')
                ->where('book_name', '!=', '')
                ->where('cs_para', '>', 0)
                ->groupBy('book_name')
                ->orderBy('book_name')
                ->select('book_name')
                ->get()->toArray();
        }

        foreach ($books as $currBook) {
            if ($this->option('para')) {
                $paragraphs = [['cs_para' => $this->option('para')]];
            } else {
                // orderBy keeps the traversal order identical between runs so the cursor is stable.
                $paragraphs = RelatedParagraph::where('book_name', $currBook['book_name'])
                    ->where('cs_para', '>', 0)
                    ->groupBy('cs_para')
                    ->orderBy('cs_para')
                    ->select('cs_para')
                    ->get()->toArray();
            }

            foreach ($paragraphs as $paragraph) {
                // Stable cursor: book_name|cs_para identifies one processing unit.
                $cursor = $currBook['book_name'].'|'.$paragraph['cs_para'];

                // Units already finished are skipped so an interrupted run can resume.
                if (isset($done[$cursor])) {
                    continue;
                }

                $message = 'ai commentary '.$currBook['book_name'].'-'.$paragraph['cs_para'];
                $this->info($message);

                $rows = RelatedParagraph::where('book_name', $currBook['book_name'])
                    ->where('cs_para', $paragraph['cs_para'])
                    ->where('book_id', '>', 0)
                    ->orderBy('book_id')
                    ->orderBy('para')
                    ->get();

                // Group paragraphs by book, and books by type (pāḷi / aṭṭhakathā / ṭīkā).
                $pcdBooks = [];
                $type = [];
                foreach ($rows as $rBook) {
                    // array_key_exists (not isset) so a book whose type resolved
                    // to null is looked up only once instead of on every row.
                    if (! array_key_exists($rBook->book_id, $pcdBooks)) {
                        $bookType = $this->getBookType($rBook->book_id);
                        $pcdBooks[$rBook->book_id] = $bookType;
                        if ($bookType !== null) {
                            $type[$bookType][$rBook->book_id] = [];
                        }
                    }
                    $currType = $pcdBooks[$rBook->book_id];
                    if ($currType === null) {
                        // Books without a recognised type are never aligned.
                        continue;
                    }
                    $type[$currType][$rBook->book_id][] = ['book' => $rBook->book, 'para' => $rBook->para];
                }

                // Loop variables are named distinctly from the outer
                // $paragraphs/$paragraph so the outer iteration state is not clobbered.
                foreach ($type as $keyType => $info) {
                    Log::debug($keyType);
                    foreach ($info as $bookId => $bookParagraphs) {
                        Log::debug($bookId);
                        foreach ($bookParagraphs as $bookParagraph) {
                            Log::debug($bookParagraph['book'].'-'.$bookParagraph['para']);
                        }
                    }
                }

                $hasPali = $this->hasData($type, 'pāḷi');
                $hasAtta = $this->hasData($type, 'aṭṭhakathā');
                $hasTika = $this->hasData($type, 'ṭīkā');

                // Built once per unit and shared by both steps below. Previously it
                // was only assigned inside the pāḷi step, so the ṭīkā step could see
                // an undefined value or the previous unit's sentences.
                $attaJson = $hasAtta ? $this->collectSentences($type['aṭṭhakathā']) : [];

                // pāḷi <-> aṭṭhakathā
                if ($hasPali && $hasAtta) {
                    $paliJson = $this->collectSentences($type['pāḷi']);
                    // LLM alignment, then write to the database.
                    $aligned = $this->textAlign($paliJson, $attaJson);
                    $this->save($aligned, $channel);
                }

                // aṭṭhakathā <-> ṭīkā, one LLM request per ṭīkā book.
                if ($hasAtta && $hasTika) {
                    // Keyed by sentence id so results from several books merge in O(1).
                    $tikaResult = [];
                    foreach ($type['ṭīkā'] as $tikaParagraphs) {
                        $tikaJson = $this->collectSentences([$tikaParagraphs]);
                        $aligned = $this->textAlign($attaJson, $tikaJson);

                        // Merge new and old data: add when absent, otherwise append the commentary.
                        foreach ($aligned as $new) {
                            $sentenceId = $new['id'];
                            if (! isset($tikaResult[$sentenceId])) {
                                $tikaResult[$sentenceId] = $new;

                                continue;
                            }
                            if (isset($new['commentary']) && is_array($new['commentary'])) {
                                // The first occurrence may have had no commentary at all.
                                $existing = (array) ($tikaResult[$sentenceId]['commentary'] ?? []);
                                $tikaResult[$sentenceId]['commentary'] = array_merge($existing, $new['commentary']);
                            }
                        }
                    }
                    $this->save(array_values($tikaResult), $channel);
                }

                // Mark the unit only after all its writes are done, so an
                // interruption in the middle never causes it to be skipped.
                $done[$cursor] = true;
                Cache::put(self::CACHE_KEY, $done, now()->addHours(24));
            }
        }

        // A full traversal finished normally: clear the checkpoint cache.
        if ($isFullRun) {
            Cache::forget(self::CACHE_KEY);
        }

        return 0;
    }

    /**
     * Flatten the sentences of every paragraph of the given books into one list.
     *
     * @param  array  $books  list/map of books, each a list of ['book' => ..., 'para' => ...]
     * @return array list of ['id' => ..., 'content' => ...] as produced by getParaContent()
     */
    private function collectSentences(array $books): array
    {
        $sentences = [];
        foreach ($books as $bookParagraphs) {
            foreach ($bookParagraphs as $paraData) {
                $sentences = array_merge(
                    $sentences,
                    $this->getParaContent($paraData['book'], $paraData['para'])
                );
            }
        }

        return $sentences;
    }

    /**
     * Tell whether $typeData holds at least one paragraph for $typeName.
     * Logs a warning when it does not.
     *
     * @param  array  $typeData  map of type name => (book id => paragraphs)
     * @param  string  $typeName
     * @return bool
     */
    private function hasData($typeData, $typeName): bool
    {
        if (
            isset($typeData[$typeName]) &&
            $this->getParagraphNumber($typeData[$typeName]) !== 0
        ) {
            return true;
        }

        Log::warning($typeName.' data is missing');

        return false;
    }

    /**
     * Count the paragraphs of all books of one type.
     *
     * @param  mixed  $type  map of book id => list of paragraphs; anything else counts as 0
     * @return int
     */
    private function getParagraphNumber($type): int
    {
        if (! is_array($type)) {
            return 0;
        }

        $count = 0;
        foreach ($type as $paragraphs) {
            $count += count($paragraphs);
        }

        return $count;
    }

    /**
     * Resolve the type tag ('pāḷi', 'aṭṭhakathā' or 'ṭīkā') of a book.
     *
     * Looks up the BookTitle by sn, then the PaliText uid of its book/paragraph,
     * then the tags attached to that uid.
     *
     * @param  mixed  $bookId  value matched against BookTitle.sn
     * @return string|null the first matching tag name, or null when the book,
     *                     its text or a matching tag cannot be found
     */
    private function getBookType($bookId): ?string
    {
        $bookTitle = BookTitle::where('sn', $bookId)->first();
        if ($bookTitle === null) {
            return null;
        }

        $paliTextUuid = PaliText::where('book', $bookTitle->book)
            ->where('paragraph', $bookTitle->paragraph)
            ->value('uid');
        if ($paliTextUuid === null) {
            // Without this guard the next query would become `anchor_id IS NULL`.
            return null;
        }

        // pluck() yields plain tag ids rather than a collection of models.
        $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
        $tags = Tag::whereIn('id', $tagIds)->select('name')->get();
        foreach ($tags as $tag) {
            if (in_array($tag->name, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
                return $tag->name;
            }
        }

        return null;
    }

    /**
     * Load the sentences of one paragraph, ordered by word_begin.
     *
     * @param  mixed  $book
     * @param  mixed  $para
     * @return array list of ['id' => "book-para-begin-end", 'content' => markdown];
     *               empty when the paragraph has no sentences
     */
    private function getParaContent($book, $para): array
    {
        $sentenceService = app(SearchPaliDataService::class);
        $sentences = PaliSentence::where('book', $book)
            ->where('paragraph', $para)
            ->orderBy('word_begin')
            ->get();

        // get() always returns a collection, so an empty result simply
        // produces an empty list (callers array_merge the return value).
        $json = [];
        foreach ($sentences as $sentence) {
            $content = $sentenceService->getSentenceContent($book, $para, $sentence->word_begin, $sentence->word_end);
            $id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
            $json[] = ['id' => $id, 'content' => $content['markdown']];
        }

        return $json;
    }

    /**
     * Replace each item's id with its position in the list.
     *
     * @param  array  $input  list of associative arrays
     * @return array re-indexed list whose items carry 'id' => original key
     */
    private function arrayIndexed(array $input): array
    {
        $output = [];
        foreach ($input as $key => $value) {
            $value['id'] = $key;
            $output[] = $value;
        }

        return $output;
    }

    /**
     * Reverse arrayIndexed() on an LLM answer: restore the real sentence ids.
     *
     * NOTE(review): entries are matched to $original by their position ($key) in
     * the answer, not by the 'id' the model echoes back — confirm that the prompt
     * guarantees one output line per input sentence, in order.
     *
     * @param  array  $input  parsed answer lines
     * @param  array  $original  sentences sent as "pali" (before arrayIndexed)
     * @param  array  $commentary  sentences sent as "commentary" (before arrayIndexed)
     * @return array answer lines with real ids; commentary references that cannot
     *               be resolved are dropped
     */
    private function arrayUnIndexed(array $input, array $original, array $commentary): array
    {
        $output = [];
        foreach ($input as $key => $value) {
            if (! is_array($value) || ! isset($original[$key])) {
                Log::warning('no id');

                continue;
            }

            $value['id'] = $original[$key]['id'];

            if (isset($value['commentary'])) {
                $resolved = [];
                if (is_array($value['commentary'])) {
                    foreach ($value['commentary'] as $n) {
                        // Only scalar indexes are valid array offsets; unknown
                        // references are dropped instead of becoming empty ids.
                        if ((is_int($n) || is_string($n)) && isset($commentary[$n])) {
                            $resolved[] = $commentary[$n]['id'];
                        }
                    }
                }
                $value['commentary'] = $resolved;
            }

            $output[] = $value;
        }

        return $output;
    }

    /**
     * Ask the configured model to align $commentary sentences to $original sentences.
     *
     * Both lists are sent as JSONL with positional ids; the answer is parsed and
     * mapped back to real sentence ids by arrayUnIndexed().
     *
     * @param  array  $original  list of ['id' => ..., 'content' => ...]
     * @param  array  $commentary  list of ['id' => ..., 'content' => ...]
     * @return array aligned entries; empty when no model is set, there is nothing
     *               to align, or the answer could not be parsed
     */
    private function textAlign(array $original, array $commentary): array
    {
        if (! $this->model) {
            Log::error('model is invalid');

            return [];
        }

        // Nothing can be aligned when either side is empty; returning here also
        // keeps $totalSentences from being a zero divisor further down.
        if (count($original) === 0 || count($commentary) === 0) {
            Log::warning('no sentences to align');

            return [];
        }

        $originalSn = $this->arrayIndexed($original);
        $commentarySn = $this->arrayIndexed($commentary);
        $originalText = "```jsonl\n".LlmResponseParser::jsonl_encode($originalSn)."\n```";
        $commentaryText = "```jsonl\n".LlmResponseParser::jsonl_encode($commentarySn)."\n```";
        Log::debug('ai request', [
            'original' => $originalText,
            'commentary' => $commentaryText,
        ]);

        $totalSentences = count($original) + count($commentary);
        // Token budget: running per-sentence estimate with 50% headroom.
        $maxTokens = (int) ($this->tokensPerSentence * $totalSentences * 1.5);
        $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
        Log::debug('requesting…… '.$this->model['model']);

        $startAt = time();
        $response = $this->openAIService->setApiUrl($this->model['url'])
            ->setModel($this->model['model'])
            ->setApiKey($this->model['key'])
            ->setSystemPrompt($this->prompt)
            ->setTemperature(0.0)
            ->setStream(false)
            ->setMaxToken($maxTokens)
            ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
        $completeAt = time();

        $answer = $response['choices'][0]['message']['content'] ?? '[]';
        Log::debug('ai response', ['data' => $answer]);

        $message = ($completeAt - $startAt).'s';
        if (isset($response['usage']['completion_tokens'])) {
            Log::debug('usage', $response['usage']);
            $message .= ' completion_tokens:'.$response['usage']['completion_tokens'];
            // Raise the per-sentence estimate when this answer needed more tokens.
            $curr = (int) ($response['usage']['completion_tokens'] / $totalSentences);
            if ($curr > $this->tokensPerSentence) {
                $this->tokensPerSentence = $curr;
            }
        }
        $this->info($message);

        $json = [];
        if (is_string($answer)) {
            $json = LlmResponseParser::jsonl($answer);
            $json = $this->arrayUnIndexed($json, $original, $commentary);
            Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
        }
        if (count($json) === 0) {
            Log::error('jsonl is empty');
        }

        return $json;
    }

    /**
     * Persist aligned commentary as sentence content in the given channel.
     *
     * Each commentary id is stored as a `{{id}}` placeholder, one per line.
     *
     * @param  mixed  $json  list of ['id' => 'book-para-begin-end', 'commentary' => [...]]
     * @param  mixed  $channel  object exposing uid, lang and status
     * @return bool false when $json is not an array, true otherwise
     */
    private function save($json, $channel): bool
    {
        if (! is_array($json)) {
            Log::warning('llm return null');

            return false;
        }

        foreach ($json as $sentence) {
            if (! isset($sentence['id'], $sentence['commentary']) || ! is_array($sentence['commentary'])) {
                continue;
            }

            // A sentence id is "book-paragraph-wordStart-wordEnd".
            $sentId = explode('-', (string) $sentence['id']);
            if (count($sentId) !== 4) {
                Log::warning('invalid sentence id', ['id' => $sentence['id']]);

                continue;
            }

            $placeholders = [];
            foreach ($sentence['commentary'] as $n) {
                if (is_array($n) && isset($n['id'])) {
                    $n = $n['id'];
                }
                // Skip empty or malformed references instead of writing "{{}}" or blank lines.
                if (is_string($n) && $n !== '') {
                    $placeholders[] = '{{'.$n.'}}';
                }
            }
            if (count($placeholders) === 0) {
                continue;
            }

            $this->sentenceService->save(
                [
                    'book_id' => $sentId[0],
                    'paragraph' => $sentId[1],
                    'word_start' => $sentId[2],
                    'word_end' => $sentId[3],
                    'channel_uid' => $channel->uid,
                    'content' => implode("\n", $placeholders),
                    'lang' => $channel->lang,
                    'status' => $channel->status,
                    'editor_uid' => $this->model['uid'],
                ]
            );
            $this->info($sentence['id'].' saved');
        }

        return true;
    }
}