|
|
@@ -2,33 +2,34 @@
|
|
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
|
|
-use Illuminate\Console\Command;
|
|
|
-use Illuminate\Support\Facades\Log;
|
|
|
-
|
|
|
-use App\Models\RelatedParagraph;
|
|
|
+use App\Helpers\LlmResponseParser;
|
|
|
+use App\Http\Api\ChannelApi;
|
|
|
+use App\Http\Resources\AiModelResource;
|
|
|
use App\Models\BookTitle;
|
|
|
+use App\Models\PaliSentence;
|
|
|
use App\Models\PaliText;
|
|
|
-use App\Models\TagMap;
|
|
|
+use App\Models\RelatedParagraph;
|
|
|
use App\Models\Tag;
|
|
|
-use App\Models\PaliSentence;
|
|
|
-
|
|
|
-use App\Services\SearchPaliDataService;
|
|
|
-use App\Services\OpenAIService;
|
|
|
+use App\Models\TagMap;
|
|
|
use App\Services\AIModelService;
|
|
|
+use App\Services\OpenAIService;
|
|
|
+use App\Services\SearchPaliDataService;
|
|
|
use App\Services\SentenceService;
|
|
|
-
|
|
|
-use App\Helpers\LlmResponseParser;
|
|
|
-use App\Http\Api\ChannelApi;
|
|
|
+use Illuminate\Console\Command;
|
|
|
+use Illuminate\Support\Facades\Cache;
|
|
|
+use Illuminate\Support\Facades\Log;
|
|
|
|
|
|
class UpgradeSystemCommentary extends Command
|
|
|
{
|
|
|
/**
|
|
|
* The name and signature of the console command.
|
|
|
* php artisan upgrade:sys.commentary
|
|
|
+ *
|
|
|
* @var string
|
|
|
*/
|
|
|
- protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=}';
|
|
|
- protected $prompt = <<<md
|
|
|
+ protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=} {--fresh : 清除缓存断点,从头开始}';
|
|
|
+
|
|
|
+ protected $prompt = <<<'md'
|
|
|
你是一个注释对照阅读助手。
|
|
|
pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
|
|
|
commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
|
|
|
@@ -54,17 +55,27 @@ class UpgradeSystemCommentary extends Command
|
|
|
{"id":0,"commentary":[0,1]}
|
|
|
{"id":1,"commentary":[2]}
|
|
|
md;
|
|
|
+
|
|
|
/**
|
|
|
* The console command description.
|
|
|
*
|
|
|
* @var string
|
|
|
*/
|
|
|
protected $description = 'Command description';
|
|
|
+
|
|
|
+ // 缓存键:记录已完成的 "book_name|cs_para" 集合,中断后重跑自动跳过,24h 过期(与 Cache::put 的 addHours(24) 一致)
|
|
|
+ private const CACHE_KEY = 'upgrade:sys.commentary:done';
|
|
|
+
|
|
|
protected $sentenceService;
|
|
|
+
|
|
|
protected $modelService;
|
|
|
+
|
|
|
protected $openAIService;
|
|
|
- protected $model;
|
|
|
+
|
|
|
+ protected AiModelResource $model;
|
|
|
+
|
|
|
protected $tokensPerSentence = 0;
|
|
|
+
|
|
|
/**
|
|
|
* Create a new command instance.
|
|
|
*
|
|
|
@@ -94,15 +105,32 @@ md;
|
|
|
->selectRaw('book_name,count(*)')
|
|
|
->get();
|
|
|
foreach ($result as $key => $value) {
|
|
|
- $this->info($value['book_name'] . "[" . $value['count'] . "]");
|
|
|
+ $this->info($value['book_name'].'['.$value['count'].']');
|
|
|
}
|
|
|
+
|
|
|
return 0;
|
|
|
}
|
|
|
if ($this->option('model')) {
|
|
|
$this->model = $this->modelService->getModelById($this->option('model'));
|
|
|
+ // getModelById 始终返回 AiModelResource,未查到时其底层 resource 为 null,需据此判断
|
|
|
+ if (empty($this->model->resource)) {
|
|
|
+ $this->error('no model found id='.$this->option('model'));
|
|
|
+
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
$this->info("model:{$this->model['model']}");
|
|
|
}
|
|
|
|
|
|
+ if ($this->option('fresh')) {
|
|
|
+ Cache::forget(self::CACHE_KEY);
|
|
|
+ $this->info('Cleared cached cursor.');
|
|
|
+ }
|
|
|
+
|
|
|
+ // 是否为完整遍历(未指定 book/para),仅此情形在结束后清空断点缓存
|
|
|
+ $isFullRun = ! $this->option('book') && ! $this->option('para');
|
|
|
+
|
|
|
+ // 从缓存恢复已完成的 (book_name, cs_para) 集合,作为重入时的稳定游标
|
|
|
+ $done = Cache::get(self::CACHE_KEY, []);
|
|
|
|
|
|
$channel = ChannelApi::getChannelByName('_System_commentary_');
|
|
|
|
|
|
@@ -110,9 +138,12 @@ md;
|
|
|
if ($this->option('book')) {
|
|
|
$books[] = ['book_name' => $this->option('book')];
|
|
|
} else {
|
|
|
+ // orderBy 保证每次遍历顺序一致,游标才稳定
|
|
|
$books = RelatedParagraph::whereNotNull('book_name')
|
|
|
+ ->where('book_name', '!=', '')
|
|
|
->where('cs_para', '>', 0)
|
|
|
->groupBy('book_name')
|
|
|
+ ->orderBy('book_name')
|
|
|
->select('book_name')
|
|
|
->get()->toArray();
|
|
|
}
|
|
|
@@ -121,16 +152,24 @@ md;
|
|
|
if ($this->option('para')) {
|
|
|
$paragraphs[] = ['cs_para' => $this->option('para')];
|
|
|
} else {
|
|
|
+ // orderBy 保证每次遍历顺序一致,游标才稳定
|
|
|
$paragraphs = RelatedParagraph::where('book_name', $currBook['book_name'])
|
|
|
->where('cs_para', '>', 0)
|
|
|
->groupBy('cs_para')
|
|
|
+ ->orderBy('cs_para')
|
|
|
->select('cs_para')
|
|
|
->get()->toArray();
|
|
|
}
|
|
|
foreach ($paragraphs as $key => $paragraph) {
|
|
|
- $message = 'ai commentary ' . $currBook['book_name'] . '-' . $paragraph['cs_para'];
|
|
|
+ // 稳定游标:以 book_name|cs_para 唯一标识一个处理单元
|
|
|
+ $cursor = $currBook['book_name'].'|'.$paragraph['cs_para'];
|
|
|
+ // 已完成的单元直接跳过,实现中断后重入续跑
|
|
|
+ if (isset($done[$cursor])) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ $message = 'ai commentary '.$currBook['book_name'].'-'.$paragraph['cs_para'];
|
|
|
$this->info($message);
|
|
|
- Log::info($message);
|
|
|
$result = RelatedParagraph::where('book_name', $currBook['book_name'])
|
|
|
->where('cs_para', $paragraph['cs_para'])
|
|
|
->where('book_id', '>', 0)
|
|
|
@@ -140,11 +179,11 @@ md;
|
|
|
$pcdBooks = [];
|
|
|
$type = [];
|
|
|
foreach ($result as $rBook) {
|
|
|
- # 把段落整合成书。有几本书就有几条输出纪录
|
|
|
- if (!isset($pcdBooks[$rBook->book_id])) {
|
|
|
+ // 把段落整合成书。有几本书就有几条输出记录
|
|
|
+ if (! isset($pcdBooks[$rBook->book_id])) {
|
|
|
$bookType = $this->getBookType($rBook->book_id);
|
|
|
$pcdBooks[$rBook->book_id] = $bookType;
|
|
|
- if (!isset($type[$bookType])) {
|
|
|
+ if (! isset($type[$bookType])) {
|
|
|
$type[$bookType] = [];
|
|
|
}
|
|
|
$type[$bookType][$rBook->book_id] = [];
|
|
|
@@ -156,20 +195,20 @@ md;
|
|
|
Log::debug($keyType);
|
|
|
foreach ($info as $bookId => $paragraphs) {
|
|
|
Log::debug($bookId);
|
|
|
- foreach ($paragraphs as $paragraph) {
|
|
|
- Log::debug($paragraph['book'] . '-' . $paragraph['para']);
|
|
|
+ foreach ($paragraphs as $paragraph) {
|
|
|
+ Log::debug($paragraph['book'].'-'.$paragraph['para']);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- //处理pali
|
|
|
+ // 处理pali
|
|
|
if (
|
|
|
$this->hasData($type, 'pāḷi') &&
|
|
|
$this->hasData($type, 'aṭṭhakathā')
|
|
|
) {
|
|
|
$paliJson = [];
|
|
|
foreach ($type['pāḷi'] as $keyBook => $paragraphs) {
|
|
|
- foreach ($paragraphs as $paraData) {
|
|
|
+ foreach ($paragraphs as $paraData) {
|
|
|
$sentData = $this->getParaContent($paraData['book'], $paraData['para']);
|
|
|
$paliJson = array_merge($paliJson, $sentData);
|
|
|
}
|
|
|
@@ -177,24 +216,24 @@ md;
|
|
|
|
|
|
$attaJson = [];
|
|
|
foreach ($type['aṭṭhakathā'] as $keyBook => $paragraphs) {
|
|
|
- foreach ($paragraphs as $paraData) {
|
|
|
+ foreach ($paragraphs as $paraData) {
|
|
|
$sentData = $this->getParaContent($paraData['book'], $paraData['para']);
|
|
|
$attaJson = array_merge($attaJson, $sentData);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- //llm 对齐
|
|
|
+ // llm 对齐
|
|
|
$result = $this->textAlign($paliJson, $attaJson);
|
|
|
- //写入db
|
|
|
+ // 写入db
|
|
|
$this->save($result, $channel);
|
|
|
}
|
|
|
|
|
|
- //处理义注
|
|
|
+ // 处理义注
|
|
|
if (
|
|
|
$this->hasData($type, 'aṭṭhakathā') &&
|
|
|
$this->hasData($type, 'ṭīkā')
|
|
|
) {
|
|
|
- $tikaResult = array();
|
|
|
+ $tikaResult = [];
|
|
|
foreach ($type['ṭīkā'] as $keyBook => $paragraphs) {
|
|
|
$tikaJson = [];
|
|
|
foreach ($paragraphs as $key => $paraData) {
|
|
|
@@ -202,9 +241,9 @@ md;
|
|
|
$tikaJson = array_merge($tikaJson, $sentData);
|
|
|
}
|
|
|
|
|
|
- //llm 对齐
|
|
|
+ // llm 对齐
|
|
|
$result = $this->textAlign($attaJson, $tikaJson);
|
|
|
- //将新旧数据合并 如果原来没有,就添加,有,就合并数据
|
|
|
+ // 将新旧数据合并 如果原来没有,就添加,有,就合并数据
|
|
|
foreach ($result as $new) {
|
|
|
$found = false;
|
|
|
foreach ($tikaResult as $key => $old) {
|
|
|
@@ -216,41 +255,56 @@ md;
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
- if (!$found) {
|
|
|
+ if (! $found) {
|
|
|
array_push($tikaResult, $new);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- //写入db
|
|
|
+ // 写入db
|
|
|
$this->save($tikaResult, $channel);
|
|
|
}
|
|
|
+
|
|
|
+ // 该处理单元全部写库完成后再标记游标,确保中途中断不会误跳过
|
|
|
+ $done[$cursor] = true;
|
|
|
+ Cache::put(self::CACHE_KEY, $done, now()->addHours(24));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // 完整遍历正常结束,清空断点缓存
|
|
|
+ if ($isFullRun) {
|
|
|
+ Cache::forget(self::CACHE_KEY);
|
|
|
+ }
|
|
|
+
|
|
|
return 0;
|
|
|
}
|
|
|
+
|
|
|
private function hasData($typeData, $typeName)
|
|
|
{
|
|
|
if (
|
|
|
- !isset($typeData[$typeName]) ||
|
|
|
+ ! isset($typeData[$typeName]) ||
|
|
|
$this->getParagraphNumber($typeData[$typeName]) === 0
|
|
|
) {
|
|
|
- Log::warning($typeName . ' data is missing');
|
|
|
+ Log::warning($typeName.' data is missing');
|
|
|
+
|
|
|
return false;
|
|
|
}
|
|
|
+
|
|
|
return true;
|
|
|
}
|
|
|
+
|
|
|
private function getParagraphNumber($type)
|
|
|
{
|
|
|
- if (!isset($type) || !is_array($type)) {
|
|
|
+ if (! isset($type) || ! is_array($type)) {
|
|
|
return 0;
|
|
|
}
|
|
|
$count = 0;
|
|
|
foreach ($type as $bookId => $paragraphs) {
|
|
|
$count += count($paragraphs);
|
|
|
}
|
|
|
+
|
|
|
return $count;
|
|
|
}
|
|
|
+
|
|
|
private function getBookType($bookId)
|
|
|
{
|
|
|
$bookTitle = BookTitle::where('sn', $bookId)->first();
|
|
|
@@ -262,6 +316,7 @@ md;
|
|
|
return $tag->name;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
return null;
|
|
|
}
|
|
|
|
|
|
@@ -272,33 +327,37 @@ md;
|
|
|
->where('paragraph', $para)
|
|
|
->orderBy('word_begin')
|
|
|
->get();
|
|
|
- if (!$sentences) {
|
|
|
+ if (! $sentences) {
|
|
|
return null;
|
|
|
}
|
|
|
$json = [];
|
|
|
foreach ($sentences as $key => $sentence) {
|
|
|
- $content = $sentenceService->getSentenceText($book, $para, $sentence->word_begin, $sentence->word_end);
|
|
|
+ $content = $sentenceService->getSentenceContent($book, $para, $sentence->word_begin, $sentence->word_end);
|
|
|
$id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
|
|
|
$json[] = ['id' => $id, 'content' => $content['markdown']];
|
|
|
}
|
|
|
+
|
|
|
return $json;
|
|
|
}
|
|
|
|
|
|
private function arrayIndexed(array $input): array
|
|
|
{
|
|
|
- $output = [];
|
|
|
+ $output = [];
|
|
|
foreach ($input as $key => $value) {
|
|
|
$value['id'] = $key;
|
|
|
$output[] = $value;
|
|
|
}
|
|
|
+
|
|
|
return $output;
|
|
|
}
|
|
|
+
|
|
|
private function arrayUnIndexed(array $input, array $original, array $commentary): array
|
|
|
{
|
|
|
- $output = [];
|
|
|
+ $output = [];
|
|
|
foreach ($input as $key => $value) {
|
|
|
- if (!isset($original[$key])) {
|
|
|
+ if (! isset($original[$key])) {
|
|
|
Log::warning('no id');
|
|
|
+
|
|
|
continue;
|
|
|
}
|
|
|
$value['id'] = $original[$key]['id'];
|
|
|
@@ -307,35 +366,39 @@ md;
|
|
|
if (isset($commentary[$n])) {
|
|
|
return $commentary[$n]['id'];
|
|
|
}
|
|
|
+
|
|
|
return '';
|
|
|
}, $value['commentary']);
|
|
|
$value['commentary'] = $newCommentary;
|
|
|
}
|
|
|
$output[] = $value;
|
|
|
}
|
|
|
+
|
|
|
return $output;
|
|
|
}
|
|
|
+
|
|
|
private function textAlign(array $original, array $commentary)
|
|
|
{
|
|
|
- if (!$this->model) {
|
|
|
+ if (! $this->model) {
|
|
|
Log::error('model is invalid');
|
|
|
+
|
|
|
return [];
|
|
|
}
|
|
|
- $originalSn = $this->arrayIndexed($original);
|
|
|
- $commentarySn = $this->arrayIndexed($commentary);
|
|
|
+ $originalSn = $this->arrayIndexed($original);
|
|
|
+ $commentarySn = $this->arrayIndexed($commentary);
|
|
|
|
|
|
- $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
|
|
|
- $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
|
|
|
+ $originalText = "```jsonl\n".LlmResponseParser::jsonl_encode($originalSn)."\n```";
|
|
|
+ $commentaryText = "```jsonl\n".LlmResponseParser::jsonl_encode($commentarySn)."\n```";
|
|
|
|
|
|
Log::debug('ai request', [
|
|
|
'original' => $originalText,
|
|
|
- 'commentary' => $commentaryText
|
|
|
+ 'commentary' => $commentaryText,
|
|
|
]);
|
|
|
|
|
|
$totalSentences = count($original) + count($commentary);
|
|
|
- $maxTokens = (int)($this->tokensPerSentence * $totalSentences * 1.5);
|
|
|
+ $maxTokens = (int) ($this->tokensPerSentence * $totalSentences * 1.5);
|
|
|
$this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
|
|
|
- Log::debug('requesting…… ' . $this->model['model']);
|
|
|
+ Log::debug('requesting…… '.$this->model['model']);
|
|
|
$startAt = time();
|
|
|
$response = $this->openAIService->setApiUrl($this->model['url'])
|
|
|
->setModel($this->model['model'])
|
|
|
@@ -348,12 +411,12 @@ md;
|
|
|
$completeAt = time();
|
|
|
$answer = $response['choices'][0]['message']['content'] ?? '[]';
|
|
|
Log::debug('ai response', ['data' => $answer]);
|
|
|
- $message = ($completeAt - $startAt) . 's';
|
|
|
+ $message = ($completeAt - $startAt).'s';
|
|
|
|
|
|
if (isset($response['usage']['completion_tokens'])) {
|
|
|
Log::debug('usage', $response['usage']);
|
|
|
- $message .= " completion_tokens:" . $response['usage']['completion_tokens'];
|
|
|
- $curr = (int)($response['usage']['completion_tokens'] / $totalSentences);
|
|
|
+ $message .= ' completion_tokens:'.$response['usage']['completion_tokens'];
|
|
|
+ $curr = (int) ($response['usage']['completion_tokens'] / $totalSentences);
|
|
|
if ($curr > $this->tokensPerSentence) {
|
|
|
$this->tokensPerSentence = $curr;
|
|
|
}
|
|
|
@@ -366,22 +429,21 @@ md;
|
|
|
Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
|
|
|
}
|
|
|
if (count($json) === 0) {
|
|
|
- Log::error("jsonl is empty");
|
|
|
+ Log::error('jsonl is empty');
|
|
|
}
|
|
|
|
|
|
return $json;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
private function save($json, $channel)
|
|
|
{
|
|
|
- if (!is_array($json)) {
|
|
|
+ if (! is_array($json)) {
|
|
|
Log::warning('llm return null');
|
|
|
+
|
|
|
return false;
|
|
|
}
|
|
|
foreach ($json as $key => $sentence) {
|
|
|
- if (!isset($sentence['commentary'])) {
|
|
|
+ if (! isset($sentence['commentary'])) {
|
|
|
continue;
|
|
|
}
|
|
|
$sentId = explode('-', $sentence['id']);
|
|
|
@@ -391,11 +453,11 @@ md;
|
|
|
is_array($arrCommentary) &&
|
|
|
count($arrCommentary) > 0
|
|
|
) {
|
|
|
- $content = array_map(function ($n) {
|
|
|
+ $content = array_map(function ($n) {
|
|
|
if (is_string($n)) {
|
|
|
- return '{{' . $n . '}}';
|
|
|
- } else if (is_array($n) && isset($n['id']) && is_string($n['id'])) {
|
|
|
- return '{{' . $n['id'] . '}}';
|
|
|
+ return '{{'.$n.'}}';
|
|
|
+ } elseif (is_array($n) && isset($n['id']) && is_string($n['id'])) {
|
|
|
+ return '{{'.$n['id'].'}}';
|
|
|
} else {
|
|
|
return '';
|
|
|
}
|
|
|
@@ -413,7 +475,7 @@ md;
|
|
|
'editor_uid' => $this->model['uid'],
|
|
|
]
|
|
|
);
|
|
|
- $this->info($sentence['id'] . ' saved');
|
|
|
+ $this->info($sentence['id'].' saved');
|
|
|
}
|
|
|
}
|
|
|
}
|