UpgradeSystemCommentary.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Cache;
  5. use Illuminate\Support\Facades\Log;
  6. use App\Helpers\LlmResponseParser;
  7. use App\Http\Api\ChannelApi;
  8. use App\Http\Resources\AiModelResource;
  9. use App\Models\BookTitle;
  10. use App\Models\PaliSentence;
  11. use App\Models\PaliText;
  12. use App\Models\RelatedParagraph;
  13. use App\Models\Tag;
  14. use App\Models\TagMap;
  15. use App\Services\AIModelService;
  16. use App\Services\OpenAIService;
  17. use App\Services\SearchPaliDataService;
  18. use App\Services\SentenceService;
  19. class UpgradeSystemCommentary extends Command
  20. {
  /**
   * The name and signature of the console command.
   *
   * Usage: php artisan upgrade:sys.commentary
   *
   * Options, as consumed by handle():
   *   --book      process only this book_name
   *   --para      process only this cs_para value
   *   --list      print every book_name with its row count and exit
   *   --model     id of the AI model passed to AIModelService::getModelById()
   *   --thinking  forwarded to the LLM client; only the literal string "true" turns it on
   *   --skip      comma-separated book_name values to skip; a trailing * means prefix match
   *   --fresh     forget the cached progress before starting
   *
   * @var string
   */
  protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=} {--thinking=} {--skip= : 跳过指定的 book_name,逗号分隔,支持前缀通配,如 abhi*,sn2} {--fresh : 清除缓存断点,从头开始}';

  /**
   * System prompt sent to the LLM by textAlign().
   *
   * The text is runtime data and is kept verbatim; the closing marker's
   * indentation is stripped from every line by PHP's flexible nowdoc syntax.
   *
   * @var string
   */
  protected $prompt = <<<'md'
  你是一个注释对照阅读助手。
  pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
  commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
  commentary里面的内容是对pali内容的注释
  commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
  你需要按照顺序将commentary中的句子与pali原文对照,。
  输出格式jsonl
  只输出pali数据
  在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
  不要输出content字段,只输出id,commentary字段
  直接输出jsonl数据,无需解释
  **关键规则:**
  1. 根据commentary中的句子的意思找到与pali对应的句子
  1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
  2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
  3. 有些pali原文句子可能没有对应的注释
  4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
  5. 同时保持pali的句子数量不变,不要增删
  6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
  **输出范例**
  {"id":0,"commentary":[0,1]}
  {"id":1,"commentary":[2]}
  md;

  /**
   * The console command description.
   *
   * @var string
   */
  protected $description = 'Align commentary sentences with the text they comment on using an LLM and save the mapping';

  // Cache key holding the set of finished "book_name|cs_para" units so an
  // interrupted run can skip them; handle() stores it with a 24-hour lifetime.
  private const CACHE_KEY = 'upgrade:sys.commentary:done';

  /** @var SentenceService Used by save() to persist aligned sentences. */
  protected $sentenceService;

  /** @var AIModelService Used by handle() to look up the model given by --model. */
  protected $modelService;

  /** @var OpenAIService LLM client used by textAlign(). */
  protected $openAIService;

  // Nullable with a default so that code checking for a missing model does not
  // hit an "accessed before initialization" error when --model is omitted.
  protected ?AiModelResource $model = null;

  // Highest completion-tokens-per-sentence ratio seen so far; textAlign()
  // uses it to size max_tokens for the next request.
  protected $tokensPerSentence = 0;

  // Deliberately left uninitialized: textAlign() uses isset() to tell
  // "--thinking not given" apart from an explicit true/false.
  protected bool $thinking;
  /**
   * Keep references to the injected services, then let the parent command
   * run its own constructor.
   *
   * @param AIModelService  $model  stored as $modelService
   * @param SentenceService $sent   stored as $sentenceService
   * @param OpenAIService   $openAI stored as $openAIService
   *
   * @return void
   */
  public function __construct(
      AIModelService $model,
      SentenceService $sent,
      OpenAIService $openAI
  ) {
      $this->openAIService = $openAI;
      $this->sentenceService = $sent;
      $this->modelService = $model;

      parent::__construct();
  }
  /**
   * Execute the console command.
   *
   * With --list, prints each book_name with its row count and exits.
   * Otherwise walks every (book_name, cs_para) unit, asks the LLM to align
   * pāḷi with aṭṭhakathā and aṭṭhakathā with each ṭīkā book, and saves the
   * result to the "_System_commentary_" channel. Finished units are recorded
   * in the cache so an interrupted run resumes where it stopped.
   *
   * @return int 0 on success, 1 when the model or the target channel is unavailable
   */
  public function handle()
  {
      if ($this->option('list')) {
          // Alias the aggregate so the column is called "count" on every database driver.
          $rows = RelatedParagraph::whereNotNull('book_name')
              ->groupBy('book_name')
              ->selectRaw('book_name, count(*) as count')
              ->get();
          foreach ($rows as $row) {
              $this->info($row['book_name'] . '[' . $row['count'] . ']');
          }

          return 0;
      }

      $modelId = $this->option('model');
      if ($modelId) {
          $this->model = $this->modelService->getModelById($modelId);
          // Per the original author's note, getModelById always returns an
          // AiModelResource; a failed lookup shows up as a null inner resource.
          if (empty($this->model->resource)) {
              $this->error('no model found id=' . $modelId);

              return 1;
          }
          $this->info("model:{$this->model['model']}");
      }

      if ($this->option('fresh')) {
          Cache::forget(self::CACHE_KEY);
          $this->info('Cleared cached cursor.');
      }

      if ($this->option('thinking')) {
          $this->thinking = $this->option('thinking') === 'true';
          $this->line('thinking is ' . $this->option('thinking'));
      }

      // textAlign() cannot work without a model; stop now instead of failing
      // on the first unit.
      if (! $modelId) {
          $this->error('no model specified, use --model=ID');

          return 1;
      }

      // A full run is one without --book/--para; only then is the progress
      // cache cleared at the end.
      $isFullRun = ! $this->option('book') && ! $this->option('para');

      // Finished "book_name|cs_para" units from a previous, interrupted run.
      $done = Cache::get(self::CACHE_KEY, []);

      // Skip rules: comma-separated, trailing * means prefix match, otherwise exact match.
      $skipPatterns = [];
      if ($this->option('skip')) {
          $skipPatterns = array_values(array_filter(array_map('trim', explode(',', $this->option('skip')))));
      }

      $channel = ChannelApi::getChannelByName('_System_commentary_');
      if (! $channel) {
          // save() reads uid/lang/status from the channel, so it must exist.
          $this->error('channel _System_commentary_ not found');

          return 1;
      }

      if ($this->option('book')) {
          $books = [['book_name' => $this->option('book')]];
      } else {
          // A fixed order keeps the traversal, and therefore the resume point, stable.
          $books = RelatedParagraph::whereNotNull('book_name')
              ->where('book_name', '!=', '')
              ->where('cs_para', '>', 0)
              ->groupBy('book_name')
              ->orderBy('book_name')
              ->select('book_name')
              ->get()->toArray();
      }

      foreach ($books as $currBook) {
          $bookName = $currBook['book_name'];

          // A skipped book is passed over even if the previous run stopped inside it.
          if ($this->shouldSkipBook($bookName, $skipPatterns)) {
              $this->info('skip book ' . $bookName);
              continue;
          }

          if ($this->option('para')) {
              $units = [['cs_para' => $this->option('para')]];
          } else {
              // A fixed order keeps the traversal, and therefore the resume point, stable.
              $units = RelatedParagraph::where('book_name', $bookName)
                  ->where('cs_para', '>', 0)
                  ->groupBy('cs_para')
                  ->orderBy('cs_para')
                  ->select('cs_para')
                  ->get()->toArray();
          }

          foreach ($units as $unit) {
              $csPara = $unit['cs_para'];

              // book_name|cs_para uniquely identifies one unit of work.
              $cursor = $bookName . '|' . $csPara;
              if (isset($done[$cursor])) {
                  continue;
              }

              $this->info('ai commentary ' . $bookName . '-' . $csPara);

              $rows = RelatedParagraph::where('book_name', $bookName)
                  ->where('cs_para', $csPara)
                  ->where('book_id', '>', 0)
                  ->orderBy('book_id')
                  ->orderBy('para')
                  ->get();

              $type = $this->groupParagraphsByType($rows);

              foreach ($type as $typeName => $booksOfType) {
                  Log::debug($typeName);
                  foreach ($booksOfType as $bookId => $bookParas) {
                      Log::debug($bookId);
                      foreach ($bookParas as $paraData) {
                          Log::debug($paraData['book'] . '-' . $paraData['para']);
                      }
                  }
              }

              // Loaded at most once per unit and shared by both alignments below.
              $attaJson = null;

              // pāḷi <- aṭṭhakathā
              if (
                  $this->hasData($type, 'pāḷi') &&
                  $this->hasData($type, 'aṭṭhakathā')
              ) {
                  $paliJson = $this->collectSentences($type['pāḷi']);
                  $attaJson = $this->collectSentences($type['aṭṭhakathā']);
                  $this->save($this->textAlign($paliJson, $attaJson), $channel);
              }

              // aṭṭhakathā <- ṭīkā, one LLM request per ṭīkā book
              if (
                  $this->hasData($type, 'aṭṭhakathā') &&
                  $this->hasData($type, 'ṭīkā')
              ) {
                  $attaJson ??= $this->collectSentences($type['aṭṭhakathā']);

                  // Keyed by sentence id so results from several ṭīkā books
                  // that target the same sentence are merged, not duplicated.
                  $tikaResult = [];
                  foreach ($type['ṭīkā'] as $tikaParas) {
                      $tikaJson = $this->collectSentences([$tikaParas]);
                      foreach ($this->textAlign($attaJson, $tikaJson) as $aligned) {
                          if (! isset($aligned['id'])) {
                              continue;
                          }
                          $sentenceId = $aligned['id'];
                          if (! isset($tikaResult[$sentenceId])) {
                              $tikaResult[$sentenceId] = $aligned;
                              continue;
                          }
                          if (isset($aligned['commentary']) && is_array($aligned['commentary'])) {
                              $existing = $tikaResult[$sentenceId]['commentary'] ?? [];
                              $tikaResult[$sentenceId]['commentary'] = array_merge(
                                  is_array($existing) ? $existing : [],
                                  $aligned['commentary']
                              );
                          }
                      }
                  }

                  $this->save(array_values($tikaResult), $channel);
              }

              // Mark the unit only after everything for it has been written,
              // so an interruption part-way never causes it to be skipped.
              $done[$cursor] = true;
              Cache::put(self::CACHE_KEY, $done, now()->addHours(24));
          }
      }

      // A full run that reaches the end has nothing left to resume.
      if ($isFullRun) {
          Cache::forget(self::CACHE_KEY);
      }

      return 0;
  }

  /**
   * Group RelatedParagraph rows by book type, then by book_id.
   *
   * The type of each book is looked up once via getBookType(). Books without
   * a recognised type are left out; handle() only reads the 'pāḷi',
   * 'aṭṭhakathā' and 'ṭīkā' groups.
   *
   * @param iterable $rows rows exposing book_id, book and para
   *
   * @return array<string, array<int|string, array<int, array{book: mixed, para: mixed}>>>
   */
  private function groupParagraphsByType($rows): array
  {
      $bookTypes = [];
      $grouped = [];
      foreach ($rows as $row) {
          $bookId = $row->book_id;
          // array_key_exists so that a cached null ("no type") is not looked up again.
          if (! array_key_exists($bookId, $bookTypes)) {
              $bookTypes[$bookId] = $this->getBookType($bookId);
          }
          $bookType = $bookTypes[$bookId];
          if ($bookType === null) {
              continue;
          }
          $grouped[$bookType][$bookId][] = ['book' => $row->book, 'para' => $row->para];
      }

      return $grouped;
  }

  /**
   * Load the sentences of every paragraph of the given books as one flat list.
   *
   * @param array<int|string, array<int, array{book: mixed, para: mixed}>> $books paragraphs per book
   *
   * @return array<int, array{id: string, content: mixed}>
   */
  private function collectSentences(array $books): array
  {
      $sentences = [];
      foreach ($books as $bookParas) {
          foreach ($bookParas as $paraData) {
              $sentences = array_merge(
                  $sentences,
                  $this->getParaContent($paraData['book'], $paraData['para']) ?? []
              );
          }
      }

      return $sentences;
  }
  /**
   * Tell whether a book_name matches any of the skip rules.
   *
   * A rule ending in * is a prefix rule: the book is skipped when its name
   * starts with the part before the asterisks. A rule made only of asterisks
   * matches nothing. Any other rule must equal the name exactly.
   *
   * @param array<int, string> $patterns skip rules
   */
  private function shouldSkipBook(string $bookName, array $patterns): bool
  {
      foreach ($patterns as $rule) {
          if (! str_ends_with($rule, '*')) {
              if ($rule === $bookName) {
                  return true;
              }
              continue;
          }

          $stem = rtrim($rule, '*');
          if ($stem === '') {
              continue;
          }
          if (str_starts_with($bookName, $stem)) {
              return true;
          }
      }

      return false;
  }
  /**
   * Check that the grouped data has at least one paragraph for a type.
   *
   * Logs a warning naming the type when the group is absent or empty.
   *
   * @param mixed $typeData paragraphs grouped by type name, then by book
   * @param mixed $typeName key to look up in $typeData
   *
   * @return bool true when the group exists and holds at least one paragraph
   */
  private function hasData($typeData, $typeName)
  {
      $present = isset($typeData[$typeName])
          && $this->getParagraphNumber($typeData[$typeName]) !== 0;

      if ($present) {
          return true;
      }

      Log::warning($typeName . ' data is missing');

      return false;
  }
  /**
   * Count the paragraphs held by all books of one type group.
   *
   * @param mixed $type paragraph lists keyed by book id
   *
   * @return int total number of paragraphs; 0 when $type is not an array
   */
  private function getParagraphNumber($type)
  {
      if (! is_array($type)) {
          return 0;
      }

      $perBook = array_map(
          static fn ($paragraphs) => count($paragraphs),
          $type
      );

      return array_sum($perBook);
  }
  /**
   * Resolve the type of a book from the tags attached to its title paragraph.
   *
   * Looks up the BookTitle by serial number, finds the PaliText row for the
   * title's book/paragraph, and returns the first of its tags whose name is
   * 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'.
   *
   * @param mixed $bookId value matched against BookTitle.sn
   *
   * @return string|null the type name, or null when the book, its title
   *                     paragraph or a matching tag cannot be found
   */
  private function getBookType($bookId)
  {
      $bookTitle = BookTitle::where('sn', $bookId)->first();
      if ($bookTitle === null) {
          Log::warning('no book title found sn=' . $bookId);

          return null;
      }

      $paliTextUuid = PaliText::where('book', $bookTitle->book)
          ->where('paragraph', $bookTitle->paragraph)
          ->value('uid');
      if ($paliTextUuid === null) {
          // Querying with a null anchor would turn into an "IS NULL" match
          // and pick up unrelated tag maps.
          return null;
      }

      // pluck() yields plain ids; whereIn() needs scalar values, not models.
      $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
      $tags = Tag::whereIn('id', $tagIds)->select('name')->get();

      foreach ($tags as $tag) {
          if (in_array($tag->name, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
              return $tag->name;
          }
      }

      return null;
  }
  /**
   * Load one paragraph as a list of sentences ready to be sent to the LLM.
   *
   * Sentences are ordered by word_begin. Each entry carries an id of the form
   * "book-para-wordBegin-wordEnd" and the sentence's markdown content as
   * returned by SearchPaliDataService::getSentenceContent().
   *
   * @param mixed $book book number of the paragraph
   * @param mixed $para paragraph number
   *
   * @return array<int, array{id: string, content: mixed}> empty when the
   *         paragraph has no sentences
   */
  private function getParaContent($book, $para)
  {
      $paliData = app(SearchPaliDataService::class);

      // get() always returns a collection, so an empty paragraph simply
      // produces an empty list below; callers merge the result with
      // array_merge() and therefore need an array in every case.
      $sentences = PaliSentence::where('book', $book)
          ->where('paragraph', $para)
          ->orderBy('word_begin')
          ->get();

      $json = [];
      foreach ($sentences as $sentence) {
          $content = $paliData->getSentenceContent($book, $para, $sentence->word_begin, $sentence->word_end);
          $id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
          $json[] = ['id' => $id, 'content' => $content['markdown']];
      }

      return $json;
  }
  /**
   * Replace each row's id with the row's key in the input and return the
   * rows as a plain list.
   *
   * textAlign() uses this to give the LLM short serial numbers instead of
   * the long sentence ids.
   *
   * @param array<int|string, array> $input rows to renumber
   *
   * @return array<int, array> the rows, re-listed, each with 'id' set to its former key
   */
  private function arrayIndexed(array $input): array
  {
      return array_map(
          static function ($row, $serial) {
              $row['id'] = $serial;

              return $row;
          },
          $input,
          array_keys($input)
      );
  }
  /**
   * Translate the serial numbers in the LLM's answer back to sentence ids.
   *
   * Each row is matched to its source sentence through the serial number the
   * model echoes back in "id"; the row's position is used only when that
   * field is missing. Matching by "id" keeps the mapping correct when the
   * model omits or reorders rows. Commentary serial numbers that do not
   * exist in $commentary are dropped.
   *
   * @param array $input      rows parsed from the LLM answer
   * @param array $original   source sentences, indexed by the serial numbers sent to the LLM
   * @param array $commentary commentary sentences, indexed the same way
   *
   * @return array<int, array> rows with real sentence ids in 'id' and, when
   *         present, in 'commentary'
   */
  private function arrayUnIndexed(array $input, array $original, array $commentary): array
  {
      $output = [];
      foreach ($input as $position => $row) {
          if (! is_array($row)) {
              Log::warning('no id');
              continue;
          }

          $serial = $row['id'] ?? $position;
          $isKey = is_int($serial) || is_string($serial);
          if (! $isKey || ! isset($original[$serial])) {
              Log::warning('no id');
              continue;
          }
          $row['id'] = $original[$serial]['id'];

          if (isset($row['commentary'])) {
              // Tolerate a single scalar where a list was expected.
              $refs = is_array($row['commentary']) ? $row['commentary'] : [$row['commentary']];
              $resolved = [];
              foreach ($refs as $ref) {
                  if ((is_int($ref) || is_string($ref)) && isset($commentary[$ref])) {
                      $resolved[] = $commentary[$ref]['id'];
                  }
              }
              $row['commentary'] = $resolved;
          }

          $output[] = $row;
      }

      return $output;
  }
  /**
   * Ask the LLM which commentary sentences belong to which source sentence.
   *
   * Both lists are renumbered with short serial numbers, sent as JSONL, and
   * the answer is mapped back to real sentence ids with arrayUnIndexed().
   * The completion-token usage reported by the API raises
   * $tokensPerSentence, which sizes max_tokens for later requests.
   *
   * @param array $original   source sentences ({id, content})
   * @param array $commentary commentary sentences ({id, content})
   *
   * @return array rows of {id, commentary?}; empty when no model is set, an
   *         input list is empty, or the answer could not be parsed
   */
  private function textAlign(array $original, array $commentary)
  {
      // isset() is safe on a typed property that was never assigned.
      if (! isset($this->model)) {
          Log::error('model is invalid');

          return [];
      }

      // Nothing can be aligned when either side is empty; returning here
      // also keeps the per-sentence token average from dividing by zero.
      if ($original === [] || $commentary === []) {
          Log::warning('nothing to align');

          return [];
      }

      $originalSn = $this->arrayIndexed($original);
      $commentarySn = $this->arrayIndexed($commentary);
      $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
      $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
      Log::debug('ai request', [
          'original' => $originalText,
          'commentary' => $commentaryText,
      ]);

      $totalSentences = count($original) + count($commentary);
      $maxTokens = (int) ($this->tokensPerSentence * $totalSentences * 1.5);
      $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
      Log::debug('requesting…… ' . $this->model['model']);

      $startAt = time();
      $llm = $this->openAIService->setApiUrl($this->model['url'])
          ->setModel($this->model['model'])
          ->setApiKey($this->model['key'])
          ->setSystemPrompt($this->prompt)
          ->setTemperature(0.0)
          ->setStream(false)
          ->setMaxToken($maxTokens);
      // Only forwarded when --thinking was given on the command line.
      if (isset($this->thinking)) {
          $llm = $llm->setThinking($this->thinking);
      }
      $response = $llm->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
      $completeAt = time();

      $answer = $response['choices'][0]['message']['content'] ?? '[]';
      Log::debug('ai response', ['data' => $answer]);

      $message = ($completeAt - $startAt) . 's';
      if (isset($response['usage']['completion_tokens'])) {
          Log::debug('usage', $response['usage']);
          $message .= ' completion_tokens:' . $response['usage']['completion_tokens'];
          // Keep the highest ratio seen so max_tokens only ever grows.
          $curr = (int) ($response['usage']['completion_tokens'] / $totalSentences);
          if ($curr > $this->tokensPerSentence) {
              $this->tokensPerSentence = $curr;
          }
      }
      $this->info($message);

      $json = [];
      if (is_string($answer)) {
          $json = LlmResponseParser::jsonl($answer);
          $json = $this->arrayUnIndexed($json, $original, $commentary);
          Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
      }
      if (count($json) === 0) {
          Log::error('jsonl is empty');
      }

      return $json;
  }
  /**
   * Write the aligned commentary references to the target channel.
   *
   * For every row with at least one usable commentary id, saves one sentence
   * whose content is the ids wrapped as "{{id}}", one per line. Rows without
   * commentary, with only empty or invalid references, or with an id that is
   * not "book-para-wordStart-wordEnd" are skipped.
   *
   * @param mixed $json    rows of {id, commentary} as returned by textAlign()
   * @param mixed $channel channel object exposing uid, lang and status
   *
   * @return false|null false when $json is not an array, otherwise nothing
   */
  private function save($json, $channel)
  {
      if (! is_array($json)) {
          Log::warning('llm return null');

          return false;
      }

      foreach ($json as $sentence) {
          if (
              ! isset($sentence['id'], $sentence['commentary']) ||
              ! is_array($sentence['commentary'])
          ) {
              continue;
          }

          // Accept either a plain id string or an {id: ...} object; anything
          // else, including an empty id, is dropped so that no "{{}}" or
          // blank line reaches the database.
          $content = [];
          foreach ($sentence['commentary'] as $ref) {
              if (is_array($ref)) {
                  $ref = $ref['id'] ?? null;
              }
              if (is_string($ref) && $ref !== '') {
                  $content[] = '{{' . $ref . '}}';
              }
          }
          if ($content === []) {
              continue;
          }

          $sentId = explode('-', (string) $sentence['id']);
          if (count($sentId) !== 4) {
              Log::warning('invalid sentence id ' . $sentence['id']);
              continue;
          }

          $this->sentenceService->save(
              [
                  'book_id' => $sentId[0],
                  'paragraph' => $sentId[1],
                  'word_start' => $sentId[2],
                  'word_end' => $sentId[3],
                  'channel_uid' => $channel->uid,
                  'content' => implode("\n", $content),
                  'lang' => $channel->lang,
                  'status' => $channel->status,
                  'editor_uid' => $this->model['uid'],
              ]
          );
          $this->info($sentence['id'] . ' saved');
      }
  }
  472. }