UpgradeSystemCommentary.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. <?php
  2. namespace App\Console\Commands;
  3. use App\Helpers\LlmResponseParser;
  4. use App\Http\Api\ChannelApi;
  5. use App\Http\Resources\AiModelResource;
  6. use App\Models\BookTitle;
  7. use App\Models\PaliSentence;
  8. use App\Models\PaliText;
  9. use App\Models\RelatedParagraph;
  10. use App\Models\Tag;
  11. use App\Models\TagMap;
  12. use App\Services\AIModelService;
  13. use App\Services\OpenAIService;
  14. use App\Services\SearchPaliDataService;
  15. use App\Services\SentenceService;
  16. use Illuminate\Console\Command;
  17. use Illuminate\Support\Facades\Cache;
  18. use Illuminate\Support\Facades\Log;
  19. class UpgradeSystemCommentary extends Command
  20. {
  21. /**
  22. * The name and signature of the console command.
  23. * php artisan upgrade:sys.commentary
  24. *
  25. * @var string
  26. */
  27. protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=} {--skip= : 跳过指定的 book_name,逗号分隔,支持前缀通配,如 abhi*,sn2} {--fresh : 清除缓存断点,从头开始}';
  28. protected $prompt = <<<'md'
  29. 你是一个注释对照阅读助手。
  30. pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
  31. commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
  32. commentary里面的内容是对pali内容的注释
  33. commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
  34. 你需要按照顺序将commentary中的句子与pali原文对照,。
  35. 输出格式jsonl
  36. 只输出pali数据
  37. 在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
  38. 不要输出content字段,只输出id,commentary字段
  39. 直接输出jsonl数据,无需解释
  40. **关键规则:**
  41. 1. 根据commentary中的句子的意思找到与pali对应的句子
  42. 1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
  43. 2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
  44. 3. 有些pali原文句子可能没有对应的注释
  45. 4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
  46. 5. 同时保持pali的句子数量不变,不要增删
  47. 6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
  48. **输出范例**
  49. {"id":0,"commentary":[0,1]}
  50. {"id":1,"commentary":[2]}
  51. md;
  52. /**
  53. * The console command description.
  54. *
  55. * @var string
  56. */
  57. protected $description = 'Command description';
  // Cache key: stores the set of completed "book_name|cs_para" units so an interrupted run can skip them on re-entry; handle() writes it with a 24h expiry.
  59. private const CACHE_KEY = 'upgrade:sys.commentary:done';
  60. protected $sentenceService;
  61. protected $modelService;
  62. protected $openAIService;
  63. protected AiModelResource $model;
  64. protected $tokensPerSentence = 0;
  /**
   * Store the injected services on the command.
   *
   * @param AIModelService  $model  service used by handle() to resolve the --model option
   * @param SentenceService $sent   service used by save() to persist aligned sentences
   * @param OpenAIService   $openAI client used by textAlign() to send the alignment request
   *
   * @return void
   */
  public function __construct(
      AIModelService $model,
      SentenceService $sent,
      OpenAIService $openAI
  ) {
      parent::__construct();

      $this->openAIService = $openAI;
      $this->sentenceService = $sent;
      $this->modelService = $model;
  }
  /**
   * Execute the console command.
   *
   * With --list, prints every book_name and its row count, then exits.
   * Otherwise walks each (book_name, cs_para) unit, groups its related
   * paragraphs by book type, aligns pāḷi with aṭṭhakathā and aṭṭhakathā
   * with ṭīkā through the LLM, and saves the result to the
   * `_System_commentary_` channel. Finished units are recorded in the cache
   * so an interrupted run can resume where it stopped.
   *
   * @return int 0 on success, 1 when the model or the target channel is unavailable
   */
  public function handle()
  {
      if ($this->option('list')) {
          // Alias the aggregate so the column name does not depend on the database driver.
          $rows = RelatedParagraph::whereNotNull('book_name')
              ->groupBy('book_name')
              ->selectRaw('book_name, count(*) as count')
              ->get();
          foreach ($rows as $row) {
              $this->info($row['book_name'].'['.$row['count'].']');
          }

          return 0;
      }

      if ($this->option('model')) {
          $this->model = $this->modelService->getModelById($this->option('model'));
          // getModelById always returns an AiModelResource; when nothing was
          // found its underlying resource is null, so that is what we test.
          if (empty($this->model->resource)) {
              $this->error('no model found id='.$this->option('model'));

              return 1;
          }
          $this->info("model:{$this->model['model']}");
      }

      if ($this->option('fresh')) {
          Cache::forget(self::CACHE_KEY);
          $this->info('Cleared cached cursor.');
      }

      // Every unit needs the model for alignment and for the editor uid. Fail
      // up front instead of hitting an uninitialized typed property later.
      if (! isset($this->model)) {
          $this->error('no model specified, use --model=<id>');

          return 1;
      }

      // A full run (neither --book nor --para) clears the resume cache once it completes.
      $isFullRun = ! $this->option('book') && ! $this->option('para');

      // Completed "book_name|cs_para" units restored from the cache.
      $done = Cache::get(self::CACHE_KEY, []);

      // Book names to skip: comma separated; a trailing * means prefix match, otherwise exact match.
      $skipPatterns = [];
      if ($this->option('skip')) {
          $skipPatterns = array_values(array_filter(array_map('trim', explode(',', $this->option('skip')))));
      }

      $channel = ChannelApi::getChannelByName('_System_commentary_');
      if (empty($channel)) {
          $this->error('channel _System_commentary_ not found');

          return 1;
      }

      if ($this->option('book')) {
          $books = [['book_name' => $this->option('book')]];
      } else {
          // orderBy keeps the traversal order stable between runs so the cursor stays meaningful.
          $books = RelatedParagraph::whereNotNull('book_name')
              ->where('book_name', '!=', '')
              ->where('cs_para', '>', 0)
              ->groupBy('book_name')
              ->orderBy('book_name')
              ->select('book_name')
              ->get()->toArray();
      }

      foreach ($books as $currBook) {
          $bookName = $currBook['book_name'];

          // A skipped book is passed over even if the previous run stopped inside it.
          if ($this->shouldSkipBook($bookName, $skipPatterns)) {
              $this->info('skip book '.$bookName);

              continue;
          }

          if ($this->option('para')) {
              $units = [['cs_para' => $this->option('para')]];
          } else {
              // orderBy keeps the traversal order stable between runs so the cursor stays meaningful.
              $units = RelatedParagraph::where('book_name', $bookName)
                  ->where('cs_para', '>', 0)
                  ->groupBy('cs_para')
                  ->orderBy('cs_para')
                  ->select('cs_para')
                  ->get()->toArray();
          }

          foreach ($units as $unit) {
              $csPara = $unit['cs_para'];

              // Stable cursor: book_name|cs_para identifies one processing unit.
              $cursor = $bookName.'|'.$csPara;
              if (isset($done[$cursor])) {
                  continue;
              }

              $this->info('ai commentary '.$bookName.'-'.$csPara);

              $related = RelatedParagraph::where('book_name', $bookName)
                  ->where('cs_para', $csPara)
                  ->where('book_id', '>', 0)
                  ->orderBy('book_id')
                  ->orderBy('para')
                  ->get();

              $type = $this->groupParagraphsByType($related);

              foreach ($type as $keyType => $typeBooks) {
                  Log::debug($keyType);
                  foreach ($typeBooks as $bookId => $bookParagraphs) {
                      Log::debug($bookId);
                      foreach ($bookParagraphs as $bookParagraph) {
                          Log::debug($bookParagraph['book'].'-'.$bookParagraph['para']);
                      }
                  }
              }

              $hasPali = $this->hasData($type, 'pāḷi');
              $hasAtta = $this->hasData($type, 'aṭṭhakathā');
              $hasTika = $this->hasData($type, 'ṭīkā');

              // Both alignments need the aṭṭhakathā sentences, so build them once
              // per unit. Building them only inside the pāḷi branch left the ṭīkā
              // branch with an undefined or stale value when pāḷi was missing.
              $attaJson = [];
              if ($hasAtta && ($hasPali || $hasTika)) {
                  $attaJson = $this->collectSentences($type['aṭṭhakathā']);
              }

              // pāḷi <- aṭṭhakathā
              if ($hasPali && $hasAtta) {
                  $paliJson = $this->collectSentences($type['pāḷi']);
                  $this->save($this->textAlign($paliJson, $attaJson), $channel);
              }

              // aṭṭhakathā <- ṭīkā: each ṭīkā book is aligned separately and the
              // results are merged per aṭṭhakathā sentence.
              if ($hasAtta && $hasTika) {
                  $tikaResult = [];
                  foreach ($type['ṭīkā'] as $tikaParagraphs) {
                      $tikaJson = $this->collectSentences([$tikaParagraphs]);
                      $aligned = $this->textAlign($attaJson, $tikaJson);
                      $tikaResult = $this->mergeAlignment($tikaResult, $aligned);
                  }
                  $this->save(array_values($tikaResult), $channel);
              }

              // Mark the unit only after all its writes finished, so an
              // interruption mid-unit does not cause it to be skipped next time.
              $done[$cursor] = true;
              Cache::put(self::CACHE_KEY, $done, now()->addHours(24));
          }
      }

      // A full traversal finished normally: drop the resume cache.
      if ($isFullRun) {
          Cache::forget(self::CACHE_KEY);
      }

      return 0;
  }

  /**
   * Group related-paragraph rows as [book type][book_id] => list of ['book', 'para'].
   *
   * The type of each book_id is resolved once through getBookType(). Rows
   * whose book has no recognised type are left out, because only the three
   * known types are ever aligned.
   *
   * @param iterable $rows RelatedParagraph rows exposing book_id, book and para
   */
  private function groupParagraphsByType($rows): array
  {
      $bookTypes = [];
      $grouped = [];
      foreach ($rows as $row) {
          // array_key_exists (not isset) so a null type is cached too.
          if (! array_key_exists($row->book_id, $bookTypes)) {
              $bookTypes[$row->book_id] = $this->getBookType($row->book_id);
          }
          $bookType = $bookTypes[$row->book_id];
          if ($bookType === null) {
              Log::debug('unknown book type', ['book_id' => $row->book_id]);

              continue;
          }
          $grouped[$bookType][$row->book_id][] = ['book' => $row->book, 'para' => $row->para];
      }

      return $grouped;
  }

  /**
   * Flatten the sentences of every paragraph of every given book into one list.
   *
   * @param array $books list/map of books, each a list of ['book' => ..., 'para' => ...]
   */
  private function collectSentences(array $books): array
  {
      $sentences = [];
      foreach ($books as $bookParagraphs) {
          foreach ($bookParagraphs as $paraData) {
              $sentData = $this->getParaContent($paraData['book'], $paraData['para']);
              if (is_array($sentData)) {
                  $sentences = array_merge($sentences, $sentData);
              }
          }
      }

      return $sentences;
  }

  /**
   * Merge newly aligned rows into the rows accumulated so far, keyed by sentence id.
   *
   * A row with a new id is added as is. For an id seen before, the new
   * commentary references are appended to the existing ones.
   *
   * @param array $merged  rows accumulated so far, keyed by sentence id
   * @param array $aligned rows returned by textAlign()
   */
  private function mergeAlignment(array $merged, array $aligned): array
  {
      foreach ($aligned as $row) {
          if (! is_array($row) || ! isset($row['id'])) {
              continue;
          }
          $id = $row['id'];
          if (! isset($merged[$id])) {
              $merged[$id] = $row;

              continue;
          }
          if (isset($row['commentary']) && is_array($row['commentary'])) {
              // The earlier row may have had no commentary at all.
              $existing = (array) ($merged[$id]['commentary'] ?? []);
              $merged[$id]['commentary'] = array_merge($existing, $row['commentary']);
          }
      }

      return $merged;
  }
  /**
   * Tell whether a book name matches any of the skip rules.
   *
   * A rule ending in * is a prefix rule (a rule made only of * never
   * matches); any other rule must equal the book name exactly.
   *
   * @param array<int, string> $patterns skip rules
   */
  private function shouldSkipBook(string $bookName, array $patterns): bool
  {
      foreach ($patterns as $rule) {
          if (! str_ends_with($rule, '*')) {
              if ($rule === $bookName) {
                  return true;
              }

              continue;
          }

          $stem = rtrim($rule, '*');
          if ($stem !== '' && str_starts_with($bookName, $stem)) {
              return true;
          }
      }

      return false;
  }
  /**
   * Check that the grouped data holds at least one paragraph of the given type.
   *
   * Logs a warning when the type is absent or has no paragraphs.
   *
   * @param mixed $typeData grouped paragraphs, indexed by type name
   * @param mixed $typeName type name to look up
   *
   * @return bool true when at least one paragraph exists for the type
   */
  private function hasData($typeData, $typeName)
  {
      $present = isset($typeData[$typeName])
          && $this->getParagraphNumber($typeData[$typeName]) !== 0;

      if ($present) {
          return true;
      }

      Log::warning($typeName.' data is missing');

      return false;
  }
  /**
   * Count the paragraphs across all books of one type.
   *
   * @param mixed $type map of book id => list of paragraphs; anything that is not an array counts as 0
   *
   * @return int total number of paragraphs
   */
  private function getParagraphNumber($type)
  {
      if (! is_array($type)) {
          return 0;
      }

      return array_sum(array_map(
          static function ($bookParagraphs) {
              return count($bookParagraphs);
          },
          $type
      ));
  }
  /**
   * Resolve the type of a book: 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'.
   *
   * Looks up the BookTitle row whose sn equals $bookId, finds the PaliText
   * uid for that book/paragraph, and returns the first tag attached to that
   * uid whose name is one of the three known types.
   *
   * @param mixed $bookId value matched against BookTitle.sn
   *
   * @return string|null the type name, or null when the title, the text or a matching tag is missing
   */
  private function getBookType($bookId)
  {
      $bookTitle = BookTitle::where('sn', $bookId)->first();
      if ($bookTitle === null) {
          Log::warning('book title not found', ['sn' => $bookId]);

          return null;
      }

      $paliTextUuid = PaliText::where('book', $bookTitle->book)
          ->where('paragraph', $bookTitle->paragraph)
          ->value('uid');
      if ($paliTextUuid === null) {
          // Without this guard the tag lookup would filter on a null anchor_id
          // and could pick up tags that belong to something else.
          Log::warning('pali text not found', ['book' => $bookTitle->book, 'paragraph' => $bookTitle->paragraph]);

          return null;
      }

      // pluck() yields plain tag ids rather than a collection of models.
      $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
      $tags = Tag::whereIn('id', $tagIds)->select('name')->get();
      foreach ($tags as $tag) {
          if (in_array($tag->name, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
              return $tag->name;
          }
      }

      return null;
  }
  /**
   * Load the sentences of one paragraph as alignment records.
   *
   * Sentences are read in word_begin order. Each record carries an id of the
   * form "book-para-word_begin-word_end" and the sentence's markdown content.
   *
   * @param mixed $book book number of the paragraph
   * @param mixed $para paragraph number
   *
   * @return array list of ['id' => string, 'content' => string]; empty when the paragraph has no sentences
   */
  private function getParaContent($book, $para)
  {
      $contentService = app(SearchPaliDataService::class);
      $sentences = PaliSentence::where('book', $book)
          ->where('paragraph', $para)
          ->orderBy('word_begin')
          ->get();

      // get() always returns a collection, so an empty paragraph simply
      // produces an empty list here; callers array_merge the result.
      $json = [];
      foreach ($sentences as $sentence) {
          $content = $contentService->getSentenceContent($book, $para, $sentence->word_begin, $sentence->word_end);
          $json[] = [
              'id' => "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}",
              'content' => $content['markdown'] ?? '',
          ];
      }

      return $json;
  }
  /**
   * Return the rows as a list in which each row's 'id' is replaced by its key in $input.
   *
   * @param array $input rows to renumber
   *
   * @return array the same rows, re-listed, with 'id' set to the original key
   */
  private function arrayIndexed(array $input): array
  {
      return array_map(
          static function ($row, $position) {
              $row['id'] = $position;

              return $row;
          },
          array_values($input),
          array_keys($input)
      );
  }
  /**
   * Translate the numeric ids of an LLM answer back to real sentence ids.
   *
   * Each answer row is matched to $original through the numeric 'id' it
   * carries. The row's position is used only when it has no 'id', so a row
   * the model omitted or reordered no longer shifts every following row onto
   * the wrong sentence. Commentary indices are replaced by the ids of the
   * corresponding $commentary rows; indices that do not exist are dropped.
   *
   * @param array $input      parsed answer rows: ['id' => int, 'commentary' => int[]]
   * @param array $original   sentences sent as "pali", in the order they were numbered
   * @param array $commentary sentences sent as "commentary", in the order they were numbered
   *
   * @return array answer rows with 'id' and 'commentary' holding real sentence ids
   */
  private function arrayUnIndexed(array $input, array $original, array $commentary): array
  {
      $output = [];
      foreach ($input as $position => $row) {
          if (! is_array($row)) {
              Log::warning('no id');

              continue;
          }

          $index = $row['id'] ?? $position;
          if ((! is_int($index) && ! is_string($index)) || ! isset($original[$index]['id'])) {
              Log::warning('no id');

              continue;
          }
          $row['id'] = $original[$index]['id'];

          if (isset($row['commentary'])) {
              $references = [];
              // The model sometimes returns a single index instead of a list.
              foreach ((array) $row['commentary'] as $n) {
                  if ((is_int($n) || is_string($n)) && isset($commentary[$n]['id'])) {
                      $references[] = $commentary[$n]['id'];
                  } else {
                      Log::warning('unknown commentary index', ['sentence' => $row['id'], 'index' => $n]);
                  }
              }
              $row['commentary'] = $references;
          }

          $output[] = $row;
      }

      return $output;
  }
  /**
   * Ask the LLM which commentary sentences belong to which original sentence.
   *
   * Both lists are renumbered from 0, sent as jsonl under the system prompt,
   * and the jsonl answer is mapped back to real sentence ids. The observed
   * completion tokens per sentence are remembered (highest value wins) and
   * used to size max_tokens on later calls.
   *
   * @param array $original   sentences being commented on: ['id' => string, 'content' => string]
   * @param array $commentary commentary sentences, same shape
   *
   * @return array rows of ['id' => sentence id, 'commentary' => commentary sentence ids]; empty on failure or when there is nothing to align
   */
  private function textAlign(array $original, array $commentary)
  {
      // isset() is safe on a typed property that was never initialised; a
      // plain read would throw.
      if (! isset($this->model)) {
          Log::error('model is invalid');

          return [];
      }

      // Nothing to align: skip the request, and avoid dividing by zero below.
      if (count($original) === 0 || count($commentary) === 0) {
          Log::warning('nothing to align', ['original' => count($original), 'commentary' => count($commentary)]);

          return [];
      }

      $originalSn = $this->arrayIndexed($original);
      $commentarySn = $this->arrayIndexed($commentary);
      $originalText = "```jsonl\n".LlmResponseParser::jsonl_encode($originalSn)."\n```";
      $commentaryText = "```jsonl\n".LlmResponseParser::jsonl_encode($commentarySn)."\n```";
      Log::debug('ai request', [
          'original' => $originalText,
          'commentary' => $commentaryText,
      ]);

      $totalSentences = count($original) + count($commentary);
      // NOTE(review): tokensPerSentence starts at 0, so the first request sends
      // max_tokens = 0 — presumably OpenAIService treats that as "no limit"; confirm.
      $maxTokens = (int) ($this->tokensPerSentence * $totalSentences * 1.5);
      $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
      Log::debug('requesting…… '.$this->model['model']);

      $startAt = time();
      $response = $this->openAIService->setApiUrl($this->model['url'])
          ->setModel($this->model['model'])
          ->setApiKey($this->model['key'])
          ->setSystemPrompt($this->prompt)
          ->setTemperature(0.0)
          ->setStream(false)
          ->setMaxToken($maxTokens)
          ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
      $completeAt = time();

      $answer = $response['choices'][0]['message']['content'] ?? '[]';
      Log::debug('ai response', ['data' => $answer]);

      $message = ($completeAt - $startAt).'s';
      if (isset($response['usage']['completion_tokens'])) {
          Log::debug('usage', $response['usage']);
          $message .= ' completion_tokens:'.$response['usage']['completion_tokens'];
          $curr = (int) ($response['usage']['completion_tokens'] / $totalSentences);
          if ($curr > $this->tokensPerSentence) {
              $this->tokensPerSentence = $curr;
          }
      }
      $this->info($message);

      $json = [];
      if (is_string($answer)) {
          $json = LlmResponseParser::jsonl($answer);
          $json = $this->arrayUnIndexed($json, $original, $commentary);
          Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
      }
      if (count($json) === 0) {
          Log::error('jsonl is empty');
      }

      return $json;
  }
  /**
   * Persist aligned rows as sentences of the given channel.
   *
   * For each row that has commentary references, writes one sentence whose
   * content is the references rendered as `{{id}}`, one per line. The target
   * sentence is addressed by the four parts of the row id
   * (book-paragraph-word_start-word_end). Rows with a malformed id or with
   * no usable reference are skipped.
   *
   * @param mixed $json    rows from textAlign(): ['id' => string, 'commentary' => array of ids or ['id' => ...]]
   * @param mixed $channel target channel; its uid, lang and status are copied to each sentence
   *
   * @return false|null false when $json is not an array, otherwise nothing
   */
  private function save($json, $channel)
  {
      if (! is_array($json)) {
          Log::warning('llm return null');

          return false;
      }

      foreach ($json as $sentence) {
          if (! is_array($sentence)) {
              continue;
          }

          $references = $sentence['commentary'] ?? null;
          if (! is_array($references) || count($references) === 0) {
              continue;
          }

          $sentenceId = $sentence['id'] ?? null;
          $sentId = is_string($sentenceId) ? explode('-', $sentenceId) : [];
          if (count($sentId) !== 4) {
              Log::warning('invalid sentence id', ['id' => $sentenceId]);

              continue;
          }

          // Keep only references that resolve to a non-empty id, so the saved
          // content never contains blank lines or an empty {{}} placeholder.
          $content = [];
          foreach ($references as $reference) {
              $referenceId = is_array($reference) ? ($reference['id'] ?? null) : $reference;
              if (is_string($referenceId) && $referenceId !== '') {
                  $content[] = '{{'.$referenceId.'}}';
              }
          }
          if (count($content) === 0) {
              continue;
          }

          $this->sentenceService->save(
              [
                  'book_id' => $sentId[0],
                  'paragraph' => $sentId[1],
                  'word_start' => $sentId[2],
                  'word_end' => $sentId[3],
                  'channel_uid' => $channel->uid,
                  'content' => implode("\n", $content),
                  'lang' => $channel->lang,
                  'status' => $channel->status,
                  'editor_uid' => $this->model['uid'],
              ]
          );
          $this->info($sentenceId.' saved');
      }
  }
  456. }