UpgradeSystemCommentary.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. <?php
  2. namespace App\Console\Commands;
  3. use App\Helpers\LlmResponseParser;
  4. use App\Http\Api\ChannelApi;
  5. use App\Http\Resources\AiModelResource;
  6. use App\Models\BookTitle;
  7. use App\Models\PaliSentence;
  8. use App\Models\PaliText;
  9. use App\Models\RelatedParagraph;
  10. use App\Models\Tag;
  11. use App\Models\TagMap;
  12. use App\Services\AIModelService;
  13. use App\Services\OpenAIService;
  14. use App\Services\SearchPaliDataService;
  15. use App\Services\SentenceService;
  16. use Illuminate\Console\Command;
  17. use Illuminate\Support\Facades\Cache;
  18. use Illuminate\Support\Facades\Log;
  19. class UpgradeSystemCommentary extends Command
  20. {
  21. /**
  22. * The name and signature of the console command.
  23. * php artisan upgrade:sys.commentary
  24. *
  25. * @var string
  26. */
  27. protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=} {--fresh : 清除缓存断点,从头开始}';
  28. protected $prompt = <<<'md'
  29. 你是一个注释对照阅读助手。
  30. pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
  31. commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
  32. commentary里面的内容是对pali内容的注释
  33. commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
  34. 你需要按照顺序将commentary中的句子与pali原文对照,。
  35. 输出格式jsonl
  36. 只输出pali数据
  37. 在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
  38. 不要输出content字段,只输出id,commentary字段
  39. 直接输出jsonl数据,无需解释
  40. **关键规则:**
  41. 1. 根据commentary中的句子的意思找到与pali对应的句子
  42. 1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
  43. 2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
  44. 3. 有些pali原文句子可能没有对应的注释
  45. 4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
  46. 5. 同时保持pali的句子数量不变,不要增删
  47. 6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
  48. **输出范例**
  49. {"id":0,"commentary":[0,1]}
  50. {"id":1,"commentary":[2]}
  51. md;
  52. /**
  53. * The console command description.
  54. *
  55. * @var string
  56. */
  57. protected $description = 'Command description';
  // Cache key: holds the set of finished "book_name|cs_para" units so that a rerun after an interruption skips them; the entry is stored with a 24h lifetime
  59. private const CACHE_KEY = 'upgrade:sys.commentary:done';
  60. protected $sentenceService;
  61. protected $modelService;
  62. protected $openAIService;
  63. protected AiModelResource $model;
  64. protected $tokensPerSentence = 0;
  /**
   * Build the command and keep references to the injected services.
   *
   * @param  AIModelService  $model  used to look up the AI model by id
   * @param  SentenceService  $sent  used to persist aligned sentences
   * @param  OpenAIService  $openAI  client used to send the alignment request
   */
  public function __construct(AIModelService $model, SentenceService $sent, OpenAIService $openAI)
  {
      [$this->modelService, $this->sentenceService, $this->openAIService] = [$model, $sent, $openAI];

      parent::__construct();
  }
  /**
   * Execute the console command.
   *
   * With --list, prints every book_name found in RelatedParagraph together with
   * its row count and exits. Otherwise it walks every (book_name, cs_para) unit,
   * groups the unit's related paragraphs by book type (pāḷi / aṭṭhakathā / ṭīkā),
   * has the LLM align pāḷi with aṭṭhakathā and aṭṭhakathā with each ṭīkā book,
   * and saves the alignments into the "_System_commentary_" channel.
   *
   * Finished units are recorded in the cache so an interrupted run can resume;
   * --fresh drops that record, and a complete unfiltered run clears it at the end.
   *
   * @return int 0 on success, 1 when no usable --model was supplied
   */
  public function handle()
  {
      if ($this->option('list')) {
          // Alias the aggregate so the column is named "count" regardless of the
          // database driver (without an alias the name is driver-dependent).
          $rows = RelatedParagraph::whereNotNull('book_name')
              ->groupBy('book_name')
              ->selectRaw('book_name, count(*) as count')
              ->get();
          foreach ($rows as $row) {
              $this->info($row['book_name'].'['.$row['count'].']');
          }

          return 0;
      }

      // textAlign() and save() both read $this->model, a typed property that is
      // only assigned here. Refuse to start without it rather than failing
      // in the middle of a run.
      if (! $this->option('model')) {
          $this->error('no model specified, use --model=');

          return 1;
      }
      $this->model = $this->modelService->getModelById($this->option('model'));
      // getModelById always returns an AiModelResource; when nothing was found
      // its underlying resource is null, so that is what has to be checked.
      if (empty($this->model->resource)) {
          $this->error('no model found id='.$this->option('model'));

          return 1;
      }
      $this->info("model:{$this->model['model']}");

      if ($this->option('fresh')) {
          Cache::forget(self::CACHE_KEY);
          $this->info('Cleared cached cursor.');
      }

      // Only a full traversal (no --book / --para filter) clears the resume
      // record once it has finished.
      $isFullRun = ! $this->option('book') && ! $this->option('para');
      // Set of finished "book_name|cs_para" units restored from the cache.
      $done = Cache::get(self::CACHE_KEY, []);
      $channel = ChannelApi::getChannelByName('_System_commentary_');

      if ($this->option('book')) {
          $books = [['book_name' => $this->option('book')]];
      } else {
          // Ordered so that every run visits the books in the same sequence.
          $books = RelatedParagraph::whereNotNull('book_name')
              ->where('book_name', '!=', '')
              ->where('cs_para', '>', 0)
              ->groupBy('book_name')
              ->orderBy('book_name')
              ->select('book_name')
              ->get()->toArray();
      }

      foreach ($books as $currBook) {
          if ($this->option('para')) {
              $units = [['cs_para' => $this->option('para')]];
          } else {
              // Ordered for the same reason as the book list.
              $units = RelatedParagraph::where('book_name', $currBook['book_name'])
                  ->where('cs_para', '>', 0)
                  ->groupBy('cs_para')
                  ->orderBy('cs_para')
                  ->select('cs_para')
                  ->get()->toArray();
          }

          foreach ($units as $unit) {
              // One processing unit is identified by "book_name|cs_para".
              $cursor = $currBook['book_name'].'|'.$unit['cs_para'];
              if (isset($done[$cursor])) {
                  continue;
              }
              $this->info('ai commentary '.$currBook['book_name'].'-'.$unit['cs_para']);

              $rows = RelatedParagraph::where('book_name', $currBook['book_name'])
                  ->where('cs_para', $unit['cs_para'])
                  ->where('book_id', '>', 0)
                  ->orderBy('book_id')
                  ->orderBy('para')
                  ->get();

              // Group the paragraphs by book type and then by book.
              $bookTypes = [];
              $type = [];
              foreach ($rows as $row) {
                  if (! isset($bookTypes[$row->book_id])) {
                      // An unknown type is stored as '' instead of null so that the
                      // isset() above recognises the book on its next row and the
                      // lookup is not repeated (nor the group reset) for every row.
                      $bookTypes[$row->book_id] = $this->getBookType($row->book_id) ?? '';
                      $type[$bookTypes[$row->book_id]][$row->book_id] = [];
                  }
                  $type[$bookTypes[$row->book_id]][$row->book_id][] = [
                      'book' => $row->book,
                      'para' => $row->para,
                  ];
              }

              foreach ($type as $typeName => $typeBooks) {
                  Log::debug($typeName);
                  foreach ($typeBooks as $bookId => $bookParagraphs) {
                      Log::debug($bookId);
                      foreach ($bookParagraphs as $paraData) {
                          Log::debug($paraData['book'].'-'.$paraData['para']);
                      }
                  }
              }

              $hasPali = $this->hasData($type, 'pāḷi');
              $hasAtta = $this->hasData($type, 'aṭṭhakathā');
              $hasTika = $this->hasData($type, 'ṭīkā');

              // Built once per unit and independently of the pāḷi branch, so the
              // ṭīkā alignment never sees an undefined value or the sentences of
              // a previously processed unit.
              $attaJson = $hasAtta ? $this->collectSentences($type['aṭṭhakathā']) : [];

              // pāḷi <-> aṭṭhakathā
              if ($hasPali && $hasAtta) {
                  $paliJson = $this->collectSentences($type['pāḷi']);
                  $this->save($this->textAlign($paliJson, $attaJson), $channel);
              }

              // aṭṭhakathā <-> ṭīkā, one LLM request per ṭīkā book
              if ($hasAtta && $hasTika) {
                  // Keyed by aṭṭhakathā sentence id so the results of several
                  // ṭīkā books can be merged without rescanning the list.
                  $tikaResult = [];
                  foreach ($type['ṭīkā'] as $bookParagraphs) {
                      $tikaJson = $this->collectSentences([$bookParagraphs]);
                      foreach ($this->textAlign($attaJson, $tikaJson) as $new) {
                          $sentenceId = $new['id'];
                          if (! isset($tikaResult[$sentenceId])) {
                              $tikaResult[$sentenceId] = $new;

                              continue;
                          }
                          if (isset($new['commentary']) && is_array($new['commentary'])) {
                              // The earlier record may have had no commentary at all.
                              $tikaResult[$sentenceId]['commentary'] = array_merge(
                                  $tikaResult[$sentenceId]['commentary'] ?? [],
                                  $new['commentary']
                              );
                          }
                      }
                  }
                  $this->save(array_values($tikaResult), $channel);
              }

              // Mark the unit only after everything for it has been written, so
              // an interruption in the middle never causes it to be skipped.
              $done[$cursor] = true;
              Cache::put(self::CACHE_KEY, $done, now()->addHours(24));
          }
      }

      // A full traversal finished normally: the resume record is no longer needed.
      if ($isFullRun) {
          Cache::forget(self::CACHE_KEY);
      }

      return 0;
  }

  /**
   * Concatenate the sentence records of every paragraph of the given books.
   *
   * @param  array  $booksOfType  one entry per book, each a list of ['book' => ..., 'para' => ...]
   * @return array list of the records produced by getParaContent(), in input order
   */
  private function collectSentences(array $booksOfType): array
  {
      $sentences = [];
      foreach ($booksOfType as $bookParagraphs) {
          foreach ($bookParagraphs as $paraData) {
              $records = $this->getParaContent($paraData['book'], $paraData['para']);
              // Guard against a non-array result so array_merge cannot fail.
              if (is_array($records)) {
                  $sentences = array_merge($sentences, $records);
              }
          }
      }

      return $sentences;
  }
  /**
   * Tell whether the grouped data holds at least one paragraph of a book type.
   *
   * Logs a warning when the type is absent or has no paragraphs.
   *
   * @param  array  $typeData  book type => (book id => list of paragraphs)
   * @param  string  $typeName  book type to look for
   * @return bool
   */
  private function hasData($typeData, $typeName)
  {
      $available = isset($typeData[$typeName])
          && $this->getParagraphNumber($typeData[$typeName]) !== 0;

      if ($available) {
          return true;
      }

      Log::warning($typeName.' data is missing');

      return false;
  }
  /**
   * Count the paragraphs held by all books of one type.
   *
   * @param  mixed  $type  book id => list of paragraphs; anything that is not an array counts as 0
   * @return int total number of paragraphs across the books
   */
  private function getParagraphNumber($type)
  {
      // is_array() is false for null as well, so this single test covers both guards.
      if (! is_array($type)) {
          return 0;
      }

      return array_sum(array_map(
          static function ($bookParagraphs) {
              return count($bookParagraphs);
          },
          $type
      ));
  }
  /**
   * Resolve the class of text a book belongs to from its tags.
   *
   * Finds the BookTitle whose `sn` equals $bookId, then the PaliText row at that
   * title's book/paragraph, and returns the first tag of that row whose name is
   * 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'.
   *
   * @param  mixed  $bookId  value matched against BookTitle.sn
   * @return string|null the tag name, or null when the book, its text row or a matching tag is missing
   */
  private function getBookType($bookId)
  {
      $bookTitle = BookTitle::where('sn', $bookId)->first();
      if ($bookTitle === null) {
          return null;
      }

      $paliTextUuid = PaliText::where('book', $bookTitle->book)
          ->where('paragraph', $bookTitle->paragraph)
          ->value('uid');
      if ($paliTextUuid === null) {
          // Querying with a null anchor would turn into "anchor_id is null" and
          // could pick up tag maps that have nothing to do with this book.
          return null;
      }

      // pluck() hands whereIn() a flat list of ids instead of a collection of models.
      $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
      $tags = Tag::whereIn('id', $tagIds)->select('name')->get();
      foreach ($tags as $tag) {
          if (in_array($tag->name, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
              return $tag->name;
          }
      }

      return null;
  }
  /**
   * Load the sentences of one paragraph as alignment records.
   *
   * Sentences are read from PaliSentence ordered by word_begin; the text of each
   * one is taken from the 'markdown' entry returned by
   * SearchPaliDataService::getSentenceContent().
   *
   * @param  mixed  $book  book number of the paragraph
   * @param  mixed  $para  paragraph number
   * @return array list of ['id' => "{book}-{para}-{word_begin}-{word_end}", 'content' => string];
   *               empty when the paragraph has no sentences
   */
  private function getParaContent($book, $para)
  {
      $contentService = app(SearchPaliDataService::class);
      // get() always yields a collection, so an empty paragraph simply produces
      // no iterations below and the method consistently returns an array.
      $sentences = PaliSentence::where('book', $book)
          ->where('paragraph', $para)
          ->orderBy('word_begin')
          ->get();

      $json = [];
      foreach ($sentences as $sentence) {
          $content = $contentService->getSentenceContent(
              $book,
              $para,
              $sentence->word_begin,
              $sentence->word_end
          );
          $json[] = [
              'id' => "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}",
              'content' => $content['markdown'],
          ];
      }

      return $json;
  }
  /**
   * Replace the 'id' of every record with the key it has in the input array.
   *
   * The result is a re-numbered list; the short ids are what gets sent to the
   * LLM in place of the long sentence ids.
   *
   * @param  array  $input  records (arrays) to re-label
   * @return array list of the same records with 'id' set to their original key
   */
  private function arrayIndexed(array $input): array
  {
      return array_map(
          static function ($position, $record) {
              $record['id'] = $position;

              return $record;
          },
          array_keys($input),
          $input
      );
  }
  /**
   * Translate the short ids used in the LLM answer back to real sentence ids.
   *
   * Each answer row is matched to $original through the 'id' the LLM echoed
   * back; only when a row carries no 'id' is its position in the answer used
   * instead. Matching by position alone would shift every following row as soon
   * as the LLM leaves one sentence out.
   *
   * @param  array  $input  parsed answer rows, each ['id' => int, 'commentary' => int[]]
   * @param  array  $original  records that were sent as "pali", in the order they were numbered
   * @param  array  $commentary  records that were sent as "commentary", in the order they were numbered
   * @return array list of rows with 'id' and every 'commentary' entry replaced by the real ids;
   *               rows and commentary references that cannot be resolved are dropped
   */
  private function arrayUnIndexed(array $input, array $original, array $commentary): array
  {
      $output = [];
      foreach ($input as $position => $value) {
          $index = is_array($value) ? ($value['id'] ?? $position) : null;
          // Only ints and strings are valid array offsets.
          if (! (is_int($index) || is_string($index)) || ! isset($original[$index])) {
              Log::warning('no id', ['position' => $position]);

              continue;
          }
          $value['id'] = $original[$index]['id'];

          if (isset($value['commentary'])) {
              $resolved = [];
              // A scalar (single index) is accepted as a one-element list.
              foreach ((array) $value['commentary'] as $n) {
                  if ((is_int($n) || is_string($n)) && isset($commentary[$n])) {
                      $resolved[] = $commentary[$n]['id'];
                  }
                  // Unknown indexes are skipped: an empty placeholder would end
                  // up as an empty reference in the saved content.
              }
              $value['commentary'] = $resolved;
          }
          $output[] = $value;
      }

      return $output;
  }
  /**
   * Ask the LLM which commentary sentences belong to which original sentence.
   *
   * Both lists are re-numbered with short ids, sent as two jsonl blocks, and the
   * jsonl answer is translated back to the real sentence ids. The highest
   * completion-token-per-sentence ratio seen so far is remembered and used to
   * size max_tokens of the following requests.
   *
   * @param  array  $original  sentence records (['id' => ..., 'content' => ...]) being commented on
   * @param  array  $commentary  sentence records of the commenting text
   * @return array list of ['id' => original sentence id, 'commentary' => commentary sentence ids];
   *               empty when no model is set, there is nothing to align, or the answer is unusable
   */
  private function textAlign(array $original, array $commentary)
  {
      // isset() instead of a truthiness test: $model is a typed property, and
      // reading it before it has been assigned throws instead of yielding null.
      if (! isset($this->model)) {
          Log::error('model is invalid');

          return [];
      }

      // Nothing to align: skip the request. This also keeps $totalSentences
      // above zero for the per-sentence token average below.
      if ($original === [] || $commentary === []) {
          Log::warning('nothing to align', [
              'original' => count($original),
              'commentary' => count($commentary),
          ]);

          return [];
      }

      $originalText = "```jsonl\n".LlmResponseParser::jsonl_encode($this->arrayIndexed($original))."\n```";
      $commentaryText = "```jsonl\n".LlmResponseParser::jsonl_encode($this->arrayIndexed($commentary))."\n```";
      Log::debug('ai request', [
          'original' => $originalText,
          'commentary' => $commentaryText,
      ]);

      $totalSentences = count($original) + count($commentary);
      // NOTE(review): tokensPerSentence starts at 0, so the first request is sent
      // with max_tokens 0 - presumably OpenAIService treats that as "no limit"; confirm.
      $maxTokens = (int) ($this->tokensPerSentence * $totalSentences * 1.5);
      $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
      Log::debug('requesting…… '.$this->model['model']);

      $startAt = time();
      $response = $this->openAIService->setApiUrl($this->model['url'])
          ->setModel($this->model['model'])
          ->setApiKey($this->model['key'])
          ->setSystemPrompt($this->prompt)
          ->setTemperature(0.0)
          ->setStream(false)
          ->setMaxToken($maxTokens)
          ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
      $completeAt = time();

      $answer = $response['choices'][0]['message']['content'] ?? '[]';
      Log::debug('ai response', ['data' => $answer]);

      $message = ($completeAt - $startAt).'s';
      if (isset($response['usage']['completion_tokens'])) {
          Log::debug('usage', $response['usage']);
          $message .= ' completion_tokens:'.$response['usage']['completion_tokens'];
          // Keep the largest average seen so later requests get enough room.
          $curr = (int) ($response['usage']['completion_tokens'] / $totalSentences);
          if ($curr > $this->tokensPerSentence) {
              $this->tokensPerSentence = $curr;
          }
      }
      $this->info($message);

      $json = [];
      if (is_string($answer)) {
          $json = LlmResponseParser::jsonl($answer);
          $json = $this->arrayUnIndexed($json, $original, $commentary);
          Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
      }
      if (count($json) === 0) {
          Log::error('jsonl is empty');
      }

      return $json;
  }
  /**
   * Store the alignment result as sentences of the given channel.
   *
   * For every row that has commentary references, a sentence is saved at the
   * position encoded in the row id ("book-paragraph-wordStart-wordEnd"); its
   * content is one "{{commentary sentence id}}" reference per line.
   *
   * @param  mixed  $json  list of ['id' => string, 'commentary' => array of ids or ['id' => id] records]
   * @param  mixed  $channel  object providing uid, lang and status for the saved sentences
   * @return bool false when $json is not an array, true otherwise
   */
  private function save($json, $channel)
  {
      if (! is_array($json)) {
          Log::warning('llm return null');

          return false;
      }

      foreach ($json as $sentence) {
          if (
              ! isset($sentence['id'], $sentence['commentary']) ||
              ! is_string($sentence['id']) ||
              ! is_array($sentence['commentary'])
          ) {
              continue;
          }

          // The id must split into exactly book, paragraph, word start and word end.
          $sentId = explode('-', $sentence['id']);
          if (count($sentId) !== 4) {
              Log::warning('invalid sentence id', ['id' => $sentence['id']]);

              continue;
          }

          // Entries may be plain ids or records carrying an 'id'. Anything else,
          // including empty ids, is left out so no blank reference is written.
          $content = [];
          foreach ($sentence['commentary'] as $n) {
              $refId = is_array($n) ? ($n['id'] ?? null) : $n;
              if (is_string($refId) && $refId !== '') {
                  $content[] = '{{'.$refId.'}}';
              }
          }
          if ($content === []) {
              continue;
          }

          $this->sentenceService->save(
              [
                  'book_id' => $sentId[0],
                  'paragraph' => $sentId[1],
                  'word_start' => $sentId[2],
                  'word_end' => $sentId[3],
                  'channel_uid' => $channel->uid,
                  'content' => implode("\n", $content),
                  'lang' => $channel->lang,
                  'status' => $channel->status,
                  'editor_uid' => $this->model['uid'],
              ]
          );
          $this->info($sentence['id'].' saved');
      }

      return true;
  }
  427. }