UpgradeSystemCommentary.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\RelatedParagraph;
  6. use App\Models\BookTitle;
  7. use App\Models\PaliText;
  8. use App\Models\TagMap;
  9. use App\Models\Tag;
  10. use App\Models\PaliSentence;
  11. use App\Services\SearchPaliDataService;
  12. use App\Services\OpenAIService;
  13. use App\Services\AIModelService;
  14. use App\Services\SentenceService;
  15. use App\Helpers\LlmResponseParser;
  16. use App\Http\Api\ChannelApi;
  17. class UpgradeSystemCommentary extends Command
  18. {
  19. /**
  20. * The name and signature of the console command.
  21. * php artisan upgrade:sys.commentary
  22. * @var string
  23. */
  24. protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=}';
  25. protected $prompt = <<<md
  26. 你是一个注释对照阅读助手。
  27. pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
  28. commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
  29. commentary里面的内容是对pali内容的注释
  30. commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
  31. 你需要按照顺序将commentary中的句子与pali原文对照,。
  32. 输出格式jsonl
  33. 只输出pali数据
  34. 在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
  35. 不要输出content字段,只输出id,commentary字段
  36. 直接输出jsonl数据,无需解释
  37. **关键规则:**
  38. 1. 根据commentary中的句子的意思找到与pali对应的句子
  39. 1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
  40. 2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
  41. 3. 有些pali原文句子可能没有对应的注释
  42. 4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
  43. 5. 同时保持pali的句子数量不变,不要增删
  44. 6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
  45. 7. 对照排版时,请保持原来的字体设置:原来是黑体就是黑体,原来不是黑体就不是黑体。尤其是pali巴利原文,请不要改变字体
  46. **输出范例**
  47. [
  48. {
  49. "id": "165-6-112-136",
  50. "content": "Yepi te, bho gotama, ahesuṃ atītamaddhānaṃ arahanto sammāsambuddhā tepi bhagavanto etaparamaṃyeva sammā bhikkhusaṅghaṃ paṭipādesuṃ – seyyathāpi etarahi bhotā gotamena sammā bhikkhusaṅgho paṭipādito.",
  51. "commentary": [
  52. "131-9-35-63",
  53. "131-9-64-72",
  54. "131-9-73-82",
  55. "131-9-83-95",
  56. "131-9-96-130"
  57. ]
  58. }
  59. ]
  60. md;
  61. /**
  62. * The console command description.
  63. *
  64. * @var string
  65. */
  66. protected $description = 'Command description';
  67. protected $sentenceService;
  68. protected $modelService;
  69. protected $openAIService;
  70. protected $model;
  71. protected $tokensPerSentence = 0;
  72. /**
  73. * Create a new command instance.
  74. *
  75. * @return void
  76. */
  77. public function __construct(AIModelService $model, SentenceService $sent, OpenAIService $openAI)
  78. {
  79. $this->modelService = $model;
  80. $this->sentenceService = $sent;
  81. $this->openAIService = $openAI;
  82. parent::__construct();
  83. }
  84. /**
  85. * Execute the console command.
  86. *
  87. * @return int
  88. */
  89. public function handle()
  90. {
  91. if ($this->option('list')) {
  92. $result = RelatedParagraph::whereNotNull('book_name')
  93. ->groupBy('book_name')
  94. ->selectRaw('book_name,count(*)')
  95. ->get();
  96. foreach ($result as $key => $value) {
  97. $this->info($value['book_name'] . "[" . $value['count'] . "]");
  98. }
  99. return 0;
  100. }
  101. if ($this->option('model')) {
  102. $this->model = $this->modelService->getModelById($this->option('model'));
  103. $this->info("model:{$this->model['model']}");
  104. }
  105. $channel = ChannelApi::getChannelByName('_System_commentary_');
  106. $books = [];
  107. if ($this->option('book')) {
  108. $books[] = ['book_name' => $this->option('book')];
  109. } else {
  110. $books = RelatedParagraph::whereNotNull('book_name')
  111. ->where('cs_para', '>', 0)
  112. ->groupBy('book_name')
  113. ->select('book_name')
  114. ->get()->toArray();
  115. }
  116. foreach ($books as $key => $currBook) {
  117. $paragraphs = [];
  118. if ($this->option('para')) {
  119. $paragraphs[] = ['cs_para' => $this->option('para')];
  120. } else {
  121. $paragraphs = RelatedParagraph::where('book_name', $currBook['book_name'])
  122. ->where('cs_para', '>', 0)
  123. ->groupBy('cs_para')
  124. ->select('cs_para')
  125. ->get()->toArray();
  126. }
  127. foreach ($paragraphs as $key => $paragraph) {
  128. $message = 'ai commentary ' . $currBook['book_name'] . '-' . $paragraph['cs_para'];
  129. $this->info($message);
  130. Log::info($message);
  131. $result = RelatedParagraph::where('book_name', $currBook['book_name'])
  132. ->where('cs_para', $paragraph['cs_para'])
  133. ->where('book_id', '>', 0)
  134. ->orderBy('book_id')
  135. ->orderBy('para')
  136. ->get();
  137. $pcdBooks = [];
  138. $type = [];
  139. foreach ($result as $rBook) {
  140. # 把段落整合成书。有几本书就有几条输出纪录
  141. if (!isset($pcdBooks[$rBook->book_id])) {
  142. $bookType = $this->getBookType($rBook->book_id);
  143. $pcdBooks[$rBook->book_id] = $bookType;
  144. if (!isset($type[$bookType])) {
  145. $type[$bookType] = [];
  146. }
  147. $type[$bookType][$rBook->book_id] = [];
  148. }
  149. $currType = $pcdBooks[$rBook->book_id];
  150. $type[$currType][$rBook->book_id][] = ['book' => $rBook->book, 'para' => $rBook->para];
  151. }
  152. foreach ($type as $keyType => $info) {
  153. Log::debug($keyType);
  154. foreach ($info as $bookId => $paragraphs) {
  155. Log::debug($bookId);
  156. foreach ($paragraphs as $paragraph) {
  157. Log::debug($paragraph['book'] . '-' . $paragraph['para']);
  158. }
  159. }
  160. }
  161. //处理pali
  162. if (
  163. $this->hasData($type, 'pāḷi') &&
  164. $this->hasData($type, 'aṭṭhakathā')
  165. ) {
  166. $paliJson = [];
  167. foreach ($type['pāḷi'] as $keyBook => $paragraphs) {
  168. foreach ($paragraphs as $paraData) {
  169. $sentData = $this->getParaContent($paraData['book'], $paraData['para']);
  170. $paliJson = array_merge($paliJson, $sentData);
  171. }
  172. }
  173. $attaJson = [];
  174. foreach ($type['aṭṭhakathā'] as $keyBook => $paragraphs) {
  175. foreach ($paragraphs as $paraData) {
  176. $sentData = $this->getParaContent($paraData['book'], $paraData['para']);
  177. $attaJson = array_merge($attaJson, $sentData);
  178. }
  179. }
  180. //llm 对齐
  181. $result = $this->textAlign($paliJson, $attaJson);
  182. //写入db
  183. $this->save($result, $channel);
  184. }
  185. //处理义注
  186. if (
  187. $this->hasData($type, 'aṭṭhakathā') &&
  188. $this->hasData($type, 'ṭīkā')
  189. ) {
  190. $tikaResult = array();
  191. foreach ($type['ṭīkā'] as $keyBook => $paragraphs) {
  192. $tikaJson = [];
  193. foreach ($paragraphs as $key => $paraData) {
  194. $sentData = $this->getParaContent($paraData['book'], $paraData['para']);
  195. $tikaJson = array_merge($tikaJson, $sentData);
  196. }
  197. //llm 对齐
  198. $result = $this->textAlign($attaJson, $tikaJson);
  199. //将新旧数据合并 如果原来没有,就添加,有,就合并数据
  200. foreach ($result as $new) {
  201. $found = false;
  202. foreach ($tikaResult as $key => $old) {
  203. if ($old['id'] === $new['id']) {
  204. $found = true;
  205. if (isset($new['commentary']) && is_array($new['commentary'])) {
  206. $tikaResult[$key]['commentary'] = array_merge($tikaResult[$key]['commentary'], $new['commentary']);
  207. }
  208. break;
  209. }
  210. }
  211. if (!$found) {
  212. array_push($tikaResult, $new);
  213. }
  214. }
  215. }
  216. //写入db
  217. $this->save($tikaResult, $channel);
  218. }
  219. }
  220. }
  221. return 0;
  222. }
  223. private function hasData($typeData, $typeName)
  224. {
  225. if (
  226. !isset($typeData[$typeName]) ||
  227. $this->getParagraphNumber($typeData[$typeName]) === 0
  228. ) {
  229. Log::warning($typeName . ' data is missing');
  230. return false;
  231. }
  232. return true;
  233. }
  234. private function getParagraphNumber($type)
  235. {
  236. if (!isset($type) || !is_array($type)) {
  237. return 0;
  238. }
  239. $count = 0;
  240. foreach ($type as $bookId => $paragraphs) {
  241. $count += count($paragraphs);
  242. }
  243. return $count;
  244. }
  245. private function getBookType($bookId)
  246. {
  247. $bookTitle = BookTitle::where('sn', $bookId)->first();
  248. $paliTextUuid = PaliText::where('book', $bookTitle->book)->where('paragraph', $bookTitle->paragraph)->value('uid');
  249. $tagIds = TagMap::where('anchor_id', $paliTextUuid)->select('tag_id')->get();
  250. $tags = Tag::whereIn('id', $tagIds)->select('name')->get();
  251. foreach ($tags as $key => $tag) {
  252. if (in_array($tag->name, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'])) {
  253. return $tag->name;
  254. }
  255. }
  256. return null;
  257. }
  258. private function getParaContent($book, $para)
  259. {
  260. $sentenceService = app(SearchPaliDataService::class);
  261. $sentences = PaliSentence::where('book', $book)
  262. ->where('paragraph', $para)
  263. ->orderBy('word_begin')
  264. ->get();
  265. if (!$sentences) {
  266. return null;
  267. }
  268. $json = [];
  269. foreach ($sentences as $key => $sentence) {
  270. $content = $sentenceService->getSentenceText($book, $para, $sentence->word_begin, $sentence->word_end);
  271. $id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
  272. $json[] = ['id' => $id, 'content' => $content['markdown']];
  273. }
  274. return $json;
  275. }
  276. private function arrayIndexed(array $input): array
  277. {
  278. $output = [];
  279. foreach ($input as $key => $value) {
  280. $value['id'] = $key;
  281. $output[] = $value;
  282. }
  283. return $output;
  284. }
  285. private function arrayUnIndexed(array $input, array $original, array $commentary): array
  286. {
  287. $output = [];
  288. foreach ($input as $key => $value) {
  289. $value['id'] = $original[$key]['id'];
  290. if (isset($value['commentary'])) {
  291. $newCommentary = array_map(function ($n) use ($commentary) {
  292. if (isset($commentary[$n])) {
  293. return $commentary[$n]['id'];
  294. }
  295. return '';
  296. }, $value['commentary']);
  297. $value['commentary'] = $newCommentary;
  298. }
  299. $output[] = $value;
  300. }
  301. return $output;
  302. }
  303. private function textAlign(array $original, array $commentary)
  304. {
  305. if (!$this->model) {
  306. Log::error('model is invalid');
  307. return [];
  308. }
  309. $originalSn = $this->arrayIndexed($original);
  310. $commentarySn = $this->arrayIndexed($commentary);
  311. $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
  312. $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
  313. Log::debug('ai request', [
  314. 'original' => $originalText,
  315. 'commentary' => $commentaryText
  316. ]);
  317. $totalSentences = count($original) + count($commentary);
  318. $maxTokens = (int)($this->tokensPerSentence * $totalSentences * 1.5);
  319. $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
  320. Log::debug('requesting…… ' . $this->model['model']);
  321. $startAt = time();
  322. $response = $this->openAIService->setApiUrl($this->model['url'])
  323. ->setModel($this->model['model'])
  324. ->setApiKey($this->model['key'])
  325. ->setSystemPrompt($this->prompt)
  326. ->setTemperature(0.0)
  327. ->setStream(false)
  328. ->setMaxToken($maxTokens)
  329. ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
  330. $completeAt = time();
  331. $answer = $response['choices'][0]['message']['content'] ?? '[]';
  332. Log::debug('ai response', ['data' => $answer]);
  333. $message = ($completeAt - $startAt) . 's';
  334. if (isset($response['usage']['completion_tokens'])) {
  335. Log::debug('usage', $response['usage']);
  336. $message .= " completion_tokens:" . $response['usage']['completion_tokens'];
  337. $curr = (int)($response['usage']['completion_tokens'] / $totalSentences);
  338. if ($curr > $this->tokensPerSentence) {
  339. $this->tokensPerSentence = $curr;
  340. }
  341. }
  342. $this->info($message);
  343. $json = [];
  344. if (is_string($answer)) {
  345. $json = LlmResponseParser::jsonl($answer);
  346. $json = $this->arrayUnIndexed($json, $original, $commentary);
  347. Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
  348. }
  349. if (count($json) === 0) {
  350. Log::error("jsonl is empty");
  351. }
  352. return $json;
  353. }
  354. private function save($json, $channel)
  355. {
  356. if (!is_array($json)) {
  357. Log::warning('llm return null');
  358. return false;
  359. }
  360. foreach ($json as $key => $sentence) {
  361. if (!isset($sentence['commentary'])) {
  362. continue;
  363. }
  364. $sentId = explode('-', $sentence['id']);
  365. $arrCommentary = $sentence['commentary'];
  366. if (
  367. isset($arrCommentary) &&
  368. is_array($arrCommentary) &&
  369. count($arrCommentary) > 0
  370. ) {
  371. $content = array_map(function ($n) {
  372. if (is_string($n)) {
  373. return '{{' . $n . '}}';
  374. } else if (is_array($n) && isset($n['id']) && is_string($n['id'])) {
  375. return '{{' . $n['id'] . '}}';
  376. } else {
  377. return '';
  378. }
  379. }, $arrCommentary);
  380. $this->sentenceService->save(
  381. [
  382. 'book_id' => $sentId[0],
  383. 'paragraph' => $sentId[1],
  384. 'word_start' => $sentId[2],
  385. 'word_end' => $sentId[3],
  386. 'channel_uid' => $channel->uid,
  387. 'content' => implode("\n", $content),
  388. 'lang' => $channel->lang,
  389. 'status' => $channel->status,
  390. 'editor_uid' => $this->model['uid'],
  391. ]
  392. );
  393. $this->info($sentence['id'] . ' saved');
  394. }
  395. }
  396. }
  397. }