UpgradeSystemCommentary.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\RelatedParagraph;
  6. use App\Models\BookTitle;
  7. use App\Models\PaliText;
  8. use App\Models\TagMap;
  9. use App\Models\Tag;
  10. use App\Models\PaliSentence;
  11. use App\Services\SearchPaliDataService;
  12. use App\Services\OpenAIService;
  13. use App\Services\AIModelService;
  14. use App\Services\SentenceService;
  15. use App\Helpers\LlmResponseParser;
  16. use App\Http\Api\ChannelApi;
  17. class UpgradeSystemCommentary extends Command
  18. {
  19. /**
  20. * The name and signature of the console command.
  21. * php artisan upgrade:sys.commentary
  22. * @var string
  23. */
  24. protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=}';
  25. protected $prompt = <<<md
  26. 你是一个注释对照阅读助手。
  27. pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
  28. commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
  29. commentary里面的内容是对pali内容的注释
  30. commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
  31. 你需要按照顺序将commentary中的句子与pali原文对照,。
  32. 输出格式jsonl
  33. 只输出pali数据
  34. 在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
  35. 不要输出content字段,只输出id,commentary字段
  36. 直接输出jsonl数据,无需解释
  37. **关键规则:**
  38. 1. 根据commentary中的句子的意思找到与pali对应的句子
  39. 1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
  40. 2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
  41. 3. 有些pali原文句子可能没有对应的注释
  42. 4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
  43. 5. 同时保持pali的句子数量不变,不要增删
  44. 6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
  45. **输出范例**
  46. {"id":0,"commentary":[0,1]}
  47. {"id":1,"commentary":[2]}
  48. md;
  49. /**
  50. * The console command description.
  51. *
  52. * @var string
  53. */
  54. protected $description = 'Command description';
  55. protected $sentenceService;
  56. protected $modelService;
  57. protected $openAIService;
  58. protected $model;
  59. protected $tokensPerSentence = 0;
  /**
   * Create a new command instance and keep references to the injected services.
   *
   * @param AIModelService  $model  stored as $modelService (used to look up the model given by --model)
   * @param SentenceService $sent   stored as $sentenceService (used when saving aligned sentences)
   * @param OpenAIService   $openAI stored as $openAIService (used to send the alignment request)
   * @return void
   */
  public function __construct(
      AIModelService $model,
      SentenceService $sent,
      OpenAIService $openAI
  ) {
      // The three assignments are independent of each other; the parent
      // constructor is still invoked last, exactly as before.
      $this->openAIService = $openAI;
      $this->sentenceService = $sent;
      $this->modelService = $model;

      parent::__construct();
  }
  /**
   * Execute the console command.
   *
   * With --list, prints every book_name found in RelatedParagraph together with
   * its row count and exits. Otherwise, for every selected book (--book, or all
   * books having cs_para > 0) and every selected cs_para (--para, or all
   * cs_para > 0 of that book), the related paragraphs are grouped by book type
   * and aligned through the LLM: pāḷi against aṭṭhakathā, then aṭṭhakathā
   * against each ṭīkā book. Results are saved to the _System_commentary_ channel.
   *
   * @return int 0 on success, 1 when the target channel cannot be found
   */
  public function handle()
  {
      if ($this->option('list')) {
          // Alias the aggregate so the attribute name does not depend on the database driver.
          $rows = RelatedParagraph::whereNotNull('book_name')
              ->groupBy('book_name')
              ->selectRaw('book_name, count(*) as count')
              ->get();
          foreach ($rows as $row) {
              $this->info($row['book_name'] . "[" . $row['count'] . "]");
          }
          return 0;
      }

      if ($this->option('model')) {
          $this->model = $this->modelService->getModelById($this->option('model'));
          $this->info("model:{$this->model['model']}");
      }

      $channel = ChannelApi::getChannelByName('_System_commentary_');
      if (!$channel) {
          // Fail before any LLM request is made; save() needs the channel's uid/lang/status.
          $message = 'channel _System_commentary_ not found';
          $this->error($message);
          Log::error($message);
          return 1;
      }

      if ($this->option('book')) {
          $books = [['book_name' => $this->option('book')]];
      } else {
          $books = RelatedParagraph::whereNotNull('book_name')
              ->where('cs_para', '>', 0)
              ->groupBy('book_name')
              ->select('book_name')
              ->get()->toArray();
      }

      foreach ($books as $currBook) {
          if ($this->option('para')) {
              $csParagraphs = [['cs_para' => $this->option('para')]];
          } else {
              $csParagraphs = RelatedParagraph::where('book_name', $currBook['book_name'])
                  ->where('cs_para', '>', 0)
                  ->groupBy('cs_para')
                  ->select('cs_para')
                  ->get()->toArray();
          }
          foreach ($csParagraphs as $csParagraph) {
              $this->alignRelatedParagraphs($currBook['book_name'], $csParagraph['cs_para'], $channel);
          }
      }

      return 0;
  }

  /**
   * Align and save everything related to one (book_name, cs_para) pair.
   *
   * @param mixed $bookName value of RelatedParagraph.book_name
   * @param mixed $csPara   value of RelatedParagraph.cs_para
   * @param mixed $channel  channel object passed through to save()
   * @return void
   */
  private function alignRelatedParagraphs($bookName, $csPara, $channel)
  {
      $message = 'ai commentary ' . $bookName . '-' . $csPara;
      $this->info($message);
      Log::info($message);

      $rows = RelatedParagraph::where('book_name', $bookName)
          ->where('cs_para', $csPara)
          ->where('book_id', '>', 0)
          ->orderBy('book_id')
          ->orderBy('para')
          ->get();

      // Group paragraphs by book type, then by book: type => book_id => [paragraph, ...].
      $bookTypes = [];
      $type = [];
      foreach ($rows as $row) {
          $bookId = $row->book_id;
          if (!array_key_exists($bookId, $bookTypes)) {
              // Books without a recognised tag are filed under '' so the lookup is
              // cached once and their paragraph list is not reset on every row.
              $bookTypes[$bookId] = $this->getBookType($bookId) ?? '';
          }
          $type[$bookTypes[$bookId]][$bookId][] = ['book' => $row->book, 'para' => $row->para];
      }

      foreach ($type as $typeName => $booksOfType) {
          Log::debug($typeName);
          foreach ($booksOfType as $bookId => $bookParagraphs) {
              Log::debug($bookId);
              foreach ($bookParagraphs as $bookParagraph) {
                  Log::debug($bookParagraph['book'] . '-' . $bookParagraph['para']);
              }
          }
      }

      $hasPali = $this->hasData($type, 'pāḷi');
      if (!$this->hasData($type, 'aṭṭhakathā')) {
          // Both alignments need the aṭṭhakathā text.
          return;
      }

      // Built once and shared by both alignments. Previously it only existed when
      // pāḷi data was present, so the ṭīkā pass could receive an undefined value
      // or the sentences left over from the previous cs_para.
      $attaJson = $this->collectSentences($type['aṭṭhakathā']);

      // pāḷi <- aṭṭhakathā
      if ($hasPali) {
          $paliJson = $this->collectSentences($type['pāḷi']);
          // LLM alignment, then write to the database.
          $this->save($this->textAlign($paliJson, $attaJson), $channel);
      }

      // aṭṭhakathā <- ṭīkā, one LLM request per ṭīkā book
      if ($this->hasData($type, 'ṭīkā')) {
          $merged = [];
          foreach ($type['ṭīkā'] as $tikaParagraphs) {
              $tikaJson = $this->collectSentences([$tikaParagraphs]);
              $merged = $this->mergeAlignment($merged, $this->textAlign($attaJson, $tikaJson));
          }
          $this->save(array_values($merged), $channel);
      }
  }

  /**
   * Concatenate the sentence records of every paragraph of every given book.
   *
   * @param array $booksOfType list/map of books, each a list of ['book' => ..., 'para' => ...]
   * @return array flat list of the records returned by getParaContent()
   */
  private function collectSentences(array $booksOfType): array
  {
      $chunks = [];
      foreach ($booksOfType as $bookParagraphs) {
          foreach ($bookParagraphs as $paraData) {
              $chunks[] = $this->getParaContent($paraData['book'], $paraData['para']) ?? [];
          }
      }
      // The leading [] keeps the call valid when there are no chunks at all.
      return array_merge([], ...$chunks);
  }

  /**
   * Merge newly aligned rows into rows collected so far, keyed by sentence id.
   * A row whose id is new is added as is; otherwise its commentary list is
   * appended to the existing one.
   *
   * @param array $merged rows collected so far, keyed by sentence id
   * @param mixed $rows   rows returned by textAlign(); ignored when not an array
   * @return array updated map keyed by sentence id (insertion order preserved)
   */
  private function mergeAlignment(array $merged, $rows): array
  {
      if (!is_array($rows)) {
          return $merged;
      }
      foreach ($rows as $row) {
          $id = $row['id'] ?? null;
          if ($id === null) {
              continue;
          }
          if (!isset($merged[$id])) {
              $merged[$id] = $row;
              continue;
          }
          if (isset($row['commentary']) && is_array($row['commentary'])) {
              // The earlier row may carry no (or a non-array) commentary field.
              $existing = $merged[$id]['commentary'] ?? [];
              if (!is_array($existing)) {
                  $existing = [];
              }
              $merged[$id]['commentary'] = array_merge($existing, $row['commentary']);
          }
      }
      return $merged;
  }
  /**
   * Tell whether the grouped data contains at least one paragraph of a type.
   * Logs a warning when it does not.
   *
   * @param mixed $typeData array keyed by type name, each entry keyed by book id
   * @param mixed $typeName type name to look up
   * @return bool true when at least one paragraph exists for $typeName
   */
  private function hasData($typeData, $typeName)
  {
      $paragraphCount = isset($typeData[$typeName])
          ? $this->getParagraphNumber($typeData[$typeName])
          : 0;

      if ($paragraphCount === 0) {
          Log::warning($typeName . ' data is missing');
          return false;
      }

      return true;
  }
  /**
   * Count the paragraphs across all books of one type.
   *
   * @param mixed $type array keyed by book id whose values are paragraph lists
   * @return int sum of the sizes of all paragraph lists; 0 when $type is not an array
   */
  private function getParagraphNumber($type)
  {
      // is_array() alone also rejects null, so no separate isset() is needed.
      if (!is_array($type)) {
          return 0;
      }

      return array_sum(array_map('count', $type));
  }
  /**
   * Determine whether a book is a pāḷi, aṭṭhakathā or ṭīkā text.
   *
   * Looks up the BookTitle row whose sn equals $bookId, finds the uid of the
   * PaliText row at that title's book/paragraph, and returns the first tag
   * attached to that uid whose name is one of the three known types.
   *
   * @param mixed $bookId value matched against BookTitle.sn
   * @return string|null 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'; null when the book,
   *                     its PaliText row or a matching tag cannot be found
   */
  private function getBookType($bookId)
  {
      $bookTitle = BookTitle::where('sn', $bookId)->first();
      if (!$bookTitle) {
          // Previously this dereferenced null and aborted the whole run.
          Log::warning('book title not found', ['sn' => $bookId]);
          return null;
      }

      $paliTextUuid = PaliText::where('book', $bookTitle->book)
          ->where('paragraph', $bookTitle->paragraph)
          ->value('uid');
      if (!$paliTextUuid) {
          // Without a uid the tag lookup below would turn into "anchor_id IS NULL".
          return null;
      }

      // pluck() yields plain id/name values instead of model instances.
      $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
      $tagNames = Tag::whereIn('id', $tagIds)->pluck('name');

      foreach ($tagNames as $tagName) {
          if (in_array($tagName, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
              return $tagName;
          }
      }

      return null;
  }
  /**
   * Load the sentences of one paragraph as records for the LLM request.
   *
   * Sentences are read from PaliSentence ordered by word_begin; the text of
   * each one is fetched through SearchPaliDataService::getSentenceText().
   *
   * @param mixed $book book number of the paragraph
   * @param mixed $para paragraph number
   * @return array list of ['id' => "{book}-{para}-{word_begin}-{word_end}", 'content' => markdown];
   *               an empty list when the paragraph has no sentences
   */
  private function getParaContent($book, $para)
  {
      $textService = app(SearchPaliDataService::class);

      // get() always returns a collection, so "no rows" simply means the loop
      // below does not run; the former null return was unreachable and would
      // have broken callers that array_merge() the result.
      $sentences = PaliSentence::where('book', $book)
          ->where('paragraph', $para)
          ->orderBy('word_begin')
          ->get();

      $json = [];
      foreach ($sentences as $sentence) {
          $content = $textService->getSentenceText($book, $para, $sentence->word_begin, $sentence->word_end);
          $json[] = [
              'id' => "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}",
              // Fall back to an empty string when no markdown text is returned.
              'content' => $content['markdown'] ?? '',
          ];
      }

      return $json;
  }
  /**
   * Replace each record's 'id' with the key it has in $input and return the
   * records as a re-indexed list.
   *
   * @param array $input records (arrays) under arbitrary keys
   * @return array list of the same records, each with 'id' set to its former key
   */
  private function arrayIndexed(array $input): array
  {
      $withKeyAsId = function ($key, $record) {
          $record['id'] = $key;
          return $record;
      };

      // Mapping over two arrays in parallel always yields a 0-based list.
      return array_map($withKeyAsId, array_keys($input), $input);
  }
  /**
   * Translate the numeric indices used in the LLM answer back to real sentence ids.
   *
   * Each answer row is matched to $original by the 'id' the LLM returned; when
   * that is missing or does not exist in $original, the row's position in the
   * answer is used instead. Rows that cannot be matched either way are dropped.
   * Entries of 'commentary' are translated through $commentary; indices that do
   * not exist there are dropped rather than emitted as empty strings.
   *
   * @param array $input      decoded answer rows, e.g. ['id' => 0, 'commentary' => [0, 1]]
   * @param array $original   records that were sent as the text to annotate (each with 'id')
   * @param array $commentary records that were sent as the commentary (each with 'id')
   * @return array list of rows with 'id' and 'commentary' holding the original ids
   */
  private function arrayUnIndexed(array $input, array $original, array $commentary): array
  {
      $isKey = function ($candidate) {
          return is_int($candidate) || is_string($candidate);
      };

      $output = [];
      foreach ($input as $position => $value) {
          if (!is_array($value)) {
              continue;
          }

          // Prefer the index the LLM reported; it may skip or reorder rows.
          $index = $value['id'] ?? null;
          if (!$isKey($index) || !isset($original[$index]['id'])) {
              $index = $position;
          }
          if (!isset($original[$index]['id'])) {
              // More rows than sentences, or an index that points nowhere.
              continue;
          }
          $value['id'] = $original[$index]['id'];

          if (isset($value['commentary'])) {
              // Accept a single index as well as a list of indices.
              $indices = is_array($value['commentary']) ? $value['commentary'] : [$value['commentary']];
              $ids = [];
              foreach ($indices as $n) {
                  if ($isKey($n) && isset($commentary[$n]['id'])) {
                      $ids[] = $commentary[$n]['id'];
                  }
              }
              $value['commentary'] = $ids;
          }

          $output[] = $value;
      }

      return $output;
  }
  /**
   * Ask the LLM which commentary sentences belong to which original sentence.
   *
   * Both lists are re-numbered with 0-based ids, sent as jsonl together with
   * the system prompt, and the answer is mapped back to the real sentence ids.
   * The largest completion_tokens/sentence ratio seen so far is remembered in
   * $tokensPerSentence and used to size max_tokens of later requests.
   * NOTE(review): $tokensPerSentence starts at 0, so the first request is sent
   * with max_tokens 0 - confirm how OpenAIService::setMaxToken() treats 0.
   *
   * @param array $original   sentence records of the text being annotated
   * @param array $commentary sentence records of the annotating text
   * @return array rows ['id' => sentence id, 'commentary' => [sentence id, ...]];
   *               empty when no model is set, either input is empty, or the
   *               answer contains no usable rows
   */
  private function textAlign(array $original, array $commentary)
  {
      if (!$this->model) {
          Log::error('model is invalid');
          return [];
      }

      $totalSentences = count($original) + count($commentary);
      if (count($original) === 0 || count($commentary) === 0) {
          // Nothing can be aligned; skipping also avoids dividing by zero below
          // and a request that cannot produce any commentary link.
          Log::warning('nothing to align', [
              'original' => count($original),
              'commentary' => count($commentary),
          ]);
          return [];
      }

      $originalSn = $this->arrayIndexed($original);
      $commentarySn = $this->arrayIndexed($commentary);
      $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
      $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
      Log::debug('ai request', [
          'original' => $originalText,
          'commentary' => $commentaryText
      ]);

      $maxTokens = (int)($this->tokensPerSentence * $totalSentences * 1.5);
      $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
      Log::debug('requesting…… ' . $this->model['model']);

      $startAt = time();
      $response = $this->openAIService->setApiUrl($this->model['url'])
          ->setModel($this->model['model'])
          ->setApiKey($this->model['key'])
          ->setSystemPrompt($this->prompt)
          ->setTemperature(0.0)
          ->setStream(false)
          ->setMaxToken($maxTokens)
          ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
      $completeAt = time();

      $answer = $response['choices'][0]['message']['content'] ?? '[]';
      Log::debug('ai response', ['data' => $answer]);

      $message = ($completeAt - $startAt) . 's';
      if (isset($response['usage']['completion_tokens'])) {
          Log::debug('usage', $response['usage']);
          $message .= " completion_tokens:" . $response['usage']['completion_tokens'];
          // Keep the highest ratio observed so later requests get enough room.
          $curr = (int)($response['usage']['completion_tokens'] / $totalSentences);
          if ($curr > $this->tokensPerSentence) {
              $this->tokensPerSentence = $curr;
          }
      }
      $this->info($message);

      $json = [];
      if (is_string($answer)) {
          $parsed = LlmResponseParser::jsonl($answer);
          // arrayUnIndexed() only accepts arrays; treat anything else as "no rows".
          $json = $this->arrayUnIndexed(is_array($parsed) ? $parsed : [], $original, $commentary);
          Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
      }
      if (count($json) === 0) {
          Log::error("jsonl is empty");
      }

      return $json;
  }
  /**
   * Store the alignment result: for every row that has commentary, save one
   * sentence in the given channel whose content is a list of {{sentence-id}}
   * references, one per line.
   *
   * Rows without usable commentary references, and rows whose id is not of the
   * form "book-paragraph-wordStart-wordEnd", are skipped.
   *
   * @param mixed $json    rows ['id' => sentence id, 'commentary' => [id or ['id' => id], ...]]
   * @param mixed $channel object providing uid, lang and status of the target channel
   * @return false|null false when $json is not an array, otherwise nothing
   */
  private function save($json, $channel)
  {
      if (!is_array($json)) {
          Log::warning('llm return null');
          return false;
      }

      foreach ($json as $sentence) {
          $arrCommentary = $sentence['commentary'] ?? null;
          if (!is_array($arrCommentary) || count($arrCommentary) === 0) {
              continue;
          }

          // Build the references, leaving out empty or unresolvable entries so
          // no "{{}}" placeholders or blank lines end up in the stored content.
          $content = [];
          foreach ($arrCommentary as $n) {
              if (is_array($n) && isset($n['id'])) {
                  $n = $n['id'];
              }
              if (is_string($n) && $n !== '') {
                  $content[] = '{{' . $n . '}}';
              }
          }
          if (count($content) === 0) {
              continue;
          }

          $rawId = $sentence['id'] ?? '';
          $sentId = is_string($rawId) ? explode('-', $rawId) : [];
          if (count($sentId) !== 4) {
              Log::warning('invalid sentence id', ['id' => $rawId]);
              continue;
          }

          $this->sentenceService->save(
              [
                  'book_id' => $sentId[0],
                  'paragraph' => $sentId[1],
                  'word_start' => $sentId[2],
                  'word_end' => $sentId[3],
                  'channel_uid' => $channel->uid,
                  'content' => implode("\n", $content),
                  'lang' => $channel->lang,
                  'status' => $channel->status,
                  'editor_uid' => $this->model['uid'],
              ]
          );
          $this->info($rawId . ' saved');
      }
  }
  388. }