| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- <?php
- namespace App\Console\Commands;
- use Illuminate\Console\Command;
- use Illuminate\Support\Facades\Log;
- use App\Models\RelatedParagraph;
- use App\Models\BookTitle;
- use App\Models\PaliText;
- use App\Models\TagMap;
- use App\Models\Tag;
- use App\Models\PaliSentence;
- use App\Services\SearchPaliDataService;
- use App\Services\OpenAIService;
- use App\Services\AIModelService;
- use App\Services\SentenceService;
- use App\Helpers\LlmResponseParser;
- use App\Http\Api\ChannelApi;
/**
 * Artisan command that aligns commentary sentences with the sentences they
 * comment on, using an LLM, and stores the alignment in the
 * `_System_commentary_` channel.
 *
 * For every (book_name, cs_para) group found in RelatedParagraph the related
 * paragraphs are bucketed by book type (pāḷi / aṭṭhakathā / ṭīkā). The
 * aṭṭhakathā sentences are aligned against the pāḷi sentences, and each ṭīkā
 * book is aligned against the aṭṭhakathā sentences.
 *
 * Usage: php artisan upgrade:sys.commentary --model=<id> [--book=] [--para=] [--list]
 */
class UpgradeSystemCommentary extends Command
{
    /** Tag names that identify the three book types handled by this command. */
    private const TYPE_PALI = 'pāḷi';
    private const TYPE_ATTHAKATHA = 'aṭṭhakathā';
    private const TYPE_TIKA = 'ṭīkā';

    /**
     * The name and signature of the console command.
     * php artisan upgrade:sys.commentary
     * @var string
     */
    protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=}';

    /**
     * System prompt sent to the LLM. It is part of the runtime behaviour and
     * must not be reformatted.
     *
     * @var string
     */
    protected $prompt = <<<md
你是一个注释对照阅读助手。
pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
commentary里面的内容是对pali内容的注释
commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
你需要按照顺序将commentary中的句子与pali原文对照,。
输出格式jsonl
只输出pali数据
在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
不要输出content字段,只输出id,commentary字段
直接输出jsonl数据,无需解释
**关键规则:**
1. 根据commentary中的句子的意思找到与pali对应的句子
1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
3. 有些pali原文句子可能没有对应的注释
4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
5. 同时保持pali的句子数量不变,不要增删
6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
**输出范例**
{"id":0,"commentary":[0,1]}
{"id":1,"commentary":[2]}
md;

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Align commentary sentences with their source sentences via an LLM and save them to the system commentary channel';

    /** @var SentenceService Persists the generated commentary sentences. */
    protected $sentenceService;

    /** @var AIModelService Resolves the --model option to a model record. */
    protected $modelService;

    /** @var OpenAIService Client used to call the LLM. */
    protected $openAIService;

    /** @var mixed Model record (read via ['model'], ['url'], ['key'], ['uid']); null until resolved. */
    protected $model;

    /** @var int Highest completion-tokens-per-sentence ratio observed so far; drives max_tokens. */
    protected $tokensPerSentence = 0;

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct(
        AIModelService $model,
        SentenceService $sent,
        OpenAIService $openAI
    ) {
        $this->modelService = $model;
        $this->sentenceService = $sent;
        $this->openAIService = $openAI;
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int 0 on success, 1 when the model or the target channel cannot be resolved
     */
    public function handle()
    {
        if ($this->option('list')) {
            $this->listBooks();
            return 0;
        }

        $modelId = $this->option('model');
        if ($modelId) {
            $this->model = $this->modelService->getModelById($modelId);
        }
        if (!$this->model) {
            // Without a model every alignment would be skipped, so stop
            // before running any of the per-paragraph queries.
            Log::error('model is invalid');
            $this->error('A valid --model option is required');
            return 1;
        }
        $this->info("model:{$this->model['model']}");

        $channel = ChannelApi::getChannelByName('_System_commentary_');
        if (!$channel) {
            Log::error('channel _System_commentary_ not found');
            $this->error('channel _System_commentary_ not found');
            return 1;
        }

        foreach ($this->resolveBookNames() as $bookName) {
            foreach ($this->resolveParagraphNumbers($bookName) as $csPara) {
                $this->processParagraph($bookName, $csPara, $channel);
            }
        }
        return 0;
    }

    /**
     * Print every book name together with its number of related paragraphs.
     */
    private function listBooks(): void
    {
        $rows = RelatedParagraph::whereNotNull('book_name')
            ->groupBy('book_name')
            // Explicit alias so the column name does not depend on the database driver.
            ->selectRaw('book_name, count(*) as count')
            ->get();
        foreach ($rows as $row) {
            $this->info($row['book_name'] . "[" . $row['count'] . "]");
        }
    }

    /**
     * Book names to process: the --book option, or every book that has
     * paragraphs with a positive cs_para.
     *
     * @return array list of book names
     */
    private function resolveBookNames(): array
    {
        if ($this->option('book')) {
            return [$this->option('book')];
        }
        return RelatedParagraph::whereNotNull('book_name')
            ->where('cs_para', '>', 0)
            ->groupBy('book_name')
            ->pluck('book_name')
            ->all();
    }

    /**
     * cs_para numbers to process for a book: the --para option, or every
     * positive cs_para recorded for that book.
     *
     * @param mixed $bookName
     * @return array list of cs_para values
     */
    private function resolveParagraphNumbers($bookName): array
    {
        if ($this->option('para')) {
            return [$this->option('para')];
        }
        return RelatedParagraph::where('book_name', $bookName)
            ->where('cs_para', '>', 0)
            ->groupBy('cs_para')
            ->pluck('cs_para')
            ->all();
    }

    /**
     * Align and save the commentary for one (book_name, cs_para) group.
     *
     * @param mixed $bookName
     * @param mixed $csPara
     * @param mixed $channel channel record whose uid/lang/status are written with each sentence
     */
    private function processParagraph($bookName, $csPara, $channel): void
    {
        $message = 'ai commentary ' . $bookName . '-' . $csPara;
        $this->info($message);
        Log::info($message);

        $type = $this->groupParagraphsByType($bookName, $csPara);
        $this->logGrouping($type);

        // Both alignments need the aṭṭhakathā sentences.
        if (!$this->hasData($type, self::TYPE_ATTHAKATHA)) {
            return;
        }
        // Built once per paragraph, outside the pāḷi branch, so the ṭīkā
        // alignment never sees a missing or stale list.
        $attaJson = $this->collectSentences($type[self::TYPE_ATTHAKATHA]);

        // pāḷi <- aṭṭhakathā
        if ($this->hasData($type, self::TYPE_PALI)) {
            $paliJson = $this->collectSentences($type[self::TYPE_PALI]);
            // LLM alignment, then write to the database.
            $this->save($this->textAlign($paliJson, $attaJson), $channel);
        }

        // aṭṭhakathā <- ṭīkā
        if ($this->hasData($type, self::TYPE_TIKA)) {
            // Keyed by sentence id so results of several ṭīkā books merge in O(1).
            $tikaResult = [];
            foreach ($type[self::TYPE_TIKA] as $bookId => $bookParagraphs) {
                $tikaJson = $this->collectSentences([$bookId => $bookParagraphs]);
                $aligned = $this->textAlign($attaJson, $tikaJson);
                // Merge new and old data: add the row when its id is new,
                // otherwise append its commentary references.
                foreach ($aligned as $row) {
                    $id = $row['id'];
                    if (!isset($tikaResult[$id])) {
                        $tikaResult[$id] = $row;
                        continue;
                    }
                    if (isset($row['commentary']) && is_array($row['commentary'])) {
                        $existing = $tikaResult[$id]['commentary'] ?? [];
                        if (!is_array($existing)) {
                            $existing = [];
                        }
                        $tikaResult[$id]['commentary'] = array_merge($existing, $row['commentary']);
                    }
                }
            }
            $this->save(array_values($tikaResult), $channel);
        }
    }

    /**
     * Load the related paragraphs of one (book_name, cs_para) group and
     * bucket them as [bookType => [bookId => [['book' => .., 'para' => ..], ..]]].
     * Paragraphs are gathered into books; books whose type cannot be
     * determined are left out.
     *
     * @param mixed $bookName
     * @param mixed $csPara
     * @return array
     */
    private function groupParagraphsByType($bookName, $csPara): array
    {
        $rows = RelatedParagraph::where('book_name', $bookName)
            ->where('cs_para', $csPara)
            ->where('book_id', '>', 0)
            ->orderBy('book_id')
            ->orderBy('para')
            ->get();

        $bookTypes = [];
        $type = [];
        foreach ($rows as $row) {
            $bookId = $row->book_id;
            // array_key_exists (not isset) so that a null type is cached too
            // and the lookup queries run only once per book.
            if (!array_key_exists($bookId, $bookTypes)) {
                $bookTypes[$bookId] = $this->getBookType($bookId);
                if ($bookTypes[$bookId] === null) {
                    Log::warning('book type is unknown', ['book_id' => $bookId]);
                }
            }
            $bookType = $bookTypes[$bookId];
            if ($bookType === null) {
                continue;
            }
            $type[$bookType][$bookId][] = ['book' => $row->book, 'para' => $row->para];
        }
        return $type;
    }

    /**
     * Write the type / book / paragraph grouping to the debug log.
     */
    private function logGrouping(array $type): void
    {
        foreach ($type as $typeName => $books) {
            Log::debug($typeName);
            foreach ($books as $bookId => $bookParagraphs) {
                Log::debug($bookId);
                foreach ($bookParagraphs as $item) {
                    Log::debug($item['book'] . '-' . $item['para']);
                }
            }
        }
    }

    /**
     * Concatenate the sentences of every paragraph of the given books, in order.
     *
     * @param array $books [bookId => [['book' => .., 'para' => ..], ..]]
     * @return array list of ['id' => string, 'content' => ..]
     */
    private function collectSentences(array $books): array
    {
        $sentences = [];
        foreach ($books as $bookParagraphs) {
            foreach ($bookParagraphs as $item) {
                foreach ($this->getParaContent($item['book'], $item['para']) as $sentence) {
                    $sentences[] = $sentence;
                }
            }
        }
        return $sentences;
    }

    /**
     * Tell whether the grouping holds at least one paragraph of the given
     * type; logs a warning when it does not.
     */
    private function hasData(array $typeData, string $typeName): bool
    {
        if (
            !isset($typeData[$typeName]) ||
            $this->getParagraphNumber($typeData[$typeName]) === 0
        ) {
            Log::warning($typeName . ' data is missing');
            return false;
        }
        return true;
    }

    /**
     * Count the paragraphs of all books in one type bucket.
     *
     * @param mixed $type [bookId => list of paragraphs]; anything else counts as 0
     */
    private function getParagraphNumber($type): int
    {
        if (!is_array($type)) {
            return 0;
        }
        $count = 0;
        foreach ($type as $bookParagraphs) {
            $count += count($bookParagraphs);
        }
        return $count;
    }

    /**
     * Determine the type of a book from the tags attached to its title paragraph.
     *
     * @param mixed $bookId value matched against BookTitle.sn
     * @return string|null 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'; null when the book,
     *                     its title paragraph or a matching tag is not found
     */
    private function getBookType($bookId): ?string
    {
        $bookTitle = BookTitle::where('sn', $bookId)->first();
        if (!$bookTitle) {
            return null;
        }
        $paliTextUuid = PaliText::where('book', $bookTitle->book)
            ->where('paragraph', $bookTitle->paragraph)
            ->value('uid');
        if (!$paliTextUuid) {
            return null;
        }
        $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
        $tagNames = Tag::whereIn('id', $tagIds)->pluck('name');
        $known = [self::TYPE_PALI, self::TYPE_ATTHAKATHA, self::TYPE_TIKA];
        foreach ($tagNames as $name) {
            if (in_array($name, $known, true)) {
                return $name;
            }
        }
        return null;
    }

    /**
     * Load the sentences of one paragraph, ordered by word_begin.
     *
     * @param mixed $book
     * @param mixed $para
     * @return array list of ['id' => "{book}-{para}-{word_begin}-{word_end}", 'content' => markdown];
     *               empty when the paragraph has no sentences
     */
    private function getParaContent($book, $para): array
    {
        $textService = app(SearchPaliDataService::class);
        $sentences = PaliSentence::where('book', $book)
            ->where('paragraph', $para)
            ->orderBy('word_begin')
            ->get();

        $json = [];
        foreach ($sentences as $sentence) {
            $content = $textService->getSentenceText($book, $para, $sentence->word_begin, $sentence->word_end);
            $id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
            $json[] = ['id' => $id, 'content' => $content['markdown']];
        }
        return $json;
    }

    /**
     * Replace every row's 'id' with its array key, so the LLM works with
     * short ids instead of the long sentence ids.
     */
    private function arrayIndexed(array $input): array
    {
        $output = [];
        foreach ($input as $index => $row) {
            $row['id'] = $index;
            $output[] = $row;
        }
        return $output;
    }

    /**
     * Translate the short ids used in the LLM answer back to real sentence ids.
     *
     * Each row is resolved through the 'id' the LLM returned (its position is
     * used only when the row carries no id), so a dropped, reordered or extra
     * row cannot shift the remaining rows onto the wrong sentences. Rows and
     * commentary references that do not match a known sentence are logged and
     * dropped.
     *
     * @param array $input      rows parsed from the LLM answer
     * @param array $original   sentences sent as "pali", keyed by short id
     * @param array $commentary sentences sent as "commentary", keyed by short id
     * @return array rows whose 'id' and 'commentary' hold real sentence ids
     */
    private function arrayUnIndexed(array $input, array $original, array $commentary): array
    {
        $output = [];
        foreach ($input as $position => $row) {
            if (!is_array($row)) {
                continue;
            }
            $index = $row['id'] ?? $position;
            if ((!is_int($index) && !is_string($index)) || !isset($original[$index]['id'])) {
                Log::warning('llm returned an unknown sentence id', ['id' => $index]);
                continue;
            }
            $row['id'] = $original[$index]['id'];

            if (isset($row['commentary'])) {
                // Tolerate a single reference given as a scalar.
                $refs = is_array($row['commentary']) ? $row['commentary'] : [$row['commentary']];
                $resolved = [];
                foreach ($refs as $ref) {
                    if ((is_int($ref) || is_string($ref)) && isset($commentary[$ref]['id'])) {
                        $resolved[] = $commentary[$ref]['id'];
                    } else {
                        Log::warning('llm returned an unknown commentary id', ['id' => $ref]);
                    }
                }
                $row['commentary'] = $resolved;
            }
            $output[] = $row;
        }
        return $output;
    }

    /**
     * Ask the LLM which commentary sentences belong to which original sentence.
     *
     * @param array $original   list of ['id' => .., 'content' => ..] being commented on
     * @param array $commentary list of ['id' => .., 'content' => ..] commenting on $original
     * @return array rows ['id' => sentence id, 'commentary' => [sentence id, ..]];
     *               empty when no model is set, an input is empty or the answer is unusable
     */
    private function textAlign(array $original, array $commentary)
    {
        if (!$this->model) {
            Log::error('model is invalid');
            return [];
        }
        if (count($original) === 0 || count($commentary) === 0) {
            // Nothing to align; also keeps the tokens-per-sentence division below safe.
            Log::warning('nothing to align', [
                'original' => count($original),
                'commentary' => count($commentary),
            ]);
            return [];
        }

        $originalSn = $this->arrayIndexed($original);
        $commentarySn = $this->arrayIndexed($commentary);
        $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
        $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
        Log::debug('ai request', [
            'original' => $originalText,
            'commentary' => $commentaryText
        ]);

        $totalSentences = count($original) + count($commentary);
        // NOTE(review): this is 0 until a first response has been measured;
        // confirm how OpenAIService::setMaxToken treats 0.
        $maxTokens = (int)($this->tokensPerSentence * $totalSentences * 1.5);
        $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
        Log::debug('requesting…… ' . $this->model['model']);

        $startAt = time();
        $response = $this->openAIService->setApiUrl($this->model['url'])
            ->setModel($this->model['model'])
            ->setApiKey($this->model['key'])
            ->setSystemPrompt($this->prompt)
            ->setTemperature(0.0)
            ->setStream(false)
            ->setMaxToken($maxTokens)
            ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
        $completeAt = time();

        $answer = $response['choices'][0]['message']['content'] ?? '[]';
        Log::debug('ai response', ['data' => $answer]);

        $message = ($completeAt - $startAt) . 's';
        if (isset($response['usage']['completion_tokens'])) {
            Log::debug('usage', $response['usage']);
            $message .= " completion_tokens:" . $response['usage']['completion_tokens'];
            // Remember the highest ratio seen so later requests get enough room.
            $curr = (int)($response['usage']['completion_tokens'] / $totalSentences);
            if ($curr > $this->tokensPerSentence) {
                $this->tokensPerSentence = $curr;
            }
        }
        $this->info($message);

        $json = [];
        if (is_string($answer)) {
            $json = LlmResponseParser::jsonl($answer);
            $json = $this->arrayUnIndexed($json, $original, $commentary);
            Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
        }
        if (count($json) === 0) {
            Log::error("jsonl is empty");
        }
        return $json;
    }

    /**
     * Store one commentary sentence per aligned row. The content is one
     * `{{sentence-id}}` placeholder per line.
     *
     * Rows without usable commentary references or with a malformed sentence
     * id are skipped.
     *
     * @param mixed $json    rows returned by textAlign()
     * @param mixed $channel channel record providing uid, lang and status
     * @return bool false when $json is not an array, true otherwise
     */
    private function save($json, $channel): bool
    {
        if (!is_array($json)) {
            Log::warning('llm return null');
            return false;
        }
        foreach ($json as $sentence) {
            if (
                !isset($sentence['id'], $sentence['commentary']) ||
                !is_array($sentence['commentary'])
            ) {
                continue;
            }

            $content = [];
            foreach ($sentence['commentary'] as $ref) {
                if (is_array($ref) && isset($ref['id'])) {
                    $ref = $ref['id'];
                }
                // Skip empty or non-string references instead of writing "{{}}".
                if (is_string($ref) && $ref !== '') {
                    $content[] = '{{' . $ref . '}}';
                }
            }
            if (count($content) === 0) {
                continue;
            }

            // Sentence ids have the form book-paragraph-wordStart-wordEnd.
            $sentId = explode('-', (string)$sentence['id']);
            if (count($sentId) !== 4) {
                Log::warning('invalid sentence id', ['id' => $sentence['id']]);
                continue;
            }

            $this->sentenceService->save(
                [
                    'book_id' => $sentId[0],
                    'paragraph' => $sentId[1],
                    'word_start' => $sentId[2],
                    'word_end' => $sentId[3],
                    'channel_uid' => $channel->uid,
                    'content' => implode("\n", $content),
                    'lang' => $channel->lang,
                    'status' => $channel->status,
                    'editor_uid' => $this->model['uid'],
                ]
            );
            $this->info($sentence['id'] . ' saved');
        }
        return true;
    }
}
|