visuddhinanda 3 mēneši atpakaļ
vecāks
revīzija
8ac27b6c39

+ 425 - 0
api-v8/app/Console/Commands/UpgradeSystemCommentary.php

@@ -0,0 +1,425 @@
+<?php
+
+namespace App\Console\Commands;
+
+use Illuminate\Console\Command;
+use Illuminate\Support\Facades\Log;
+
+use App\Models\RelatedParagraph;
+use App\Models\BookTitle;
+use App\Models\PaliText;
+use App\Models\TagMap;
+use App\Models\Tag;
+use App\Models\PaliSentence;
+
+use App\Services\SearchPaliDataService;
+use App\Services\OpenAIService;
+use App\Services\AIModelService;
+use App\Services\SentenceService;
+
+use App\Helpers\LlmResponseParser;
+use App\Http\Api\ChannelApi;
+
+class UpgradeSystemCommentary extends Command
+{
+    /**
+     * The name and signature of the console command.
+     * php artisan upgrade:sys.commentary
+     *
+     * Options: --model (id passed to AIModelService::getModelById, required
+     * for alignment), --book / --para (restrict the run to one book name or
+     * one cs_para value), --list (print the known book names and exit).
+     *
+     * @var string
+     */
+    protected $signature = 'upgrade:sys.commentary {--book=} {--para=} {--list} {--model=}';
+
+    /**
+     * System prompt sent with every alignment request.
+     *
+     * It asks the model to add, to every "pali" record, a "commentary" field
+     * holding the ids of the commentary records that explain it, and to
+     * answer with JSONL only.
+     *
+     * NOTE(review): the embedded example is a pretty-printed JSON array that
+     * still carries a "content" field, while the instructions above it ask
+     * for JSONL without "content" - confirm which shape is intended.
+     *
+     * @var string
+     */
+    protected $prompt = <<<md
+    你是一个注释对照阅读助手。
+    pali 是巴利原文,jsonl格式, 每条记录是一个句子。包括id 和 content 两个字段
+    commentary 是pali的注释,jsonl 格式,每条记录是一个句子。包括id 和 content 两个字段
+    commentary里面的内容是对pali内容的注释
+    commentary里面的黑体字,说明该句子是注释pali中的对应的巴利文。
+    你需要按照顺序将commentary中的句子与pali原文对照,。
+    输出格式jsonl
+    只输出pali数据
+    在pali句子数据里面增加一个字段“commentary” 里面放这个句子对应的commentary句子的id
+    不要输出content字段,只输出id,commentary字段
+    直接输出jsonl数据,无需解释
+
+**关键规则:**
+1. 根据commentary中的句子的意思找到与pali对应的句子
+1. 如果commentary中的某个句子**有黑体字**,它应该放在pali中对应巴利词汇出现的句子之后
+2. 如果commentary中的某个句子**没有黑体字**,请将其与**上面最近的有黑体字的commentary句子**合并在一起(保持在同一个引用块内),不要单独成行
+3. 有些pali原文句子可能没有对应的注释
+4. 请不要遗漏任何commentary中的句子,也不要打乱顺序
+5. 同时保持pali的句子数量不变,不要增删
+6. 应该将全部commentary中的句子都与pali句子对应,不要有遗漏
+7. 对照排版时,请保持原来的字体设置:原来是黑体就是黑体,原来不是黑体就不是黑体。尤其是pali巴利原文,请不要改变字体
+
+**输出范例**
+[
+    {
+        "id": "165-6-112-136",
+        "content": "Yepi te, bho gotama, ahesuṃ atītamaddhānaṃ arahanto sammāsambuddhā tepi bhagavanto etaparamaṃyeva sammā bhikkhusaṅghaṃ paṭipādesuṃ – seyyathāpi etarahi bhotā gotamena sammā bhikkhusaṅgho paṭipādito.",
+        "commentary": [
+            "131-9-35-63",
+            "131-9-64-72",
+            "131-9-73-82",
+            "131-9-83-95",
+            "131-9-96-130"
+        ]
+    }
+]
+md;
+    /**
+     * The console command description.
+     *
+     * @var string
+     */
+    protected $description = 'Command description';
+
+    /** @var SentenceService persists the generated commentary sentences */
+    protected $sentenceService;
+
+    /** @var AIModelService looks up the model record named by --model */
+    protected $modelService;
+
+    /** @var OpenAIService sends the chat completion requests */
+    protected $openAIService;
+
+    /** @var mixed model record; its 'model', 'url', 'key' and 'uid' entries are read */
+    protected $model;
+
+    /**
+     * Highest completion-tokens-per-sentence ratio observed so far in this
+     * run; used to size max_tokens for the following requests.
+     *
+     * @var int
+     */
+    protected $tokensPerSentence = 0;
+
+    /**
+     * Create a new command instance.
+     *
+     * @return void
+     */
+    public function __construct(AIModelService $model, SentenceService $sent, OpenAIService $openAI)
+    {
+        $this->modelService = $model;
+        $this->sentenceService = $sent;
+        $this->openAIService = $openAI;
+        parent::__construct();
+    }
+
+    /**
+     * Execute the console command.
+     *
+     * @return int 0 on success, 1 when the model or the target channel is missing
+     */
+    public function handle()
+    {
+        if ($this->option('list')) {
+            $this->listBooks();
+            return 0;
+        }
+
+        // Every alignment needs the model and every saved row uses its uid as
+        // editor, so fail fast instead of scanning the whole corpus for nothing.
+        if (!$this->option('model')) {
+            $this->error('the --model option is required');
+            return 1;
+        }
+        $this->model = $this->modelService->getModelById($this->option('model'));
+        if (!$this->model) {
+            $this->error('model not found: ' . $this->option('model'));
+            return 1;
+        }
+        $this->info("model:{$this->model['model']}");
+
+        $channel = ChannelApi::getChannelByName('_System_commentary_');
+        if (!$channel) {
+            $this->error('channel _System_commentary_ not found');
+            return 1;
+        }
+
+        foreach ($this->resolveBooks() as $currBook) {
+            foreach ($this->resolveParagraphs($currBook['book_name']) as $paragraph) {
+                $this->processParagraph($currBook['book_name'], $paragraph['cs_para'], $channel);
+            }
+        }
+
+        return 0;
+    }
+
+    /**
+     * Print every book name together with its number of related-paragraph rows.
+     *
+     * @return void
+     */
+    private function listBooks()
+    {
+        // The aggregate is aliased explicitly because the default column name
+        // of count(*) differs between database drivers.
+        $result = RelatedParagraph::whereNotNull('book_name')
+            ->groupBy('book_name')
+            ->selectRaw('book_name, count(*) as total')
+            ->get();
+        foreach ($result as $value) {
+            $this->info($value['book_name'] . "[" . $value['total'] . "]");
+        }
+    }
+
+    /**
+     * Books to process: the single --book value, or every book that has
+     * rows with cs_para > 0.
+     *
+     * @return array list of ['book_name' => ...]
+     */
+    private function resolveBooks(): array
+    {
+        if ($this->option('book')) {
+            return [['book_name' => $this->option('book')]];
+        }
+        return RelatedParagraph::whereNotNull('book_name')
+            ->where('cs_para', '>', 0)
+            ->groupBy('book_name')
+            ->select('book_name')
+            ->get()->toArray();
+    }
+
+    /**
+     * cs_para values to process for one book: the single --para value, or
+     * every distinct positive cs_para of that book.
+     *
+     * @param mixed $bookName
+     * @return array list of ['cs_para' => ...]
+     */
+    private function resolveParagraphs($bookName): array
+    {
+        if ($this->option('para')) {
+            return [['cs_para' => $this->option('para')]];
+        }
+        return RelatedParagraph::where('book_name', $bookName)
+            ->where('cs_para', '>', 0)
+            ->groupBy('cs_para')
+            ->select('cs_para')
+            ->get()->toArray();
+    }
+
+    /**
+     * Align and store the commentary links of one book / cs_para pair:
+     * pāḷi sentences against aṭṭhakathā, then aṭṭhakathā against each ṭīkā.
+     *
+     * @param mixed $bookName
+     * @param mixed $csPara
+     * @param mixed $channel channel whose uid, lang and status are written to each row
+     * @return void
+     */
+    private function processParagraph($bookName, $csPara, $channel)
+    {
+        $message = 'ai commentary ' . $bookName . '-' . $csPara;
+        $this->info($message);
+        Log::info($message);
+
+        $type = $this->groupParagraphsByType($bookName, $csPara);
+
+        // Both passes need the aṭṭhakathā sentences, so they are collected
+        // once up front. Building them only inside the pāḷi pass would let
+        // the ṭīkā pass see an undefined or stale list when pāḷi is missing.
+        if (!$this->hasData($type, 'aṭṭhakathā')) {
+            return;
+        }
+        $attaJson = $this->collectSentences($type['aṭṭhakathā']);
+
+        // pāḷi <- aṭṭhakathā
+        if ($this->hasData($type, 'pāḷi')) {
+            $paliJson = $this->collectSentences($type['pāḷi']);
+            $this->save($this->textAlign($paliJson, $attaJson), $channel);
+        }
+
+        // aṭṭhakathā <- ṭīkā, one request per ṭīkā book
+        if ($this->hasData($type, 'ṭīkā')) {
+            $merged = [];
+            foreach ($type['ṭīkā'] as $tikaParagraphs) {
+                $tikaJson = $this->collectSentences([$tikaParagraphs]);
+                foreach ($this->textAlign($attaJson, $tikaJson) as $row) {
+                    if (!isset($row['id'])) {
+                        continue;
+                    }
+                    $id = $row['id'];
+                    if (!isset($merged[$id])) {
+                        // first result for this sentence
+                        $merged[$id] = $row;
+                        continue;
+                    }
+                    if (isset($row['commentary']) && is_array($row['commentary'])) {
+                        // append to what earlier ṭīkā books contributed
+                        $known = $merged[$id]['commentary'] ?? [];
+                        $merged[$id]['commentary'] = array_merge(
+                            is_array($known) ? $known : [],
+                            $row['commentary']
+                        );
+                    }
+                }
+            }
+            $this->save(array_values($merged), $channel);
+        }
+    }
+
+    /**
+     * Load the related paragraphs of one book / cs_para pair and group them
+     * as $type[<book type>][<book_id>] = list of ['book' => ..., 'para' => ...].
+     *
+     * @param mixed $bookName
+     * @param mixed $csPara
+     * @return array
+     */
+    private function groupParagraphsByType($bookName, $csPara): array
+    {
+        $rows = RelatedParagraph::where('book_name', $bookName)
+            ->where('cs_para', $csPara)
+            ->where('book_id', '>', 0)
+            ->orderBy('book_id')
+            ->orderBy('para')
+            ->get();
+
+        $bookTypes = [];
+        $type = [];
+        foreach ($rows as $row) {
+            // array_key_exists (not isset) so a book without a type is looked
+            // up only once instead of once per paragraph.
+            if (!array_key_exists($row->book_id, $bookTypes)) {
+                $bookTypes[$row->book_id] = $this->getBookType($row->book_id);
+            }
+            $bookType = $bookTypes[$row->book_id];
+            if ($bookType === null) {
+                // neither pāḷi, aṭṭhakathā nor ṭīkā: nothing consumes it
+                Log::debug('book type unknown', ['book_id' => $row->book_id]);
+                continue;
+            }
+            $type[$bookType][$row->book_id][] = ['book' => $row->book, 'para' => $row->para];
+        }
+
+        foreach ($type as $keyType => $info) {
+            Log::debug($keyType);
+            foreach ($info as $bookId => $bookParagraphs) {
+                Log::debug($bookId);
+                foreach ($bookParagraphs as $entry) {
+                    Log::debug($entry['book'] . '-' . $entry['para']);
+                }
+            }
+        }
+
+        return $type;
+    }
+
+    /**
+     * Concatenate the sentences of every paragraph of every given book.
+     *
+     * @param array $books list or map of paragraph lists (['book' => ..., 'para' => ...])
+     * @return array list of ['id' => ..., 'content' => ...]
+     */
+    private function collectSentences(array $books): array
+    {
+        $sentences = [];
+        foreach ($books as $bookParagraphs) {
+            foreach ($bookParagraphs as $paraData) {
+                foreach ($this->getParaContent($paraData['book'], $paraData['para']) as $sentence) {
+                    $sentences[] = $sentence;
+                }
+            }
+        }
+        return $sentences;
+    }
+
+    /**
+     * Tell whether the grouped data holds at least one paragraph of the
+     * given type; logs a warning when it does not.
+     *
+     * @param array $typeData
+     * @param string $typeName
+     * @return bool
+     */
+    private function hasData($typeData, $typeName)
+    {
+        if (
+            !isset($typeData[$typeName]) ||
+            $this->getParagraphNumber($typeData[$typeName]) === 0
+        ) {
+            Log::warning($typeName . ' data is missing');
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Count the paragraphs over all books of one type.
+     *
+     * @param mixed $type map of book id => paragraph list
+     * @return int 0 when $type is not an array
+     */
+    private function getParagraphNumber($type)
+    {
+        if (!is_array($type)) {
+            return 0;
+        }
+        $count = 0;
+        foreach ($type as $bookParagraphs) {
+            $count += count($bookParagraphs);
+        }
+        return $count;
+    }
+
+    /**
+     * Resolve the type of a book from the tags attached to its title paragraph.
+     *
+     * @param mixed $bookId value matched against BookTitle.sn
+     * @return string|null 'pāḷi', 'aṭṭhakathā' or 'ṭīkā'; null when the book,
+     *                     its title paragraph or a matching tag cannot be found
+     */
+    private function getBookType($bookId)
+    {
+        $bookTitle = BookTitle::where('sn', $bookId)->first();
+        if (!$bookTitle) {
+            Log::warning('book title not found', ['book_id' => $bookId]);
+            return null;
+        }
+        $paliTextUuid = PaliText::where('book', $bookTitle->book)
+            ->where('paragraph', $bookTitle->paragraph)
+            ->value('uid');
+        if (!$paliTextUuid) {
+            // Querying tag maps with a null anchor would match unrelated rows.
+            return null;
+        }
+        $tagIds = TagMap::where('anchor_id', $paliTextUuid)->pluck('tag_id');
+        $tagNames = Tag::whereIn('id', $tagIds)->pluck('name');
+        foreach ($tagNames as $tagName) {
+            if (in_array($tagName, ['pāḷi', 'aṭṭhakathā', 'ṭīkā'], true)) {
+                return $tagName;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Load the sentences of one paragraph, ordered by word_begin.
+     *
+     * @param mixed $book
+     * @param mixed $para
+     * @return array list of ['id' => "book-para-begin-end", 'content' => markdown text];
+     *               empty when the paragraph has no sentences
+     */
+    private function getParaContent($book, $para)
+    {
+        $textService = app(SearchPaliDataService::class);
+        $sentences = PaliSentence::where('book', $book)
+            ->where('paragraph', $para)
+            ->orderBy('word_begin')
+            ->get();
+        $json = [];
+        foreach ($sentences as $sentence) {
+            $content = $textService->getSentenceText($book, $para, $sentence->word_begin, $sentence->word_end);
+            $id = "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}";
+            $json[] = ['id' => $id, 'content' => $content['markdown']];
+        }
+        return $json;
+    }
+
+    /**
+     * Replace every record's id with its array key, so the model only has
+     * to echo short serial numbers.
+     *
+     * @param array $input
+     * @return array
+     */
+    private function arrayIndexed(array $input): array
+    {
+        $output  = [];
+        foreach ($input as $key => $value) {
+            $value['id'] = $key;
+            $output[] = $value;
+        }
+        return $output;
+    }
+
+    /**
+     * Normalise a serial number echoed by the model.
+     *
+     * @param mixed $value
+     * @return int|null the serial, or null when $value is not a non-negative integer
+     */
+    private function toSerial($value)
+    {
+        if (is_int($value)) {
+            return $value;
+        }
+        if (is_string($value) && ctype_digit($value)) {
+            return (int)$value;
+        }
+        return null;
+    }
+
+    /**
+     * Translate the serial numbers in the model's answer back to sentence ids.
+     *
+     * Rows are matched through the serial the model returned in "id", so a
+     * skipped or reordered line does not shift every following row. The
+     * position is used only when a row has no "id" at all. Rows and
+     * references that match nothing are dropped.
+     *
+     * @param array $input parsed model answer
+     * @param array $original sentences that were sent as "pali"
+     * @param array $commentary sentences that were sent as "commentary"
+     * @return array rows with 'id' and, when present, 'commentary' as sentence ids
+     */
+    private function arrayUnIndexed(array $input, array $original, array $commentary): array
+    {
+        $output  = [];
+        foreach ($input as $position => $value) {
+            if (!is_array($value)) {
+                continue;
+            }
+            $serial = array_key_exists('id', $value) ? $this->toSerial($value['id']) : $position;
+            if ($serial === null || !isset($original[$serial]['id'])) {
+                Log::warning('llm returned an unknown sentence id', ['row' => $value]);
+                continue;
+            }
+            $value['id'] = $original[$serial]['id'];
+            if (isset($value['commentary'])) {
+                $refs = is_array($value['commentary']) ? $value['commentary'] : [$value['commentary']];
+                $ids = [];
+                foreach ($refs as $ref) {
+                    if (is_array($ref) && isset($ref['id'])) {
+                        // tolerate {"id": n} objects in place of bare serials
+                        $ref = $ref['id'];
+                    }
+                    $refSerial = $this->toSerial($ref);
+                    if ($refSerial !== null && isset($commentary[$refSerial]['id'])) {
+                        $ids[] = $commentary[$refSerial]['id'];
+                    }
+                }
+                $value['commentary'] = $ids;
+            }
+            $output[] = $value;
+        }
+        return $output;
+    }
+
+    /**
+     * Ask the model which commentary sentences belong to which original sentence.
+     *
+     * @param array $original list of ['id' => ..., 'content' => ...]
+     * @param array $commentary list of ['id' => ..., 'content' => ...]
+     * @return array rows of ['id' => sentence id, 'commentary' => sentence ids];
+     *               empty when no model is set, an input is empty or the
+     *               answer cannot be parsed
+     */
+    private function textAlign(array $original, array $commentary)
+    {
+        if (!$this->model) {
+            Log::error('model is invalid');
+            return [];
+        }
+        if (count($original) === 0 || count($commentary) === 0) {
+            // Nothing to align; also keeps the per-sentence division below
+            // away from a zero divisor.
+            Log::warning('nothing to align', [
+                'original' => count($original),
+                'commentary' => count($commentary)
+            ]);
+            return [];
+        }
+        $originalSn  = $this->arrayIndexed($original);
+        $commentarySn  = $this->arrayIndexed($commentary);
+
+        $originalText = "```jsonl\n" . LlmResponseParser::jsonl_encode($originalSn) . "\n```";
+        $commentaryText = "```jsonl\n" . LlmResponseParser::jsonl_encode($commentarySn) . "\n```";
+
+        Log::debug('ai request', [
+            'original' => $originalText,
+            'commentary' => $commentaryText
+        ]);
+
+        $totalSentences = count($original) + count($commentary);
+        // NOTE(review): this is 0 on the first request of a run; presumably
+        // OpenAIService treats 0 as "no limit" - confirm.
+        $maxTokens = (int)($this->tokensPerSentence * $totalSentences * 1.5);
+        $this->info("requesting…… {$totalSentences} sentences {$this->tokensPerSentence}tokens/sentence set {$maxTokens} max_tokens");
+        Log::debug('requesting…… ' . $this->model['model']);
+        $startAt = time();
+        $response = $this->openAIService->setApiUrl($this->model['url'])
+            ->setModel($this->model['model'])
+            ->setApiKey($this->model['key'])
+            ->setSystemPrompt($this->prompt)
+            ->setTemperature(0.0)
+            ->setStream(false)
+            ->setMaxToken($maxTokens)
+            ->send("# pali\n\n{$originalText}\n\n# commentary\n\n{$commentaryText}");
+        $completeAt = time();
+        $answer = $response['choices'][0]['message']['content'] ?? '[]';
+        Log::debug('ai response', ['data' => $answer]);
+        $message = ($completeAt - $startAt) . 's';
+
+        if (isset($response['usage']['completion_tokens'])) {
+            Log::debug('usage', $response['usage']);
+            $message .= " completion_tokens:" . $response['usage']['completion_tokens'];
+            // remember the largest ratio seen so later limits are not too tight
+            $curr = (int)($response['usage']['completion_tokens'] / $totalSentences);
+            if ($curr > $this->tokensPerSentence) {
+                $this->tokensPerSentence = $curr;
+            }
+        }
+        $this->info($message);
+        $json = [];
+        if (is_string($answer)) {
+            $json = LlmResponseParser::jsonl($answer);
+            $json = $this->arrayUnIndexed($json, $original, $commentary);
+            Log::debug(json_encode($json, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
+        }
+        if (count($json) === 0) {
+            Log::error("jsonl is empty");
+        }
+
+        return $json;
+    }
+
+    /**
+     * Store one commentary sentence per aligned row.
+     *
+     * The stored content is one "{{sentence id}}" reference per line. Rows
+     * without usable references or with a malformed id are skipped.
+     *
+     * @param mixed $json rows of ['id' => "book-para-begin-end", 'commentary' => ids]
+     * @param mixed $channel channel whose uid, lang and status are written to each row
+     * @return bool false when $json is not an array, true otherwise
+     */
+    private function save($json, $channel)
+    {
+        if (!is_array($json)) {
+            Log::warning('llm return null');
+            return false;
+        }
+        foreach ($json as $sentence) {
+            if (
+                !isset($sentence['id'], $sentence['commentary']) ||
+                !is_string($sentence['id']) ||
+                !is_array($sentence['commentary'])
+            ) {
+                continue;
+            }
+            // book - paragraph - word_start - word_end
+            $sentId = explode('-', $sentence['id']);
+            if (count($sentId) !== 4) {
+                Log::warning('invalid sentence id', ['id' => $sentence['id']]);
+                continue;
+            }
+            $content = [];
+            foreach ($sentence['commentary'] as $n) {
+                if (is_array($n) && isset($n['id'])) {
+                    $n = $n['id'];
+                }
+                if (is_string($n) && $n !== '') {
+                    $content[] = '{{' . $n . '}}';
+                }
+            }
+            if (count($content) === 0) {
+                continue;
+            }
+            $this->sentenceService->save(
+                [
+                    'book_id' => $sentId[0],
+                    'paragraph' => $sentId[1],
+                    'word_start' => $sentId[2],
+                    'word_end' => $sentId[3],
+                    'channel_uid' => $channel->uid,
+                    'content' => implode("\n", $content),
+                    'lang' => $channel->lang,
+                    'status' => $channel->status,
+                    'editor_uid' => $this->model['uid'],
+                ]
+            );
+            $this->info($sentence['id'] . ' saved');
+        }
+        return true;
+    }
+}

+ 130 - 0
api-v8/app/Helpers/LlmResponseParser.php

@@ -0,0 +1,130 @@
+<?php
+
+namespace App\Helpers;
+
+use Illuminate\Support\Facades\Log;
+
+/**
+ * Helpers for converting raw LLM answers to PHP arrays and for encoding
+ * PHP arrays as JSONL.
+ *
+ * @package App\Helpers
+ */
+class LlmResponseParser
+{
+    /**
+     * Parse a JSON document returned by an LLM.
+     *
+     * Accepts plain JSON, JSON wrapped in a Markdown code fence
+     * (```json ... ``` or ``` ... ```), and JSON surrounded by explanatory
+     * text.
+     *
+     * @param string $input raw LLM answer
+     * @return array decoded data; an empty array when nothing could be decoded
+     */
+    public static function json(string $input): array
+    {
+        // 1. Prefer the content of the first fenced code block.
+        //    s: dot matches newlines; .*? stops at the first closing fence.
+        $pattern = '/```(?:json)?\s*(.*?)\s*```/s';
+
+        if (preg_match($pattern, $input, $matches)) {
+            $jsonString = trim($matches[1]);
+        } else {
+            // No fence: treat the whole answer as the candidate.
+            $jsonString = trim($input);
+        }
+
+        // 2. Decode as associative array. json_decode yields null on error,
+        //    so is_array() alone tells success from failure.
+        $data = json_decode($jsonString, true);
+        if (is_array($data)) {
+            return $data;
+        }
+
+        // 3. Surrounding prose: retry with the span from the first opening
+        //    bracket to the last closing bracket.
+        $start = strcspn($jsonString, '{[');
+        $end = max((int)strrpos($jsonString, '}'), (int)strrpos($jsonString, ']'));
+        if ($end > $start) {
+            $data = json_decode(substr($jsonString, $start, $end - $start + 1), true);
+            if (is_array($data)) {
+                return $data;
+            }
+        }
+
+        Log::error('解析失败' . $input);
+        return [];
+    }
+
+    /**
+     * Parse a JSONL (JSON Lines) answer returned by an LLM.
+     *
+     * Accepts plain JSONL, JSONL wrapped in a Markdown code fence (also when
+     * the closing fence is missing because the answer was truncated), lines
+     * ending in a comma, and - when the lines are not valid on their own - a
+     * JSON array or object spread over several lines. Invalid lines are
+     * skipped; every valid line is returned.
+     *
+     * @param string $input raw LLM answer
+     * @return array one element per decoded record; empty when nothing could be decoded
+     */
+    public static function jsonl(string $input): array
+    {
+        // 1. Extract the first fenced block. The closing fence is optional so
+        //    a truncated answer does not leave the opening fence in the data.
+        $pattern = '/```(?:jsonl?)?\s*(.*?)\s*(?:```|$)/s';
+
+        if (preg_match($pattern, $input, $matches)) {
+            $jsonlString = trim($matches[1]);
+        } else {
+            $jsonlString = trim($input);
+        }
+
+        // 2. Split on Windows, old Mac and Unix line endings.
+        $lines = preg_split('/\r\n|\r|\n/', $jsonlString);
+
+        // 3. Decode line by line.
+        $result = [];
+        $rejected = [];
+        foreach ($lines as $lineNumber => $line) {
+            $line = trim($line);
+
+            // Blank lines and the lone brackets of an array wrapper carry no record.
+            if ($line === '' || $line === '[' || $line === ']') {
+                continue;
+            }
+
+            $decoded = json_decode($line, true);
+            if (!is_array($decoded) && substr($line, -1) === ',') {
+                // element of a JSON array written one record per line
+                $decoded = json_decode(substr($line, 0, -1), true);
+            }
+
+            if (is_array($decoded)) {
+                $result[] = $decoded;
+            } else {
+                $rejected[$lineNumber + 1] = $line;
+            }
+        }
+
+        // 4. If some line did not decode, the answer may be one JSON document
+        //    spread over several lines; a real multi-record JSONL answer never
+        //    decodes as a whole, so this cannot override valid line results.
+        if ($rejected !== [] || $result === []) {
+            $document = json_decode($jsonlString, true);
+            if (is_array($document) && $document !== []) {
+                if ($document === array_values($document)) {
+                    // JSON array: keep the elements that are records
+                    $result = array_values(array_filter($document, 'is_array'));
+                } else {
+                    // single JSON object
+                    $result = [$document];
+                }
+                $rejected = [];
+            }
+        }
+
+        // 5. Report what was dropped (typically a truncated last line).
+        foreach ($rejected as $number => $line) {
+            Log::warning("JSONL解析失败 - 行 " . $number . ": " . $line);
+        }
+        if (empty($result)) {
+            Log::error('JSONL解析失败,未能提取任何有效数据: ' . $input);
+        }
+
+        return $result;
+    }
+
+    /**
+     * Encode each element of $input as one JSON line.
+     *
+     * Non-ASCII characters are written unescaped. json_encode escapes line
+     * breaks, so every element occupies exactly one line.
+     *
+     * @param array $input
+     * @return string the lines joined with "\n", without a trailing newline
+     */
+    public static function jsonl_encode(array $input): string
+    {
+        $rows = [];
+        foreach ($input as $value) {
+            $rows[] = json_encode($value, JSON_UNESCAPED_UNICODE);
+        }
+        return implode("\n", $rows);
+    }
+}

+ 47 - 0
api-v8/app/Services/SentenceService.php

@@ -0,0 +1,47 @@
+<?php
+
+namespace App\Services;
+
+use App\Models\Sentence;
+use App\Models\SentHistory;
+use Illuminate\Support\Str;
+
+class SentenceService
+{
+    /**
+     * Create or update the sentence identified by book, paragraph, word
+     * range and channel.
+     *
+     * Required keys of $data: book_id, paragraph, word_start, word_end,
+     * channel_uid, content, lang, status, editor_uid.
+     * Optional keys: content_type; copy (keep the original author data, in
+     * which case acceptor_uid and updated_at are copied when given);
+     * fork_from (with copy: stamps fork_at with the current time).
+     *
+     * @param array $data
+     * @return void
+     */
+    public function save($data)
+    {
+        $row = Sentence::firstOrNew([
+            "book_id" => $data['book_id'],
+            "paragraph" => $data['paragraph'],
+            "word_start" => $data['word_start'],
+            "word_end" => $data['word_end'],
+            "channel_uid" => $data['channel_uid'],
+        ]);
+
+        // one timestamp (milliseconds) for both columns
+        $now = time() * 1000;
+
+        if (!$row->exists) {
+            // Identifiers and the creation time belong to new rows only;
+            // an update must not consume a new id or move create_time.
+            $row->id = app('snowflake')->id();
+            $row->uid = Str::uuid();
+            $row->create_time = $now;
+        }
+
+        $row->content = $data['content'];
+        if (!empty($data['content_type'])) {
+            $row->content_type = $data['content_type'];
+        }
+        $row->strlen = mb_strlen($data['content'], "UTF-8");
+        $row->language = $data['lang'];
+        $row->status = $data['status'];
+        $row->editor_uid = $data["editor_uid"];
+        if (isset($data['copy'])) {
+            // copied sentence: keep the original author information
+            $row->acceptor_uid = $data["acceptor_uid"] ?? null;
+            $row->pr_edit_at = $data["updated_at"] ?? null;
+            if (isset($data['fork_from'])) {
+                $row->fork_at = now();
+            }
+        } else {
+            $row->acceptor_uid = null;
+            $row->pr_edit_at = null;
+        }
+        $row->modify_time = $now;
+        $row->save();
+    }
+}