UpgradeAITranslation.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. <?php
  2. namespace App\Console\Commands;
  3. use App\Helpers\LlmResponseParser;
  4. use App\Http\Api\ChannelApi;
  5. use App\Http\Resources\AiModelResource;
  6. use App\Models\PaliSentence;
  7. use App\Models\PaliText;
  8. use App\Models\Sentence;
  9. use App\Services\AIAssistant\NissayaTranslateService;
  10. use App\Services\AIModelService;
  11. use App\Services\AuthService;
  12. use App\Services\OpenAIService;
  13. use App\Services\SearchPaliDataService;
  14. use App\Services\SentenceService;
  15. use App\Tools\Tools;
  16. use Illuminate\Console\Command;
  17. use Illuminate\Support\Facades\Cache;
  18. use Illuminate\Support\Facades\Log;
  19. class UpgradeAITranslation extends Command
  20. {
  21. /**
  22. * The name and signature of the console command.
  23. * php artisan upgrade:ai.translation translation <channel-id> --model=<model-id> --book=141 --para=535
  24. * php artisan upgrade:ai.translation nissaya <channel-id> --model=<model-id> --book=207 --para=1247
  25. *
  26. * @var string
  27. */
  28. protected $signature = 'upgrade:ai.translation
  29. {type}
  30. {channel}
  31. {--book=}
  32. {--para=}
  33. {--resume}
  34. {--model=}
  35. {--thinking= : 开启和关闭deepseek thinking true | false}
  36. {--fresh : 清除缓存断点,从头开始}';
  37. // 缓存键前缀:以 type、channel 区分,记录已完成的 "book|para" 集合,中断后重跑自动跳过
  38. private const CACHE_KEY_PREFIX = 'upgrade:ai.translation:done';
  39. /**
  40. * The console command description.
  41. *
  42. * @var string
  43. */
  44. protected $description = 'Command description';
  45. protected AiModelResource $model;
  46. protected string $modelToken;
  47. protected array $workChannel;
  48. protected string $accessToken;
  49. protected bool $thinking;
  /**
   * Build the command and keep the collaborating services.
   *
   * Every parameter is a promoted constructor property, so each injected
   * service is available afterwards as a protected property of the same name.
   *
   * @param AIModelService          $modelService            looked up by handle() to resolve the --model option
   * @param SentenceService         $sentenceService         used by save() to persist sentence rows
   * @param OpenAIService           $openAIService           chat client used by the translation and wbw paths
   * @param NissayaTranslateService $nissayaTranslateService used by the nissaya path
   */
  public function __construct(
      protected AIModelService $modelService,
      protected SentenceService $sentenceService,
      protected OpenAIService $openAIService,
      protected NissayaTranslateService $nissayaTranslateService,
  ) {
      parent::__construct();
  }
  /**
   * Execute the console command.
   *
   * Flow:
   *  1. Resolve the model (--model) and the work channel; abort with exit
   *     code 1 when either is missing, when the channel type differs from
   *     the requested type, or when the type has no handler.
   *  2. Restore the set of finished "book|para" cursors from the cache
   *     (dropped first when --fresh is given).
   *  3. Walk the selected books/paragraphs, run the handler for the type,
   *     save the result and only then record the cursor.
   *  4. After a full run (neither --book nor --para given) clear the cursor cache.
   *
   * @return int 0 on success, 1 on invalid input
   */
  public function handle()
  {
      // --- model ---------------------------------------------------------
      $modelId = $this->option('model');
      if (! $modelId) {
          $this->error('model is required');
          return 1;
      }
      // Check before assigning: $this->model is typed AiModelResource, so storing
      // anything else would raise a TypeError instead of a readable message.
      $model = $this->modelService->getModelById($modelId);
      if (! $model instanceof AiModelResource) {
          $this->error("model not found: {$modelId}");
          return 1;
      }
      $this->model = $model;
      $this->info("model:{$this->model['model']}");
      $this->modelToken = AuthService::getUserToken($this->model['uid']);

      // --- channel -------------------------------------------------------
      $type = $this->argument('type');
      // Same reasoning as above: $this->workChannel is typed array.
      $channel = ChannelApi::getById($this->argument('channel'));
      if (! is_array($channel)) {
          $this->error('channel not found: ' . $this->argument('channel'));
          return 1;
      }
      $this->workChannel = $channel;
      // The channel type must match the requested translation type (nissaya -> nissaya channel).
      $channelType = $this->workChannel['type'] ?? '';
      if ($channelType !== $type) {
          $this->error('channel type not match request ' . $type . ' input is ' . $channelType);
          return 1;
      }
      // Reject types without a handler up front; otherwise every paragraph would be
      // marked as done in the cache although nothing was produced.
      if (! in_array($type, ['translation', 'nissaya', 'wbw'], true)) {
          $this->error("unsupported type: {$type}");
          return 1;
      }

      // --- thinking switch -----------------------------------------------
      // Compare against null/'' rather than truthiness so that "--thinking=0" is not ignored.
      $thinking = $this->option('thinking');
      if ($thinking !== null && $thinking !== '') {
          $this->thinking = $thinking === 'true';
          $this->line('thinking is ' . $thinking);
      }

      // --- resume cursor -------------------------------------------------
      $channelId = $this->workChannel['id'] ?? '';
      // Cache key: one cursor set per (type, channel) pair.
      $cacheKey = self::CACHE_KEY_PREFIX . ':' . $type . ':' . $channelId;
      if ($this->option('fresh')) {
          Cache::forget($cacheKey);
          $this->info('Cleared cached cursor.');
      }

      $bookOption = $this->option('book');
      $paraOption = $this->option('para');
      // A full run is one without --book and --para; only then is the cursor cache cleared at the end.
      $isFullRun = ! $bookOption && ! $paraOption;

      // Finished "book|para" units from a previous (interrupted) run.
      $done = Cache::get($cacheKey, []);
      if (! is_array($done)) {
          $done = [];
      }

      if ($bookOption) {
          $books = [$bookOption];
      } else {
          // Without --book, continue from the highest book already touched instead of book 1.
          // NOTE(review): cursors written by an earlier "--book=N" run share this cache key,
          // so a later full run would start at N and never visit the books before it — confirm
          // whether that is intended.
          $startBook = 1;
          if (! empty($done)) {
              $startBook = max(array_map(
                  static fn ($cursor) => (int) explode('|', (string) $cursor)[0],
                  array_keys($done)
              ));
              $this->info("resume from book {$startBook}");
          }
          // 217 is the hard-coded upper bound of the book range
          // (presumably the number of books in the corpus — TODO confirm).
          $books = range($startBook, 217);
      }

      foreach ($books as $book) {
          if ($paraOption) {
              $paragraphs = [$paraOption];
          } else {
              // max() yields null for a book without rows; range(1, 0) would then be [1, 0]
              // and two non-existent paragraphs would be processed and marked as done.
              $maxParagraph = (int) PaliText::where('book', $book)->max('paragraph');
              if ($maxParagraph < 1) {
                  $this->warn("book {$book} has no paragraphs, skipped");
                  continue;
              }
              $paragraphs = range(1, $maxParagraph);
          }

          foreach ($paragraphs as $paragraph) {
              // The cache key already carries type and channel, so book|para identifies the unit.
              $cursor = $book . '|' . $paragraph;
              if (isset($done[$cursor])) {
                  $this->info("skip {$cursor}");
                  continue;
              }

              $start = time();
              // $type was validated above, so one of the arms always matches.
              $data = match ($type) {
                  'translation' => $this->aiPaliTranslate($book, $paragraph),
                  'nissaya' => $this->aiNissayaTranslate($book, $paragraph),
                  'wbw' => $this->aiWBW($book, $paragraph),
              };
              $this->save($data);
              $elapsed = time() - $start;
              $this->info($type . " {$book}-{$paragraph} " . count($data) . ' sentences time=' . $elapsed);

              // Record the cursor only after the unit has been written, so an interruption
              // in the middle of a unit never causes it to be skipped on the next run.
              $done[$cursor] = true;
              Cache::put($cacheKey, $done, now()->addHours(24));
          }
      }

      // A full traversal finished normally: the cursor set is no longer needed.
      if ($isFullRun) {
          Cache::forget($cacheKey);
      }

      return 0;
  }
  /**
   * Collect the Pali sentences of one paragraph as a list of {id, content} records.
   *
   * Sentences are read from PaliSentence ordered by word_begin; the text of each
   * sentence is the 'markdown' entry returned by
   * SearchPaliDataService::getSentenceContent(). The id has the form
   * "{book}-{para}-{word_begin}-{word_end}".
   *
   * @param int|string $book book number
   * @param int|string $para paragraph number
   * @return array<int, array{id: string, content: mixed}> empty array when the paragraph has no sentences
   */
  private function getPaliContent($book, $para)
  {
      $paliDataService = app(SearchPaliDataService::class);

      // get() always returns a collection object, which is truthy even when empty.
      // The former "if (! $sentences) return null;" guard could therefore never fire
      // and has been dropped; an empty paragraph yields an empty list, as before.
      $sentences = PaliSentence::where('book', $book)
          ->where('paragraph', $para)
          ->orderBy('word_begin')
          ->get();

      $records = [];
      foreach ($sentences as $sentence) {
          $content = $paliDataService->getSentenceContent(
              $book,
              $para,
              $sentence->word_begin,
              $sentence->word_end
          );
          $records[] = [
              'id' => "{$book}-{$para}-{$sentence->word_begin}-{$sentence->word_end}",
              'content' => $content['markdown'],
          ];
      }

      return $records;
  }
  /**
   * Translate one Pali paragraph into Simplified Chinese with the configured LLM.
   *
   * The paragraph is sent as a JSON list of {id, content} sentences; the model is
   * asked to answer in JSONL with the same ids, which is then parsed by
   * LlmResponseParser::jsonl().
   *
   * @param int|string $book book number
   * @param int|string $para paragraph number
   * @return array parsed records (expected shape: {id, content}); empty when the model is
   *               not set, the paragraph has no sentences, or the reply is not a string
   */
  private function aiPaliTranslate($book, $para)
  {
      // The closing marker is indented, so PHP strips this indentation from every prompt line.
      $prompt = <<<'md'
      你是一个巴利语翻译助手。
      pali 是巴利原文的一个段落,json格式, 每条记录是一个句子。包括id 和 content 两个字段
      请翻译这个段落为简体中文。
      翻译要求
      1. 语言风格为现代汉语书面语,不要使用古汉语或者半文半白。
      2. 译文严谨,完全贴合巴利原文,不要加入自己的理解
      3. 巴利原文中的黑体字在译文中也使用黑体。其他标点符号跟随巴利原文,但应该替换为相应的汉字全角符号
      输出格式jsonl
      输出id 和 content 两个字段,
      id 使用巴利原文句子的id ,
      content 为中文译文
      直接输出jsonl数据,无需解释
      **输出范例**
      {"id":"1-2-3-4","content":"译文"}
      {"id":"2-3-4-5","content":"译文"}
      md;

      // $this->model is a typed property: reading it while uninitialised throws, and once
      // set it is an object (always truthy). isset() is the check that can actually fail,
      // and it runs before any work is done.
      if (! isset($this->model)) {
          Log::error('model is invalid');
          return [];
      }

      $pali = $this->getPaliContent($book, $para);
      if (empty($pali)) {
          // Nothing to translate: do not spend an LLM request on an empty paragraph.
          return [];
      }

      $originalText = "```json\n" . json_encode($pali, JSON_UNESCAPED_UNICODE) . "\n```";
      Log::debug($originalText);

      $startAt = time();
      $llm = $this->openAIService->setApiUrl($this->model['url'])
          ->setModel($this->model['model'])
          ->setApiKey($this->model['key'])
          ->setSystemPrompt($prompt)
          ->setTemperature(0.0)
          ->setStream(false);
      // $this->thinking is only initialised when --thinking was passed on the command line.
      if (isset($this->thinking)) {
          $llm = $llm->setThinking($this->thinking);
      }
      $response = $llm->send("# pali\n\n{$originalText}\n\n");
      $complete = time() - $startAt;

      $translationText = $response['choices'][0]['message']['content'] ?? '[]';
      Log::debug("complete in {$complete}s", ['content' => $translationText]);

      if (! is_string($translationText)) {
          return [];
      }

      return LlmResponseParser::jsonl($translationText);
  }
  /**
   * Produce word-by-word Chinese glosses for every sentence of one paragraph.
   *
   * The source sentences are read from the system channel '_System_Wbw_VRI_'. Their
   * content is decoded as JSON, the words with a non-empty real value that are not of
   * type '.ctl.' are sent to the LLM, and the JSONL reply is parsed and stored,
   * re-encoded as JSON, under the sentence id
   * "{book_id}-{paragraph}-{word_start}-{word_end}".
   *
   * NOTE(review): unlike the nissaya path, the records returned here carry no
   * 'content_type', so save() falls back to 'markdown' although the content is
   * JSON — confirm this is intended.
   *
   * @param int|string $book book number
   * @param int|string $para paragraph number
   * @return array<int, array{id: string, content: string}>
   */
  private function aiWBW($book, $para)
  {
      // The closing marker is indented, so PHP strips this indentation from every prompt line.
      $sysPrompt = <<<'md'
      你是一个佛教翻译专家,精通巴利文和缅文,精通巴利文逐词解析
      ## 翻译要求:
      - 请将用户提供的巴利句子单词表中的每个巴利文单词翻译为中文
      - 这些单词是一个完整的句子,请根据单词的上下文翻译
      - original 里面的数据是巴利文单词
      - 输入格式为 json 数组
      - 输出jsonl格式
      在原来的数据中添加下列输出字段
      1. meaning:单词的中文意思,如果有两个可能的意思,两个意思之间用/符号分隔
      5. confidence:你认为你给出的这个单词的信息的信心指数(准确程度) 数值1-100 如果觉得非常有把握100, 如果觉得把握不大,适当降低信心指数
      6. note:如果你认为信心指数很低,这个是疑难单词,请在note字段写明原因,如果不是疑难单词,请不要填写note
      **范例**:
      {"id":1,"original":"bhikkhusanghassa","meaning":"比库僧团[的]","confidence":100}
      直接输出jsonl, 无需其他内容
      md;

      $channelId = ChannelApi::getSysChannel('_System_Wbw_VRI_');
      $sentences = Sentence::where('channel_uid', $channelId)
          ->where('book_id', $book)
          ->where('paragraph', $para)
          ->get();

      $result = [];
      foreach ($sentences as $sentence) {
          $id = "{$sentence->book_id}-{$sentence->paragraph}-{$sentence->word_start}-{$sentence->word_end}";

          // json_decode() returns null for invalid JSON; iterating that would only raise a warning.
          $words = json_decode($sentence->content);
          if (! is_array($words) && ! is_object($words)) {
              Log::warning('wbw content is not valid JSON, sentence skipped', ['id' => $id]);
              continue;
          }

          // Keep words with a real value that are not control tokens.
          // "??" keeps the original outcome for incomplete word records without notices.
          $tpl = [];
          foreach ($words as $word) {
              $real = $word->real->value ?? null;
              $wordType = $word->type->value ?? null;
              if (empty($real) || $wordType === '.ctl.') {
                  continue;
              }
              $tpl[] = [
                  'id' => $word->sn[0] ?? null,
                  'original' => $real,
              ];
          }
          if ($tpl === []) {
              // No translatable word in this sentence: skip the LLM request.
              continue;
          }

          $tplText = json_encode($tpl, JSON_UNESCAPED_UNICODE);
          Log::debug($tplText);

          $startAt = time();
          $llm = $this->openAIService->setApiUrl($this->model['url'])
              ->setModel($this->model['model'])
              ->setApiKey($this->model['key'])
              ->setSystemPrompt($sysPrompt)
              ->setTemperature(0.7)
              ->setStream(false);
          // $this->thinking is only initialised when --thinking was passed on the command line.
          if (isset($this->thinking)) {
              $llm = $llm->setThinking($this->thinking);
          }
          $response = $llm->send("```json\n{$tplText}\n```");
          $complete = time() - $startAt;

          $content = $response['choices'][0]['message']['content'] ?? '[]';
          if (! is_string($content)) {
              // Treat a non-text reply like a missing one so logging and parsing stay safe.
              $content = '[]';
          }
          Log::debug("ai response in {$complete}s content=" . $content);

          $json = LlmResponseParser::jsonl($content);
          $result[] = [
              'id' => $id,
              'content' => json_encode($json, JSON_UNESCAPED_UNICODE),
          ];
      }

      return $result;
  }
  /**
   * Translate the Burmese nissaya sentences of one paragraph via NissayaTranslateService.
   *
   * Each sentence's content is handed to the service together with the configured
   * model; the 'data' entry of the service result is stored, JSON-encoded, under the
   * sentence id "{book_id}-{paragraph}-{word_start}-{word_end}" with content_type 'json'.
   *
   * The former in-method system prompt and the commented-out direct OpenAI call were
   * removed: neither was executed, the work is delegated to the service.
   *
   * @param int|string $book book number
   * @param int|string $para paragraph number
   * @return array<int, array{id: string, content: string, content_type: string}>
   */
  private function aiNissayaTranslate($book, $para)
  {
      $sentences = Sentence::nissaya()
          ->language('my') // Burmese only
          ->where('book_id', $book)
          ->where('paragraph', $para)
          ->orderBy('strlen')
          ->get();

      $result = [];
      foreach ($sentences as $sentence) {
          $id = "{$sentence->book_id}-{$sentence->paragraph}-{$sentence->word_start}-{$sentence->word_end}";

          $aiNissaya = $this->nissayaTranslateService
              ->setModel($this->model)
              ->translate($sentence->content, false);

          // Read 'data' once with a fallback so the log call and the stored value agree
          // and a result without 'data' does not raise an undefined-index warning.
          $data = $aiNissaya['data'] ?? [];
          Log::debug('ai response ', ['content' => $data]);

          $result[] = [
              'id' => $id,
              'content' => json_encode($data, JSON_UNESCAPED_UNICODE),
              'content_type' => 'json',
          ];
      }

      return $result;
  }
  /**
   * Persist generated sentences into the work channel.
   *
   * Every record needs an 'id' of the form "book-paragraph-wordStart-wordEnd" and a
   * 'content' value; 'content_type' is optional and defaults to 'markdown'. Language,
   * status and channel come from the work channel, the editor is the model's user.
   *
   * Records may originate from LLM output, so a malformed id or missing content is
   * logged and skipped instead of being written with undefined fields.
   *
   * @param iterable|null $data records as returned by the ai* methods
   * @return void
   */
  private function save($data)
  {
      foreach ($data ?? [] as $row) {
          $rawId = is_array($row) ? ($row['id'] ?? null) : null;
          $sentenceId = is_string($rawId) ? trim($rawId) : '';

          // Exactly four numeric parts are required to address a sentence.
          $hasValidId = preg_match('/^\d+-\d+-\d+-\d+$/', $sentenceId) === 1;
          if (! $hasValidId || ! isset($row['content'])) {
              Log::warning('skip sentence with malformed id or missing content', ['row' => $row]);
              continue;
          }

          [$bookId, $paragraph, $wordStart, $wordEnd] = explode('-', $sentenceId);

          // Write to the sentence store.
          $this->sentenceService->save([
              'book_id' => $bookId,
              'paragraph' => $paragraph,
              'word_start' => $wordStart,
              'word_end' => $wordEnd,
              'channel_uid' => $this->workChannel['id'],
              'content' => $row['content'],
              'content_type' => $row['content_type'] ?? 'markdown',
              'lang' => $this->workChannel['lang'],
              'status' => $this->workChannel['status'],
              'editor_uid' => $this->model['uid'],
          ]);
      }
  }
  370. }