ExportAiTrainingData.php 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\Sentence;
  6. use App\Models\PaliSentence;
  7. use App\Http\Api\MdRender;
  8. use Illuminate\Support\Facades\File;
  9. use App\Http\Api\ChannelApi;
  10. use App\Services\PaliTextService;
  11. class ExportAiTrainingData extends Command
  12. {
  13. private $ShortTrans = 0.17;
  14. /**
  15. * The name and signature of the console command.
  16. * php artisan export:ai.training.data
  17. * @var string
  18. */
  19. protected $signature = 'export:ai.training.data {--format=gz : zip file format 7z,lzma,gz } {--test}';
  20. /**
  21. * The console command description.
  22. *
  23. * @var string
  24. */
  25. protected $description = 'export ai training data';
  26. /**
  27. * Create a new command instance.
  28. *
  29. * @return void
  30. */
  31. public function __construct()
  32. {
  33. parent::__construct();
  34. }
  35. /**
  36. * Execute the console command.
  37. *
  38. * @return int
  39. */
  40. public function handle()
  41. {
  42. Log::debug('task export offline sentence-table start');
  43. //创建文件夹
  44. $base = 'app/tmp/export/offline';
  45. $exportDir = storage_path($base);
  46. if (!is_dir($exportDir)) {
  47. $res = mkdir($exportDir, 0755, true);
  48. if (!$res) {
  49. $this->error('mkdir fail path=' . $exportDir);
  50. return 1;
  51. } else {
  52. $this->info('make dir successful ' . $exportDir);
  53. }
  54. }
  55. //创建临时文件夹\
  56. $dirname = $exportDir . '/' . 'wikipali-offline-ai-training-' . date("YmdHis");
  57. $tmp = mkdir($dirname, 0755, true);
  58. if (!$tmp) {
  59. $this->error('mkdir fail path=' . $dirname);
  60. return 1;
  61. } else {
  62. $this->info('make dir successful ' . $dirname);
  63. }
  64. $fpIndex = fopen($dirname . '/index.md', 'w');
  65. if ($fpIndex === false) {
  66. die('无法创建索引文件');
  67. }
  68. $channels = [
  69. '7ac4d13b-a43d-4409-91b5-5f2a82b916b3',
  70. 'e5bc5c97-a6fb-4ccb-b7df-be6dcfee9c43',
  71. '74ebf4c5-c243-4948-955d-6c277e29276a',
  72. '3b0cb0aa-ea88-4ce5-b67d-00a3e76220cc',
  73. '5310999c-0b0c-4bb0-9bb9-9cdd176e9ef0',
  74. '331447b6-39bb-4b49-ac10-6206db93a050',
  75. ];
  76. $start = time();
  77. foreach ($channels as $key => $channel) {
  78. if ($this->option('test') && $key > 0) {
  79. // test mode 只跑一个
  80. break;
  81. }
  82. fwrite($fpIndex, "# {$channel}\n");
  83. $channelInfo = ChannelApi::getById($channel);
  84. if ($channelInfo) {
  85. fwrite($fpIndex, "- 版本名称:{$channelInfo['name']}\n");
  86. fwrite($fpIndex, "- 语言:{$channelInfo['lang']}\n");
  87. }
  88. // 创建文件
  89. $this->info('export start' . $channel);
  90. $filename = $channel . '.jsonl';
  91. $exportFile = $dirname . '/' . $filename;
  92. $fp = fopen($exportFile, 'w');
  93. if ($fp === false) {
  94. die('无法创建文件');
  95. }
  96. $db = Sentence::where('channel_uid', $channel);
  97. $bar = $this->output->createProgressBar($db->count());
  98. $srcDb = $db->select([
  99. 'book_id',
  100. 'paragraph',
  101. 'word_start',
  102. 'word_end',
  103. 'content',
  104. 'content_type'
  105. ])->orderBy('book_id')
  106. ->orderBy('paragraph')
  107. ->orderBy('word_start')->cursor();
  108. $done = [];
  109. foreach ($srcDb as $sent) {
  110. $id = "{$sent->book_id}-{$sent->paragraph}-{$sent->word_start}-{$sent->word_end}";
  111. if (isset($done[$id])) {
  112. continue;
  113. }
  114. //获取原文
  115. $origin = PaliSentence::where('book', $sent->book_id)
  116. ->where('paragraph', $sent->paragraph)
  117. ->where('word_begin', $sent->word_start)
  118. ->where('word_end', $sent->word_end)
  119. ->value('text');
  120. //忽略空的原文
  121. if (self::isEmpty($origin)) {
  122. Log::warning('origin is empty id=' . $id);
  123. continue;
  124. }
  125. // 渲染译文
  126. $translation = MdRender::render(
  127. $sent->content,
  128. [$channel],
  129. null,
  130. 'read',
  131. 'translation',
  132. $sent->content_type,
  133. 'text',
  134. );
  135. $translation = trim($translation);
  136. // 忽略空的译文
  137. if (self::isEmpty($translation)) {
  138. Log::warning('translation is empty id=' . $id);
  139. continue;
  140. }
  141. //忽略过短的译文
  142. if (mb_strlen($translation) / mb_strlen($origin) < $this->ShortTrans) {
  143. Log::warning('translation is short id=' . $id);
  144. continue;
  145. }
  146. //原文与翻译完全相同
  147. if ($translation === $origin) {
  148. Log::warning('translation is same id=' . $id);
  149. continue;
  150. }
  151. // 获取分类标签
  152. $paliTextService = app(PaliTextService::class);
  153. $tags = $paliTextService->getParaCategoryTags($sent->book_id, $sent->paragraph);
  154. $path = $paliTextService->getParaPathTitle($sent->book_id, $sent->paragraph);
  155. $currData = [
  156. 'id' => $id,
  157. 'original' => $origin,
  158. 'translation' => $translation,
  159. 'category' => $tags,
  160. 'path' => $path,
  161. ];
  162. fwrite($fp, json_encode($currData, JSON_UNESCAPED_UNICODE) . "\n");
  163. $bar->advance();
  164. $done[$id] = 1;
  165. }
  166. fclose($fp);
  167. }
  168. fclose($fpIndex);
  169. $this->info((time() - $start) . ' seconds');
  170. $this->call('export:zip2', [
  171. 'id' => 'ai-translating-training-data',
  172. 'filename' => $dirname,
  173. 'title' => 'wikipali ai translating training data',
  174. 'format' => $this->option('format'),
  175. ]);
  176. sleep(5);
  177. File::deleteDirectory($dirname);
  178. return 0;
  179. }
  180. private function isEmpty(string $input)
  181. {
  182. $result = preg_replace('/[\s\d\p{P}]/u', '', $input);
  183. return empty($result);
  184. }
  185. }