ExportAiTrainingData.php 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\Sentence;
  6. use App\Models\PaliSentence;
  7. use App\Http\Api\MdRender;
  8. use Illuminate\Support\Facades\File;
  class ExportAiTrainingData extends Command
  {
      /**
       * Channel uids whose sentences are exported; one JSONL file is written per channel.
       *
       * @var string[]
       */
      private const CHANNELS = [
          '19f53a65-81db-4b7d-8144-ac33f1217d34',
          'e5bc5c97-a6fb-4ccb-b7df-be6dcfee9c43',
          '7ac4d13b-a43d-4409-91b5-5f2a82b916b3',
          '74ebf4c5-c243-4948-955d-6c277e29276a',
          '3b0cb0aa-ea88-4ce5-b67d-00a3e76220cc',
          '5310999c-0b0c-4bb0-9bb9-9cdd176e9ef0',
          '331447b6-39bb-4b49-ac10-6206db93a050',
      ];

      /**
       * The name and signature of the console command.
       * php artisan export:ai.training.data
       * @var string
       */
      protected $signature = 'export:ai.training.data {--format=gz : zip file format 7z,lzma,gz }';

      /**
       * The console command description.
       *
       * @var string
       */
      protected $description = 'export ai training data';

      /**
       * Create a new command instance.
       *
       * @return void
       */
      public function __construct()
      {
          parent::__construct();
      }

      /**
       * Execute the console command.
       *
       * Writes one "<channel>.jsonl" file per configured channel into a fresh
       * timestamped directory under storage, hands that directory to the
       * `export:zip2` command, and finally removes the directory again.
       *
       * @return int 0 on success, 1 when a directory or file could not be
       *             created/written or when `export:zip2` reported a failure
       */
      public function handle()
      {
          Log::debug('task export offline sentence-table start');

          // Create the base export folder if it does not exist yet.
          $exportDir = storage_path('app/tmp/export/offline');
          if (!is_dir($exportDir) && !$this->makeDirectory($exportDir)) {
              return 1;
          }

          // Create the temporary working folder for this run.
          $dirname = $exportDir . '/' . 'wikipali-offline-ai-training-' . date('YmdHis');
          if (!$this->makeDirectory($dirname)) {
              return 1;
          }

          $start = time();
          foreach (self::CHANNELS as $channel) {
              if (!$this->exportChannel($channel, $dirname)) {
                  // Do not leave a half-written export behind.
                  File::deleteDirectory($dirname);
                  return 1;
              }
          }
          $this->info((time() - $start) . ' seconds');

          $exitCode = $this->call('export:zip2', [
              'id' => 'ai-translating-training-data',
              'filename' => $dirname,
              'title' => 'wikipali ai translating training data',
              'format' => $this->option('format'),
          ]);

          // NOTE(review): the pause presumably lets the archiving step release
          // the files before they are removed — confirm whether it is still needed.
          sleep(5);
          File::deleteDirectory($dirname);

          if ((int) $exitCode !== 0) {
              $this->error('export:zip2 fail exit code=' . $exitCode);
              return 1;
          }

          return 0;
      }

      /**
       * Create a directory (recursively) and report the outcome on the console.
       *
       * @param string $path absolute path of the directory to create
       * @return bool true when the directory was created, false otherwise
       */
      private function makeDirectory(string $path): bool
      {
          if (!mkdir($path, 0755, true)) {
              $this->error('mkdir fail path=' . $path);
              return false;
          }

          $this->info('make dir successful ' . $path);
          return true;
      }

      /**
       * Export every sentence of one channel as JSON lines.
       *
       * Each output line is {"id", "original", "translation"}, where "original"
       * is the matching PaliSentence text and "translation" is the rendered,
       * trimmed sentence content. Rows without an original, without a
       * translation, or that cannot be JSON-encoded are logged and skipped.
       *
       * @param string $channel channel uid to export
       * @param string $dirname directory that receives "<channel>.jsonl"
       * @return bool true when the file was written, false when it could not
       *              be opened or a write failed
       */
      private function exportChannel(string $channel, string $dirname): bool
      {
          // Create the output file.
          $this->info('export start' . $channel);
          $exportFile = $dirname . '/' . $channel . '.jsonl';
          $fp = fopen($exportFile, 'w');
          if ($fp === false) {
              $this->error('无法创建文件' . ': ' . $exportFile);
              return false;
          }

          try {
              $query = Sentence::where('channel_uid', $channel);
              $bar = $this->output->createProgressBar($query->count());
              $sentences = $query->select([
                  'book_id',
                  'paragraph',
                  'word_start',
                  'word_end',
                  'content',
                  'content_type',
              ])->orderBy('book_id')
                  ->orderBy('paragraph')
                  ->orderBy('word_start')
                  ->cursor();

              // Ids already written, so a sentence range is exported only once.
              $done = [];
              foreach ($sentences as $sent) {
                  // The bar is sized to the row count, so advance once per row
                  // whether or not the row ends up in the file.
                  $bar->advance();

                  $id = "{$sent->book_id}-{$sent->paragraph}-{$sent->word_start}-{$sent->word_end}";
                  if (isset($done[$id])) {
                      continue;
                  }

                  // Look the original up first: rows without one are skipped,
                  // so rendering them would be wasted work.
                  $origin = PaliSentence::where('book', $sent->book_id)
                      ->where('paragraph', $sent->paragraph)
                      ->where('word_begin', $sent->word_start)
                      ->where('word_end', $sent->word_end)
                      ->value('text');
                  if (empty($origin)) {
                      Log::warning('origin is empty id=' . $id);
                      continue;
                  }

                  $content = MdRender::render(
                      $sent->content,
                      [$channel],
                      null,
                      'read',
                      'translation',
                      $sent->content_type,
                      'text',
                  );
                  // Trim before testing so whitespace-only output counts as empty
                  // and a literal "0" does not.
                  $translation = is_string($content) ? trim($content) : '';
                  if ($translation === '') {
                      Log::warning('translation is empty id=' . $id);
                      continue;
                  }

                  $line = json_encode(
                      ['id' => $id, 'original' => $origin, 'translation' => $translation],
                      JSON_UNESCAPED_UNICODE
                  );
                  if ($line === false) {
                      // E.g. malformed UTF-8; writing "false" would yield a blank line.
                      Log::warning('json encode fail id=' . $id . ' error=' . json_last_error_msg());
                      continue;
                  }

                  if (fwrite($fp, $line . "\n") === false) {
                      $this->error('write fail path=' . $exportFile);
                      return false;
                  }
                  $done[$id] = 1;
              }

              $bar->finish();
              // Move past the progress bar so later output starts on a new line.
              $this->line('');
          } finally {
              // Always release the handle, including when a query or the renderer throws.
              fclose($fp);
          }

          return true;
      }
  }