2
0

ExportAiTrainingData.php 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Console\Command;
  4. use Illuminate\Support\Facades\Log;
  5. use App\Models\Sentence;
  6. use App\Models\PaliSentence;
  7. use App\Http\Api\MdRender;
  8. use Illuminate\Support\Facades\File;
  9. use App\Http\Api\ChannelApi;
  10. use App\Services\PaliTextService;
  /**
   * Artisan command that exports sentence-level original/translation pairs
   * as JSONL training data.
   *
   * For every channel in CHANNELS the command writes one
   * `<channel>.jsonl` file plus a shared `index.md` into a timestamped working
   * directory under storage, then calls the `export:zip2` command on that
   * directory and finally removes the working directory.
   *
   * Usage: php artisan export:ai.training.data
   */
  class ExportAiTrainingData extends Command
  {
      /**
       * Channels (by uid) whose sentences are exported, in export order.
       */
      private const CHANNELS = [
          '7ac4d13b-a43d-4409-91b5-5f2a82b916b3',
          'e5bc5c97-a6fb-4ccb-b7df-be6dcfee9c43',
          '74ebf4c5-c243-4948-955d-6c277e29276a',
          '3b0cb0aa-ea88-4ce5-b67d-00a3e76220cc',
          '5310999c-0b0c-4bb0-9bb9-9cdd176e9ef0',
          '331447b6-39bb-4b49-ac10-6206db93a050',
      ];

      /**
       * Minimum accepted ratio of translation length to original length
       * (both measured in characters). Shorter translations are skipped.
       *
       * @var float
       */
      private $ShortTrans = 0.17;

      /**
       * The name and signature of the console command.
       * php artisan export:ai.training.data
       * @var string
       */
      protected $signature = 'export:ai.training.data {--format=gz : zip file format 7z,lzma,gz } {--test}';

      /**
       * The console command description.
       *
       * @var string
       */
      protected $description = 'export ai training data';

      /**
       * Create a new command instance.
       *
       * @return void
       */
      public function __construct()
      {
          parent::__construct();
      }

      /**
       * Execute the console command.
       *
       * With `--test` only the first channel is exported.
       *
       * @return int 0 on success, 1 when a directory or file could not be created
       */
      public function handle()
      {
          Log::info('task export offline sentence-table start');

          $dirname = $this->prepareExportDir();
          if ($dirname === null) {
              return 1;
          }

          $fpIndex = fopen($dirname . '/index.md', 'w');
          if ($fpIndex === false) {
              // Report through the console and signal failure with a non-zero
              // exit code, like the mkdir failures, instead of die().
              $this->error('无法创建索引文件');
              return 1;
          }

          // Resolved once: the service does not depend on the row being exported.
          $paliTextService = app(PaliTextService::class);

          $start = time();
          try {
              foreach (self::CHANNELS as $key => $channel) {
                  if ($this->option('test') && $key > 0) {
                      // Test mode: export a single channel only.
                      break;
                  }
                  if (!$this->exportChannel($channel, $dirname, $fpIndex, $paliTextService)) {
                      return 1;
                  }
              }
          } finally {
              // Runs on success, early return and exceptions alike.
              fclose($fpIndex);
          }
          $this->info((time() - $start) . ' seconds');

          $zipStatus = $this->call('export:zip2', [
              'id' => 'ai-translating-training-data',
              'filename' => $dirname,
              'title' => 'wikipali ai translating training data',
              'format' => $this->option('format'),
          ]);
          if ($zipStatus !== 0) {
              $this->error('export:zip2 exit code=' . $zipStatus);
          }

          sleep(5);
          File::deleteDirectory($dirname);
          return 0;
      }

      /**
       * Create the export base directory (if missing) and a fresh timestamped
       * working directory inside it.
       *
       * @return string|null absolute path of the working directory, or null on failure
       */
      private function prepareExportDir(): ?string
      {
          $exportDir = storage_path('app/tmp/export/offline');
          if (!is_dir($exportDir) && !$this->makeDir($exportDir)) {
              return null;
          }

          $dirname = $exportDir . '/' . 'wikipali-offline-ai-training-' . date("YmdHis");
          return $this->makeDir($dirname) ? $dirname : null;
      }

      /**
       * Create a directory recursively and report the outcome on the console.
       *
       * @param string $path directory to create
       * @return bool true when the directory was created
       */
      private function makeDir(string $path): bool
      {
          if (!mkdir($path, 0755, true)) {
              $this->error('mkdir fail path=' . $path);
              return false;
          }
          $this->info('make dir successful ' . $path);
          return true;
      }

      /**
       * Export all usable sentences of one channel to `<dirname>/<channel>.jsonl`
       * and append the channel's heading to the index file.
       *
       * @param string   $channel         channel uid
       * @param string   $dirname         working directory
       * @param resource $fpIndex         open handle of index.md
       * @param mixed    $paliTextService resolved PaliTextService instance
       * @return bool false when the output file could not be created
       */
      private function exportChannel(string $channel, string $dirname, $fpIndex, $paliTextService): bool
      {
          fwrite($fpIndex, "# {$channel}\n");
          $channelInfo = ChannelApi::getById($channel);
          if ($channelInfo) {
              fwrite($fpIndex, "- 版本名称:{$channelInfo['name']}\n");
              fwrite($fpIndex, "- 语言:{$channelInfo['lang']}\n");
          }

          $this->info('export start' . $channel);
          $exportFile = $dirname . '/' . $channel . '.jsonl';
          $fp = fopen($exportFile, 'w');
          if ($fp === false) {
              $this->error('无法创建文件');
              return false;
          }

          try {
              // Filter first so the progress bar maximum equals the number of
              // rows the cursor will actually yield.
              $query = Sentence::where('channel_uid', $channel)
                  ->whereNotNull('content');
              $bar = $this->output->createProgressBar($query->count());

              $rows = $query->select([
                  'book_id',
                  'paragraph',
                  'word_start',
                  'word_end',
                  'content',
                  'content_type',
              ])
                  ->orderBy('book_id')
                  ->orderBy('paragraph')
                  ->orderBy('word_start')
                  ->cursor();

              // Ids already written, used to drop duplicate sentences.
              $done = [];
              foreach ($rows as $sent) {
                  // Advance for every row, exported or skipped, so the bar
                  // reflects real progress and can reach 100%.
                  $bar->advance();

                  $id = "{$sent->book_id}-{$sent->paragraph}-{$sent->word_start}-{$sent->word_end}";
                  if (isset($done[$id])) {
                      continue;
                  }

                  $record = $this->buildRecord($sent, $channel, $id, $paliTextService);
                  if ($record === null) {
                      continue;
                  }

                  $line = json_encode($record, JSON_UNESCAPED_UNICODE);
                  if ($line === false) {
                      // Without this check a failed encode would write a bare
                      // newline and corrupt the JSONL file.
                      Log::warning('json encode fail id=' . $id . ' error=' . json_last_error_msg());
                      continue;
                  }
                  fwrite($fp, $line . "\n");
                  $done[$id] = 1;
              }

              $bar->finish();
              // Move the console cursor off the progress bar line.
              $this->line('');
          } finally {
              fclose($fp);
          }

          return true;
      }

      /**
       * Build the training record for one sentence, or return null when the
       * sentence must be skipped (the reason is logged as a warning).
       *
       * @param mixed  $sent            Sentence row (book_id, paragraph, word_start,
       *                                word_end, content, content_type)
       * @param string $channel         channel uid
       * @param string $id              "book-paragraph-start-end" sentence id
       * @param mixed  $paliTextService resolved PaliTextService instance
       * @return array|null
       */
      private function buildRecord($sent, string $channel, string $id, $paliTextService): ?array
      {
          // Fetch the original text of the same sentence.
          $origin = PaliSentence::where('book', $sent->book_id)
              ->where('paragraph', $sent->paragraph)
              ->where('word_begin', $sent->word_start)
              ->where('word_end', $sent->word_end)
              ->value('text');

          // Skip sentences without a meaningful original.
          if ($this->isEmpty($origin)) {
              Log::warning('origin is empty id=' . $id);
              return null;
          }

          // Render the translation to plain text.
          $translation = MdRender::render(
              $sent->content,
              [$channel],
              null,
              'read',
              'translation',
              $sent->content_type,
              'text',
          );
          $translation = trim($translation);

          // Skip empty translations.
          if ($this->isEmpty($translation)) {
              Log::warning('translation is empty id=' . $id);
              return null;
          }

          // Skip translations that are too short relative to the original.
          // $origin passed isEmpty(), so its length is never zero here.
          if (mb_strlen($translation) / mb_strlen($origin) < $this->ShortTrans) {
              Log::warning('translation is short id=' . $id);
              return null;
          }

          // Skip "translations" that merely repeat the original.
          if ($translation === $origin) {
              Log::warning('translation is same id=' . $id);
              return null;
          }

          return [
              'id' => $id,
              'original' => $origin,
              'translation' => $translation,
              'category' => $paliTextService->getParaCategoryTags($sent->book_id, $sent->paragraph),
              'path' => $paliTextService->getParaPathTitle($sent->book_id, $sent->paragraph),
          ];
      }

      /**
       * Tell whether a text has no meaningful content, i.e. nothing is left
       * after removing whitespace, digits and punctuation.
       *
       * @param string|null $input text to inspect
       * @return bool true for null, '' and texts made only of the removed characters
       */
      private function isEmpty(?string $input): bool
      {
          if ($input === null || $input === '') {
              return true;
          }
          $result = preg_replace('/[\s\d\p{P}]/u', '', $input);
          // preg_replace() yields null on failure (e.g. malformed UTF-8);
          // such input is treated as empty.
          return $result === null || $result === '';
      }
  }