PacketService.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. <?php
  2. namespace App\Services;
  3. use App\Models\Channel;
  4. use App\Models\Sentence;
  5. use Illuminate\Support\Facades\DB;
  6. use Illuminate\Support\Facades\Storage;
  7. use ZipArchive;
  8. use App\Http\Api\ChannelApi;
  9. /**
  10. * PacketService
  11. *
  12. * 用于导出句子数据为训练数据包的服务类
  13. * 将指定版本的译文与巴利原文配对导出为JSONL格式,并打包为ZIP文件
  14. */
  15. class PacketService
  16. {
  17. /**
  18. * 每批处理的记录数
  19. */
  20. private const CHUNK_SIZE = 1000;
  21. /**
  22. * 临时文件存储路径
  23. */
  24. private const TEMP_DIR = 'temp/packet';
  25. /**
  26. * 巴利原文的channel_uid
  27. */
  28. private string $paliChannelUid;
  29. /**
  30. * 译文版本的channel_uid数组
  31. */
  32. private array $translationChannelUids;
  33. /**
  34. * 临时文件路径集合
  35. */
  36. private array $tempFiles = [];
  37. /**
  38. * 构造函数
  39. *
  40. * @param string $paliChannelUid 巴利原文的channel_uid
  41. * @param array $translationChannelUids 译文版本的channel_uid数组
  42. */
  43. public function __construct(array $translationChannelUids)
  44. {
  45. $this->paliChannelUid = ChannelApi::getSysChannel('_System_Pali_VRI_');
  46. $this->translationChannelUids = $translationChannelUids;
  47. }
  48. /**
  49. * 执行导出并打包
  50. *
  51. * @return string 返回生成的ZIP文件路径
  52. * @throws \Exception
  53. */
  54. public function export(): string
  55. {
  56. try {
  57. // 创建临时目录
  58. $this->createTempDirectory();
  59. // 导出所有译文文件
  60. foreach ($this->translationChannelUids as $channelUid) {
  61. $this->exportTranslation($channelUid);
  62. }
  63. // 打包ZIP文件
  64. $zipPath = $this->createZipArchive();
  65. // 清理临时文件
  66. $this->cleanupTempFiles();
  67. return $zipPath;
  68. } catch (\Exception $e) {
  69. // 发生错误时也要清理临时文件
  70. $this->cleanupTempFiles();
  71. throw $e;
  72. }
  73. }
  74. /**
  75. * 创建临时目录
  76. *
  77. * @return void
  78. */
  79. private function createTempDirectory(): void
  80. {
  81. $tempPath = storage_path('app/' . self::TEMP_DIR);
  82. if (!is_dir($tempPath)) {
  83. mkdir($tempPath, 0755, true);
  84. }
  85. // 创建translations子目录
  86. $translationsPath = $tempPath . '/translations';
  87. if (!is_dir($translationsPath)) {
  88. mkdir($translationsPath, 0755, true);
  89. }
  90. }
  91. /**
  92. * 导出指定译文版本的数据
  93. *
  94. * @param string $channelUid 译文版本的channel_uid
  95. * @return void
  96. */
  97. private function exportTranslation(string $channelUid): void
  98. {
  99. // 获取channel名称
  100. $channelName = $this->getChannelName($channelUid);
  101. // 创建JSONL文件
  102. $filename = $channelName . '.jsonl';
  103. $filepath = storage_path('app/' . self::TEMP_DIR . '/translations/' . $filename);
  104. // 记录临时文件路径
  105. $this->tempFiles[] = $filepath;
  106. // 打开文件准备写入
  107. $handle = fopen($filepath, 'w');
  108. if ($handle === false) {
  109. throw new \RuntimeException("无法创建文件: {$filepath}");
  110. }
  111. try {
  112. // 分批查询并写入数据
  113. $this->writeTranslationData($handle, $channelUid);
  114. } finally {
  115. fclose($handle);
  116. }
  117. }
  118. /**
  119. * 查询并写入译文数据
  120. *
  121. * @param resource $handle 文件句柄
  122. * @param string $channelUid 译文版本的channel_uid
  123. * @return void
  124. */
  125. private function writeTranslationData($handle, string $channelUid): void
  126. {
  127. // 构建查询,联表获取译文和巴利文
  128. DB::table('sentences as s1')
  129. ->select([
  130. 's1.book_id',
  131. 's1.paragraph',
  132. 's1.word_start',
  133. 's1.word_end',
  134. 's1.content as translation',
  135. 's2.content as pali'
  136. ])
  137. ->join('sentences as s2', function ($join) {
  138. $join->on('s1.book_id', '=', 's2.book_id')
  139. ->on('s1.paragraph', '=', 's2.paragraph')
  140. ->on('s1.word_start', '=', 's2.word_start')
  141. ->on('s1.word_end', '=', 's2.word_end')
  142. ->where('s2.channel_uid', '=', $this->paliChannelUid);
  143. })
  144. ->where('s1.channel_uid', '=', $channelUid)
  145. ->whereNotNull('s1.content')
  146. ->where('s1.content', '!=', '')
  147. ->orderBy('s1.book_id')
  148. ->orderBy('s1.paragraph')
  149. ->orderBy('s1.word_start')
  150. ->orderBy('s1.word_end')
  151. ->chunk(self::CHUNK_SIZE, function ($sentences) use ($handle) {
  152. foreach ($sentences as $sentence) {
  153. // 如果没有译文,跳过
  154. if (empty($sentence->translation)) {
  155. continue;
  156. }
  157. // 构建ID
  158. $id = sprintf(
  159. '%s-%s-%s-%s',
  160. $sentence->book_id,
  161. $sentence->paragraph,
  162. $sentence->word_start,
  163. $sentence->word_end
  164. );
  165. // 构建JSON对象
  166. $data = [
  167. 'id' => $id,
  168. 'pali' => $sentence->pali ?? '',
  169. 'translation' => $sentence->translation
  170. ];
  171. // 写入JSONL格式(每行一个JSON对象)
  172. fwrite($handle, json_encode($data, JSON_UNESCAPED_UNICODE) . "\n");
  173. }
  174. });
  175. }
  176. /**
  177. * 获取channel名称
  178. *
  179. * @param string $channelUid channel的uuid
  180. * @return string channel名称,如果找不到则返回uuid
  181. */
  182. private function getChannelName(string $channelUid): string
  183. {
  184. $channel = Channel::where('uid', $channelUid)->first();
  185. return $channel?->name ?? $channelUid;
  186. }
  187. /**
  188. * 创建ZIP压缩包
  189. *
  190. * @return string 返回ZIP文件在Storage中的路径
  191. * @throws \RuntimeException
  192. */
  193. private function createZipArchive(): string
  194. {
  195. $timestamp = now()->format('YmdHis');
  196. $zipFilename = "training_data_{$timestamp}.zip";
  197. $zipPath = storage_path('app/packet/' . $zipFilename);
  198. // 确保packet目录存在
  199. $packetDir = storage_path('app/packet');
  200. if (!is_dir($packetDir)) {
  201. mkdir($packetDir, 0755, true);
  202. }
  203. $zip = new ZipArchive();
  204. if ($zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
  205. throw new \RuntimeException("无法创建ZIP文件: {$zipPath}");
  206. }
  207. try {
  208. // 添加所有JSONL文件到ZIP
  209. $translationsDir = storage_path('app/' . self::TEMP_DIR . '/translations');
  210. if (is_dir($translationsDir)) {
  211. $files = scandir($translationsDir);
  212. foreach ($files as $file) {
  213. if ($file === '.' || $file === '..') {
  214. continue;
  215. }
  216. $filePath = $translationsDir . '/' . $file;
  217. if (is_file($filePath)) {
  218. // 添加到ZIP的translations目录下
  219. $zip->addFile($filePath, 'translations/' . $file);
  220. }
  221. }
  222. }
  223. $zip->close();
  224. } catch (\Exception $e) {
  225. $zip->close();
  226. throw $e;
  227. }
  228. // 返回相对于Storage的路径
  229. return 'packet/' . $zipFilename;
  230. }
  231. /**
  232. * 清理临时文件和目录
  233. *
  234. * @return void
  235. */
  236. private function cleanupTempFiles(): void
  237. {
  238. $tempPath = storage_path('app/' . self::TEMP_DIR);
  239. if (is_dir($tempPath)) {
  240. $this->deleteDirectory($tempPath);
  241. }
  242. }
  243. /**
  244. * 递归删除目录
  245. *
  246. * @param string $dir 目录路径
  247. * @return void
  248. */
  249. private function deleteDirectory(string $dir): void
  250. {
  251. if (!is_dir($dir)) {
  252. return;
  253. }
  254. $files = array_diff(scandir($dir), ['.', '..']);
  255. foreach ($files as $file) {
  256. $path = $dir . '/' . $file;
  257. is_dir($path) ? $this->deleteDirectory($path) : unlink($path);
  258. }
  259. rmdir($dir);
  260. }
  261. }