PacketService.php 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. <?php
  2. namespace App\Services;
  3. use App\Models\Channel;
  4. use Illuminate\Support\Facades\DB;
  5. use Illuminate\Support\Facades\Storage;
  6. use ZipArchive;
  7. use App\Http\Api\ChannelApi;
  8. use Illuminate\Support\Facades\Log;
  9. use Illuminate\Support\Facades\Cache;
  10. use Illuminate\Support\Facades\App;
  11. /**
  12. * PacketService
  13. *
  14. * 用于导出句子数据为训练数据包的服务类
  15. * 将指定版本的译文与巴利原文配对导出为JSONL格式,并打包为ZIP文件
  16. */
  17. class PacketService
  18. {
  19. /**
  20. * 每批处理的记录数
  21. */
  22. private const CHUNK_SIZE = 1000;
  23. /**
  24. * 临时文件存储路径
  25. */
  26. private const TEMP_DIR = 'temp/packet';
  27. /**
  28. * 巴利原文的channel_uid
  29. */
  30. private string $paliChannelUid;
  31. /**
  32. * 译文版本的channel_uid数组
  33. */
  34. private array $translationChannelUids;
  35. /**
  36. * 临时文件路径集合
  37. */
  38. private array $tempFiles = [];
  39. /**
  40. *
  41. *
  42. * @param string $paliChannelUid 巴利原文的channel_uid
  43. * @param array $translationChannelUids 译文版本的channel_uid数组
  44. */
  45. public function channels(array $translationChannelUids)
  46. {
  47. $this->paliChannelUid = ChannelApi::getSysChannel('_System_Pali_VRI_');
  48. $this->translationChannelUids = $translationChannelUids;
  49. }
  50. /**
  51. * 执行导出并打包
  52. *
  53. * @return string 返回生成的ZIP文件路径
  54. * @throws \Exception
  55. */
  56. public function export(): string
  57. {
  58. try {
  59. // 创建临时目录
  60. $this->createTempDirectory();
  61. // 导出所有译文文件
  62. foreach ($this->translationChannelUids as $channelUid) {
  63. $this->exportTranslation($channelUid);
  64. }
  65. // 打包ZIP文件
  66. $zipPath = $this->createZipArchive();
  67. // 清理临时文件
  68. $this->cleanupTempFiles();
  69. return $zipPath;
  70. } catch (\Exception $e) {
  71. // 发生错误时也要清理临时文件
  72. $this->cleanupTempFiles();
  73. throw $e;
  74. }
  75. }
  76. /**
  77. * 创建临时目录
  78. *
  79. * @return void
  80. */
  81. private function createTempDirectory(): void
  82. {
  83. $tempPath = storage_path('app/' . self::TEMP_DIR);
  84. if (!is_dir($tempPath)) {
  85. mkdir($tempPath, 0755, true);
  86. }
  87. // 创建translations子目录
  88. $translationsPath = $tempPath . '/translations';
  89. if (!is_dir($translationsPath)) {
  90. mkdir($translationsPath, 0755, true);
  91. }
  92. }
  93. /**
  94. * 导出指定译文版本的数据
  95. *
  96. * @param string $channelUid 译文版本的channel_uid
  97. * @return void
  98. */
  99. private function exportTranslation(string $channelUid): void
  100. {
  101. // 获取channel名称
  102. $channelName = $this->getChannelName($channelUid);
  103. // 创建JSONL文件
  104. $filename = $channelName . '.jsonl';
  105. $filepath = storage_path('app/' . self::TEMP_DIR . '/translations/' . $filename);
  106. // 记录临时文件路径
  107. $this->tempFiles[] = $filepath;
  108. // 打开文件准备写入
  109. $handle = fopen($filepath, 'w');
  110. if ($handle === false) {
  111. throw new \RuntimeException("无法创建文件: {$filepath}");
  112. }
  113. try {
  114. // 分批查询并写入数据
  115. $this->writeTranslationData($handle, $channelUid);
  116. } finally {
  117. fclose($handle);
  118. }
  119. }
  120. /**
  121. * 查询并写入译文数据
  122. *
  123. * @param resource $handle 文件句柄
  124. * @param string $channelUid 译文版本的channel_uid
  125. * @return void
  126. */
  127. private function writeTranslationData($handle, string $channelUid): void
  128. {
  129. // 构建查询,联表获取译文和巴利文
  130. DB::table('sentences as s1')
  131. ->select([
  132. 's1.book_id',
  133. 's1.paragraph',
  134. 's1.word_start',
  135. 's1.word_end',
  136. 's1.content as translation',
  137. 's2.content as pali'
  138. ])
  139. ->join('sentences as s2', function ($join) {
  140. $join->on('s1.book_id', '=', 's2.book_id')
  141. ->on('s1.paragraph', '=', 's2.paragraph')
  142. ->on('s1.word_start', '=', 's2.word_start')
  143. ->on('s1.word_end', '=', 's2.word_end')
  144. ->where('s2.channel_uid', '=', $this->paliChannelUid);
  145. })
  146. ->where('s1.channel_uid', '=', $channelUid)
  147. ->whereNotNull('s1.content')
  148. ->where('s1.content', '!=', '')
  149. ->orderBy('s1.book_id')
  150. ->orderBy('s1.paragraph')
  151. ->orderBy('s1.word_start')
  152. ->orderBy('s1.word_end')
  153. ->chunk(self::CHUNK_SIZE, function ($sentences) use ($handle) {
  154. foreach ($sentences as $sentence) {
  155. // 如果没有译文,跳过
  156. if (empty($sentence->translation)) {
  157. continue;
  158. }
  159. // 构建ID
  160. $id = sprintf(
  161. '%s-%s-%s-%s',
  162. $sentence->book_id,
  163. $sentence->paragraph,
  164. $sentence->word_start,
  165. $sentence->word_end
  166. );
  167. // 构建JSON对象
  168. $data = [
  169. 'id' => $id,
  170. 'pali' => $sentence->pali ?? '',
  171. 'translation' => $sentence->translation
  172. ];
  173. // 写入JSONL格式(每行一个JSON对象)
  174. fwrite($handle, json_encode($data, JSON_UNESCAPED_UNICODE) . "\n");
  175. }
  176. });
  177. }
  178. /**
  179. * 获取channel名称
  180. *
  181. * @param string $channelUid channel的uuid
  182. * @return string channel名称,如果找不到则返回uuid
  183. */
  184. private function getChannelName(string $channelUid): string
  185. {
  186. $channel = Channel::where('uid', $channelUid)->first();
  187. return $channel?->name ?? $channelUid;
  188. }
  189. /**
  190. * 创建ZIP压缩包
  191. *
  192. * @return string 返回ZIP文件在Storage中的路径
  193. * @throws \RuntimeException
  194. */
  195. private function createZipArchive(): string
  196. {
  197. $timestamp = now()->format('YmdHis');
  198. $zipFilename = "training_data_{$timestamp}.zip";
  199. $zipPath = storage_path('app/packet/' . $zipFilename);
  200. // 确保packet目录存在
  201. $packetDir = storage_path('app/packet');
  202. if (!is_dir($packetDir)) {
  203. mkdir($packetDir, 0755, true);
  204. }
  205. $zip = new ZipArchive();
  206. if ($zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
  207. throw new \RuntimeException("无法创建ZIP文件: {$zipPath}");
  208. }
  209. try {
  210. // 添加所有JSONL文件到ZIP
  211. $translationsDir = storage_path('app/' . self::TEMP_DIR . '/translations');
  212. if (is_dir($translationsDir)) {
  213. $files = scandir($translationsDir);
  214. foreach ($files as $file) {
  215. if ($file === '.' || $file === '..') {
  216. continue;
  217. }
  218. $filePath = $translationsDir . '/' . $file;
  219. if (is_file($filePath)) {
  220. // 添加到ZIP的translations目录下
  221. $zip->addFile($filePath, 'translations/' . $file);
  222. }
  223. }
  224. }
  225. $zip->close();
  226. } catch (\Exception $e) {
  227. $zip->close();
  228. throw $e;
  229. }
  230. // 返回相对于Storage的路径
  231. return 'packet/' . $zipFilename;
  232. }
  233. /**
  234. * 清理临时文件和目录
  235. *
  236. * @return void
  237. */
  238. private function cleanupTempFiles(): void
  239. {
  240. $tempPath = storage_path('app/' . self::TEMP_DIR);
  241. if (is_dir($tempPath)) {
  242. $this->deleteDirectory($tempPath);
  243. }
  244. }
  245. /**
  246. * 递归删除目录
  247. *
  248. * @param string $dir 目录路径
  249. * @return void
  250. */
  251. private function deleteDirectory(string $dir): void
  252. {
  253. if (!is_dir($dir)) {
  254. return;
  255. }
  256. $files = array_diff(scandir($dir), ['.', '..']);
  257. foreach ($files as $file) {
  258. $path = $dir . '/' . $file;
  259. is_dir($path) ? $this->deleteDirectory($path) : unlink($path);
  260. }
  261. rmdir($dir);
  262. }
  263. public function index(?string $id = null)
  264. {
  265. $key = '/offline/index';
  266. if (!Cache::has($key)) {
  267. return [];
  268. }
  269. $fileInfo = Cache::get($key);
  270. $output = [];
  271. foreach ($fileInfo as $key => $file) {
  272. if ($id) {
  273. if ($file['id'] !== $id) {
  274. continue;
  275. }
  276. }
  277. $zipFile = $file['filename'];
  278. $bucket = config('mint.attachments.bucket_name.temporary');
  279. $tmpFile = $bucket . '/' . $zipFile;
  280. $url = array();
  281. foreach (config('mint.server.cdn_urls') as $key => $cdn) {
  282. $url[] = [
  283. 'link' => $cdn . '/' . $zipFile,
  284. 'hostname' => 'cdn-' . $key,
  285. ];
  286. }
  287. if (App::environment('local')) {
  288. $s3Link = Storage::url($tmpFile);
  289. } else {
  290. try {
  291. $s3Link = Storage::temporaryUrl($tmpFile, now()->addDays(2));
  292. } catch (\Exception $e) {
  293. Log::error('offline-index {Exception}', ['exception' => $e]);
  294. continue;
  295. }
  296. }
  297. $url[] = [
  298. 'link' => $s3Link,
  299. 'hostname' => 'Amazon cloud storage(Hongkong)',
  300. ];
  301. $file['url'] = $url;
  302. Log::debug('offline-index: file info=', ['data' => $file]);
  303. $output[] = $file;
  304. }
  305. return $output;
  306. }
  307. }