UpdateCorpus.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Support\Facades\Log;
  4. use App\Services\SentenceService;
  5. use App\Services\TermService;
  6. use Illuminate\Console\Attributes\Description;
  7. use Illuminate\Console\Attributes\Signature;
  8. use Illuminate\Console\Command;
  9. use Illuminate\Support\Facades\DB;
  10. use App\Models\Channel;
  11. use App\Http\Api\UserApi;
  12. #[Signature('app:update-corpus')]
  13. #[Description('Update corpus from JSONL files in corpus directory')]
  14. class UpdateCorpus extends Command
  15. {
  16. /**
  17. * The SentenceService instance.
  18. *
  19. * @var SentenceService
  20. */
  21. protected SentenceService $sentenceService;
  22. protected TermService $termService;
  23. /**
  24. * Create a new command instance.
  25. *
  26. * @param SentenceService $sentenceService
  27. */
  28. public function __construct(SentenceService $sentenceService, TermService $termService)
  29. {
  30. parent::__construct();
  31. $this->sentenceService = $sentenceService;
  32. $this->termService = $termService;
  33. }
  34. /**
  35. * Execute the console command.
  36. *
  37. * @return int
  38. */
  39. public function handle(): int
  40. {
  41. $this->info('Starting corpus update process...');
  42. // Get the corpus base path from config
  43. $corpusBasePath = config('mint.path.corpus');
  44. if (!is_dir($corpusBasePath)) {
  45. $this->error("Corpus directory not found: {$corpusBasePath}");
  46. return self::FAILURE;
  47. }
  48. // Scan subdirectories of the corpus path
  49. $stores = $this->getSubdirectories($corpusBasePath);
  50. if (empty($stores)) {
  51. $this->warn('No subdirectories found in corpus path.');
  52. return self::SUCCESS;
  53. }
  54. $this->info("Found " . count($stores) . " subdirectories to process.");
  55. $totalProcessed = 0;
  56. $totalErrors = 0;
  57. foreach ($stores as $store) {
  58. $this->info("Processing directory: {$store}");
  59. try {
  60. $stats = $this->processCorpusDirectory($store);
  61. $totalProcessed += $stats['processed'];
  62. $totalErrors += $stats['errors'];
  63. $this->info("Directory processed: {$stats['processed']} records saved, {$stats['errors']} errors");
  64. if (isset($stats['channels'])) {
  65. foreach ($stats['channels'] as $key => $channelId) {
  66. $this->call('upgrade:progress', ['--channel' => $channelId]);
  67. $this->call('upgrade:progress.chapter', ['--channel' => $channelId]);
  68. $this->call('opensearch:index-tipitaka', [
  69. 'book' => 0,
  70. '--channel' => $channelId,
  71. '--granularity' => 'chapter',
  72. '--summary' => 'off'
  73. ]);
  74. }
  75. }
  76. } catch (\Exception $e) {
  77. $this->error("Failed to process directory {$store}: {$e->getMessage()}");
  78. Log::error("Failed to process directory", [
  79. 'dir' => $store,
  80. 'message' => $e->getMessage(),
  81. 'file' => $e->getFile(),
  82. 'line' => $e->getLine(),
  83. 'trace' => $e->getTraceAsString(),
  84. ]);
  85. $totalErrors++;
  86. }
  87. }
  88. $this->info("Corpus update completed. Total processed: {$totalProcessed}, Total errors: {$totalErrors}");
  89. return $totalErrors > 0 ? self::FAILURE : self::SUCCESS;
  90. }
  91. /**
  92. * Get all subdirectories of a given directory.
  93. *
  94. * @param string $path
  95. * @return array
  96. */
  97. protected function getSubdirectories(string $path): array
  98. {
  99. $directories = [];
  100. $items = scandir($path);
  101. foreach ($items as $item) {
  102. if ($item === '.' || $item === '..') {
  103. continue;
  104. }
  105. $fullPath = $path . DIRECTORY_SEPARATOR . $item;
  106. if (is_dir($fullPath)) {
  107. $directories[] = $fullPath;
  108. }
  109. }
  110. return $directories;
  111. }
  112. /**
  113. * Process a single corpus directory.
  114. *
  115. * @param string $directoryPath
  116. * @return array
  117. * @throws \Exception
  118. */
  119. protected function processCorpusDirectory(string $directoryPath): array
  120. {
  121. $stats = [
  122. 'processed' => 0,
  123. 'errors' => 0,
  124. ];
  125. // Read meta.json file
  126. $metaFile = $directoryPath . DIRECTORY_SEPARATOR . 'meta.json';
  127. if (!file_exists($metaFile)) {
  128. $this->warn("meta.json not found in directory: {$directoryPath}");
  129. return $stats;
  130. }
  131. $metaData = json_decode(file_get_contents($metaFile), true);
  132. if (!isset($metaData['id'])) {
  133. $this->error("Invalid meta.json: missing 'id' field in {$directoryPath}");
  134. return $stats;
  135. }
  136. $sourceId = $metaData['id'];
  137. $this->info("Processing {$directoryPath} source ID: {$sourceId}");
  138. // Find all channel records with matching source_id
  139. $channels = Channel::where('source_id', $sourceId)->get();
  140. if ($channels->isEmpty()) {
  141. $this->warn("No channels found with source_id: {$sourceId}");
  142. return $stats;
  143. }
  144. $this->info("Found {$channels->count()} channel(s) for source ID: {$sourceId}");
  145. $glossaryFile = $directoryPath . DIRECTORY_SEPARATOR . 'glossary.csv';
  146. if (file_exists($glossaryFile)) {
  147. $status = $this->processGlossary($glossaryFile, $channels);
  148. $this->line('glossary load');
  149. }
  150. // Scan subdirectories of the current directory for JSONL files
  151. $childDirectories = $this->getSubdirectories($directoryPath);
  152. foreach ($childDirectories as $childDir) {
  153. $this->info("Scanning directory for JSONL files: {$childDir}");
  154. $jsonlFiles = glob($childDir . DIRECTORY_SEPARATOR . '*.jsonl');
  155. foreach ($jsonlFiles as $jsonlFile) {
  156. $this->line("Processing file: {$jsonlFile}");
  157. $fileStats = $this->processJsonlFile($jsonlFile, $channels);
  158. $stats['processed'] += $fileStats['processed'];
  159. $stats['errors'] += $fileStats['errors'];
  160. }
  161. }
  162. $stats['channels'] = array_map(fn($item) => $item['uid'], $channels->toArray());
  163. return $stats;
  164. }
  165. /**
  166. * Process a glossary csv file and save glossary for each channel.
  167. *
  168. * @param string $filePath
  169. * @param \Illuminate\Database\Eloquent\Collection $channels
  170. * @return array
  171. */
  172. protected function processGlossary(string $filePath, $channels): array
  173. {
  174. $stats = [
  175. 'processed' => 0,
  176. 'errors' => 0,
  177. ];
  178. $handle = fopen($filePath, 'r');
  179. if (!$handle) {
  180. $this->error("Failed to open file: {$filePath}");
  181. return $stats;
  182. }
  183. $robotUid = config('mint.admin.robot_uuid');
  184. if (!$robotUid) {
  185. $this->error('robot_uuid not configured in mint.admin.robot_uid');
  186. fclose($handle);
  187. return $stats;
  188. }
  189. // 读取表头行
  190. $headers = fgetcsv($handle);
  191. if ($headers === false) {
  192. $this->error("Failed to read CSV headers from: {$filePath}");
  193. fclose($handle);
  194. return $stats;
  195. }
  196. $lineNumber = 0;
  197. while (($row = fgetcsv($handle)) !== false) {
  198. $lineNumber++;
  199. if (count($row) !== count($headers)) {
  200. $this->error("Column count mismatch at line {$lineNumber} in file: {$filePath}");
  201. $stats['errors']++;
  202. continue;
  203. }
  204. $data = array_combine($headers, $row);
  205. $editor_id = UserApi::getIdByUuid($robotUid);
  206. foreach ($channels as $channel) {
  207. try {
  208. $saveData = [
  209. 'word' => $data['pali_word'],
  210. 'tag' => $data['tag'] ?? null,
  211. 'channel_id' => $channel->uid,
  212. 'meaning' => $data['meaning'],
  213. 'redirect' => $data['redirect'] ?? null,
  214. 'other_meaning' => $data['meaning2'] ?: null,
  215. 'note' => $data['note'] ?: null,
  216. 'editor_id' => $editor_id,
  217. ];
  218. DB::transaction(function () use ($saveData) {
  219. $this->termService->updateOrCreateByWord($saveData);
  220. });
  221. $stats['processed']++;
  222. } catch (\Exception $e) {
  223. $this->error("Failed to save glossary for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
  224. $stats['errors']++;
  225. }
  226. }
  227. }
  228. fclose($handle);
  229. $this->line("glossary {$lineNumber} lines processed");
  230. return $stats;
  231. }
  232. /**
  233. * Process a single JSONL file and save records for each channel.
  234. *
  235. * @param string $filePath
  236. * @param \Illuminate\Database\Eloquent\Collection $channels
  237. * @return array
  238. */
  239. protected function processJsonlFile(string $filePath, $channels): array
  240. {
  241. $stats = [
  242. 'processed' => 0,
  243. 'errors' => 0,
  244. ];
  245. $handle = fopen($filePath, 'r');
  246. if (!$handle) {
  247. $this->error("Failed to open file: {$filePath}");
  248. return $stats;
  249. }
  250. $lineNumber = 0;
  251. $robotUid = config('mint.admin.robot_uuid');
  252. if (!$robotUid) {
  253. $this->error('robot_uuid not configured in mint.admin.robot_uuid');
  254. fclose($handle);
  255. return $stats;
  256. }
  257. while (($line = fgets($handle)) !== false) {
  258. $lineNumber++;
  259. $line = trim($line);
  260. if (empty($line)) {
  261. continue;
  262. }
  263. // Parse JSON line
  264. $data = json_decode($line, true);
  265. if ($data === null) {
  266. $this->error("Failed to parse JSON at line {$lineNumber} in file: {$filePath}");
  267. $stats['errors']++;
  268. continue;
  269. }
  270. // Save for each channel
  271. foreach ($channels as $channel) {
  272. try {
  273. [$book, $para, $start, $end] = explode('-', $data['id']);
  274. $saveData = [
  275. 'book_id' => $book,
  276. 'paragraph' => $para,
  277. 'word_start' => $start,
  278. 'word_end' => $end,
  279. 'content' => $data['content'],
  280. 'channel_uid' => $channel->uid,
  281. 'editor_uid' => $robotUid,
  282. ];
  283. DB::transaction(function () use ($saveData) {
  284. $this->sentenceService->save($saveData);
  285. });
  286. $stats['processed']++;
  287. //$this->line("Saved record for channel: {$channel->uid}");
  288. } catch (\Exception $e) {
  289. $this->error("Failed to save record for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
  290. $stats['errors']++;
  291. }
  292. }
  293. }
  294. fclose($handle);
  295. $this->line("$lineNumber lines write");
  296. return $stats;
  297. }
  298. }