UpdateCorpus.php 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. <?php
  2. namespace App\Console\Commands;
use App\Models\Channel;
use App\Services\SentenceService;
use App\Services\TermService;
use Illuminate\Console\Attributes\Description;
use Illuminate\Console\Attributes\Signature;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
  9. #[Signature('app:update-corpus')]
  10. #[Description('Update corpus from JSONL files in corpus directory')]
  11. class UpdateCorpus extends Command
  12. {
  13. /**
  14. * The SentenceService instance.
  15. *
  16. * @var SentenceService
  17. */
  18. protected SentenceService $sentenceService;
  19. protected TermService $termService;
  20. /**
  21. * Create a new command instance.
  22. *
  23. * @param SentenceService $sentenceService
  24. */
  25. public function __construct(SentenceService $sentenceService)
  26. {
  27. public function __construct(SentenceService $sentenceService, TermService $termService)
  28. {
  29. parent::__construct();
  30. $this->sentenceService = $sentenceService;
  31. $this->termService = $termService;
  32. }
  33. /**
  34. * Execute the console command.
  35. *
  36. * @return int
  37. */
  38. public function handle(): int
  39. {
  40. $this->info('Starting corpus update process...');
  41. // Get the corpus base path from config
  42. $corpusBasePath = config('mint.path.corpus');
  43. if (!is_dir($corpusBasePath)) {
  44. $this->error("Corpus directory not found: {$corpusBasePath}");
  45. return self::FAILURE;
  46. }
  47. // Scan subdirectories of the corpus path
  48. $subdirectories = $this->getSubdirectories($corpusBasePath);
  49. if (empty($subdirectories)) {
  50. $this->warn('No subdirectories found in corpus path.');
  51. return self::SUCCESS;
  52. }
  53. $this->info("Found " . count($subdirectories) . " subdirectories to process.");
  54. $totalProcessed = 0;
  55. $totalErrors = 0;
  56. foreach ($subdirectories as $subdir) {
  57. $this->info("Processing directory: {$subdir}");
  58. try {
  59. $stats = $this->processCorpusDirectory($subdir);
  60. $totalProcessed += $stats['processed'];
  61. $totalErrors += $stats['errors'];
  62. $this->info("Directory processed: {$stats['processed']} records saved, {$stats['errors']} errors");
  63. } catch (\Exception $e) {
  64. $this->error("Failed to process directory {$subdir}: {$e->getMessage()}");
  65. $totalErrors++;
  66. }
  67. }
  68. $this->info("Corpus update completed. Total processed: {$totalProcessed}, Total errors: {$totalErrors}");
  69. return $totalErrors > 0 ? self::FAILURE : self::SUCCESS;
  70. }
  71. /**
  72. * Get all subdirectories of a given directory.
  73. *
  74. * @param string $path
  75. * @return array
  76. */
  77. protected function getSubdirectories(string $path): array
  78. {
  79. $directories = [];
  80. $items = scandir($path);
  81. foreach ($items as $item) {
  82. if ($item === '.' || $item === '..') {
  83. continue;
  84. }
  85. $fullPath = $path . DIRECTORY_SEPARATOR . $item;
  86. if (is_dir($fullPath)) {
  87. $directories[] = $fullPath;
  88. }
  89. }
  90. return $directories;
  91. }
  92. /**
  93. * Process a single corpus directory.
  94. *
  95. * @param string $directoryPath
  96. * @return array
  97. * @throws \Exception
  98. */
  99. protected function processCorpusDirectory(string $directoryPath): array
  100. {
  101. $stats = [
  102. 'processed' => 0,
  103. 'errors' => 0,
  104. ];
  105. // Read meta.json file
  106. $metaFile = $directoryPath . DIRECTORY_SEPARATOR . 'meta.json';
  107. if (!file_exists($metaFile)) {
  108. $this->warn("meta.json not found in directory: {$directoryPath}");
  109. return $stats;
  110. }
  111. $metaData = json_decode(file_get_contents($metaFile), true);
  112. if (!isset($metaData['id'])) {
  113. $this->error("Invalid meta.json: missing 'id' field in {$directoryPath}");
  114. return $stats;
  115. }
  116. $sourceId = $metaData['id'];
  117. $this->info("Processing {$directoryPath} source ID: {$sourceId}");
  118. // Find all channel records with matching source_id
  119. $channels = Channel::where('source_id', $sourceId)->get();
  120. if ($channels->isEmpty()) {
  121. $this->warn("No channels found with source_id: {$sourceId}");
  122. return $stats;
  123. }
  124. $this->info("Found {$channels->count()} channel(s) for source ID: {$sourceId}");
  125. // Scan subdirectories of the current directory for JSONL files
  126. $childDirectories = $this->getSubdirectories($directoryPath);
  127. foreach ($childDirectories as $childDir) {
  128. $this->info("Scanning directory for JSONL files: {$childDir}");
  129. $jsonlFiles = glob($childDir . DIRECTORY_SEPARATOR . '*.jsonl');
  130. foreach ($jsonlFiles as $jsonlFile) {
  131. $this->line("Processing file: {$jsonlFile}");
  132. $fileStats = $this->processJsonlFile($jsonlFile, $channels);
  133. $stats['processed'] += $fileStats['processed'];
  134. $stats['errors'] += $fileStats['errors'];
  135. }
  136. }
  137. return $stats;
  138. }
  139. /**
  140. * Process a single JSONL file and save records for each channel.
  141. *
  142. * @param string $filePath
  143. * @param \Illuminate\Database\Eloquent\Collection $channels
  144. * @return array
  145. */
  146. protected function processJsonlFile(string $filePath, $channels): array
  147. {
  148. $stats = [
  149. 'processed' => 0,
  150. 'errors' => 0,
  151. ];
  152. $handle = fopen($filePath, 'r');
  153. if (!$handle) {
  154. $this->error("Failed to open file: {$filePath}");
  155. return $stats;
  156. }
  157. $lineNumber = 0;
  158. $robotUid = config('mint.admin.robot_uuid');
  159. if (!$robotUid) {
  160. $this->error('robot_uuid not configured in mint.admin.robot_uuid');
  161. fclose($handle);
  162. return $stats;
  163. }
  164. while (($line = fgets($handle)) !== false) {
  165. $lineNumber++;
  166. $line = trim($line);
  167. if (empty($line)) {
  168. continue;
  169. }
  170. // Parse JSON line
  171. $data = json_decode($line, true);
  172. if ($data === null) {
  173. $this->error("Failed to parse JSON at line {$lineNumber} in file: {$filePath}");
  174. $stats['errors']++;
  175. continue;
  176. }
  177. // Save for each channel
  178. foreach ($channels as $channel) {
  179. try {
  180. $saveData = [
  181. 'book_id' => $data['book'],
  182. 'paragraph' => $data['paragraph'],
  183. 'word_start' => $data['start'],
  184. 'word_end' => $data['end'],
  185. 'content' => $data['content'],
  186. 'channel_uid' => $channel->uid,
  187. 'editor_uid' => $robotUid,
  188. ];
  189. DB::transaction(function () use ($saveData) {
  190. $this->sentenceService->save($saveData);
  191. });
  192. $stats['processed']++;
  193. //$this->line("Saved record for channel: {$channel->uid}");
  194. } catch (\Exception $e) {
  195. $this->error("Failed to save record for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
  196. $stats['errors']++;
  197. }
  198. }
  199. }
  200. fclose($handle);
  201. $this->line("$lineNumber lines write");
  202. return $stats;
  203. }
  204. }