UpdateCorpus.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. <?php
  2. namespace App\Console\Commands;
  3. use Illuminate\Support\Facades\Log;
  4. use App\Services\SentenceService;
  5. use App\Services\TermService;
  6. use Illuminate\Console\Attributes\Description;
  7. use Illuminate\Console\Attributes\Signature;
  8. use Illuminate\Console\Command;
  9. use Illuminate\Support\Facades\DB;
  10. use App\Models\Channel;
  11. use App\Http\Api\UserApi;
  #[Signature('app:update-corpus {--dir= : Corpus base directory (overrides the configured path)} {--es : Refresh progress and the search index for each updated channel}')]
  #[Description('Update corpus from JSONL files in corpus directory')]
  class UpdateCorpus extends Command
  {
      /**
       * Columns a glossary CSV must provide; every other column is optional.
       */
      private const REQUIRED_GLOSSARY_COLUMNS = ['pali_word', 'meaning'];

      /**
       * Create a new command instance.
       *
       * @param SentenceService $sentenceService Used to save the sentence records read from JSONL files.
       * @param TermService     $termService     Used to save the glossary rows read from glossary.csv.
       */
      public function __construct(
          protected SentenceService $sentenceService,
          protected TermService $termService,
      ) {
          parent::__construct();
      }

      /**
       * Execute the console command.
       *
       * Walks every subdirectory of the corpus base path, imports each one and,
       * when --es is given, re-runs the progress and indexing commands for the
       * channels that were touched.
       *
       * @return int self::SUCCESS when no error was counted, self::FAILURE otherwise.
       */
      public function handle(): int
      {
          $this->info('Starting corpus update process...');

          // --dir wins; otherwise fall back to the configured corpus path.
          $corpusBasePath = (string) ($this->option('dir') ?: config('mint.path.corpus'));

          if (!is_dir($corpusBasePath)) {
              $this->error("Corpus directory not found: {$corpusBasePath}");
              return self::FAILURE;
          }

          // Each direct subdirectory of the corpus path is one source to import.
          $stores = $this->getSubdirectories($corpusBasePath);

          if (empty($stores)) {
              $this->warn('No subdirectories found in corpus path.');
              return self::SUCCESS;
          }

          $this->info("Found " . count($stores) . " subdirectories to process.");

          $totalProcessed = 0;
          $totalErrors = 0;

          foreach ($stores as $store) {
              $this->info("Processing directory: {$store}");

              try {
                  $stats = $this->processCorpusDirectory($store);
                  $totalProcessed += $stats['processed'];
                  $totalErrors += $stats['errors'];
                  $this->info("Directory processed: {$stats['processed']} records saved, {$stats['errors']} errors");

                  if ($this->option('es')) {
                      $this->refreshChannels($stats['channels'] ?? []);
                  }
              } catch (\Exception $e) {
                  // One broken directory must not abort the remaining ones.
                  $this->error("Failed to process directory {$store}: {$e->getMessage()}");
                  Log::error("Failed to process directory", [
                      'dir' => $store,
                      'message' => $e->getMessage(),
                      'file' => $e->getFile(),
                      'line' => $e->getLine(),
                      'trace' => $e->getTraceAsString(),
                  ]);
                  $totalErrors++;
              }
          }

          $this->info("Corpus update completed. Total processed: {$totalProcessed}, Total errors: {$totalErrors}");

          return $totalErrors > 0 ? self::FAILURE : self::SUCCESS;
      }

      /**
       * Run the progress and indexing commands for each given channel.
       *
       * @param array $channelIds Channel uids as returned under the 'channels' key of processCorpusDirectory().
       * @return void
       */
      private function refreshChannels(array $channelIds): void
      {
          foreach ($channelIds as $channelId) {
              $this->call('upgrade:progress', ['--channel' => $channelId]);
              $this->call('upgrade:progress.chapter', ['--channel' => $channelId]);
              $this->call('opensearch:index-tipitaka', [
                  'book' => 0,
                  '--channel' => $channelId,
                  '--granularity' => 'chapter',
                  '--summary' => 'off',
              ]);
          }
      }

      /**
       * Get all direct subdirectories of a given directory.
       *
       * @param string $path
       * @return array Full paths of the subdirectories; empty when the directory cannot be read.
       */
      protected function getSubdirectories(string $path): array
      {
          $items = scandir($path);

          if ($items === false) {
              $this->warn("Unable to read directory: {$path}");
              return [];
          }

          $directories = [];

          foreach ($items as $item) {
              if ($item === '.' || $item === '..') {
                  continue;
              }

              $fullPath = $path . DIRECTORY_SEPARATOR . $item;

              if (is_dir($fullPath)) {
                  $directories[] = $fullPath;
              }
          }

          return $directories;
      }

      /**
       * Process a single corpus directory.
       *
       * Reads meta.json to find the source id, looks up the channels bound to
       * that source, loads glossary.csv when present, then imports every
       * *.jsonl file found in the directory's immediate subdirectories.
       *
       * @param string $directoryPath
       * @return array 'processed' and 'errors' counters (glossary rows included),
       *               plus 'channels' (list of channel uids) when channels were found.
       * @throws \Exception
       */
      protected function processCorpusDirectory(string $directoryPath): array
      {
          $stats = [
              'processed' => 0,
              'errors' => 0,
          ];

          $metaFile = $directoryPath . DIRECTORY_SEPARATOR . 'meta.json';

          if (!file_exists($metaFile)) {
              $this->warn("meta.json not found in directory: {$directoryPath}");
              return $stats;
          }

          $rawMeta = file_get_contents($metaFile);
          $metaData = $rawMeta === false ? null : json_decode($rawMeta, true);

          if (!is_array($metaData) || !isset($metaData['id'])) {
              $this->error("Invalid meta.json: missing 'id' field in {$directoryPath}");
              // Count it so the command does not report success for a broken source.
              $stats['errors']++;
              return $stats;
          }

          $sourceId = $metaData['id'];
          $this->info("Processing {$directoryPath} source ID: {$sourceId}");

          // Every channel bound to this source receives a copy of the data.
          $channels = Channel::where('source_id', $sourceId)->get();

          if ($channels->isEmpty()) {
              $this->warn("No channels found with source_id: {$sourceId}");
              return $stats;
          }

          $this->info("Found {$channels->count()} channel(s) for source ID: {$sourceId}");

          $glossaryFile = $directoryPath . DIRECTORY_SEPARATOR . 'glossary.csv';

          if (file_exists($glossaryFile)) {
              // Fold glossary results into the totals so glossary failures reach the exit code.
              $glossaryStats = $this->processGlossary($glossaryFile, $channels);
              $stats['processed'] += $glossaryStats['processed'];
              $stats['errors'] += $glossaryStats['errors'];
              $this->line('glossary load');
          }

          // JSONL files live one level below the source directory.
          foreach ($this->getSubdirectories($directoryPath) as $childDir) {
              $this->info("Scanning directory for JSONL files: {$childDir}");

              // glob() returns false on error; treat that as "no files".
              $jsonlFiles = glob($childDir . DIRECTORY_SEPARATOR . '*.jsonl') ?: [];

              foreach ($jsonlFiles as $jsonlFile) {
                  $this->line("Processing file: {$jsonlFile}");
                  $fileStats = $this->processJsonlFile($jsonlFile, $channels);
                  $stats['processed'] += $fileStats['processed'];
                  $stats['errors'] += $fileStats['errors'];
              }
          }

          $stats['channels'] = array_map(static fn ($item) => $item['uid'], $channels->toArray());

          return $stats;
      }

      /**
       * Process a glossary CSV file and save each row for every channel.
       *
       * The first row is the header. 'pali_word' and 'meaning' are required
       * columns; 'tag', 'redirect', 'meaning2' and 'note' are optional. Empty
       * 'meaning2' / 'note' values are stored as null.
       *
       * @param string $filePath
       * @param \Illuminate\Database\Eloquent\Collection $channels
       * @return array 'processed' (rows saved, counted per channel) and 'errors'.
       */
      protected function processGlossary(string $filePath, $channels): array
      {
          $stats = [
              'processed' => 0,
              'errors' => 0,
          ];

          $robotUid = config('mint.admin.robot_uuid');

          if (!$robotUid) {
              $this->error('robot_uuid not configured in mint.admin.robot_uuid');
              $stats['errors']++;
              return $stats;
          }

          $handle = fopen($filePath, 'r');

          if (!$handle) {
              $this->error("Failed to open file: {$filePath}");
              $stats['errors']++;
              return $stats;
          }

          // Counts data rows; the header row is not included.
          $lineNumber = 0;

          try {
              // Read the header row.
              $headers = fgetcsv($handle);

              if ($headers === false) {
                  $this->error("Failed to read CSV headers from: {$filePath}");
                  $stats['errors']++;
                  return $stats;
              }

              // A UTF-8 BOM would otherwise become part of the first column name.
              $firstHeader = (string) $headers[0];
              if (str_starts_with($firstHeader, "\xEF\xBB\xBF")) {
                  $firstHeader = substr($firstHeader, 3);
              }
              $headers[0] = $firstHeader;

              $missing = array_diff(self::REQUIRED_GLOSSARY_COLUMNS, $headers);

              if ($missing !== []) {
                  $this->error('Missing required column(s) ' . implode(', ', $missing) . " in file: {$filePath}");
                  $stats['errors']++;
                  return $stats;
              }

              // The editor is the same for every row, so resolve it once.
              $editorId = UserApi::getIdByUuid($robotUid);
              $columnCount = count($headers);

              while (($row = fgetcsv($handle)) !== false) {
                  $lineNumber++;

                  // fgetcsv() yields [null] for a blank line; skip it silently.
                  if ($row === [null]) {
                      continue;
                  }

                  if (count($row) !== $columnCount) {
                      $this->error("Column count mismatch at line {$lineNumber} in file: {$filePath}");
                      $stats['errors']++;
                      continue;
                  }

                  $data = array_combine($headers, $row);
                  $otherMeaning = ($data['meaning2'] ?? '') ?: null;
                  $note = ($data['note'] ?? '') ?: null;

                  foreach ($channels as $channel) {
                      try {
                          $saveData = [
                              'word' => $data['pali_word'],
                              'tag' => $data['tag'] ?? null,
                              'channel_id' => $channel->uid,
                              'meaning' => $data['meaning'],
                              'redirect' => $data['redirect'] ?? null,
                              'other_meaning' => $otherMeaning,
                              'note' => $note,
                              'editor_id' => $editorId,
                          ];

                          DB::transaction(function () use ($saveData): void {
                              $this->termService->updateOrCreateByWord($saveData);
                          });

                          $stats['processed']++;
                      } catch (\Exception $e) {
                          $this->error("Failed to save glossary for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
                          $stats['errors']++;
                      }
                  }
              }
          } finally {
              // Release the handle on every exit path, including exceptions.
              fclose($handle);
          }

          $this->line("glossary {$lineNumber} lines processed");

          return $stats;
      }

      /**
       * Process a single JSONL file and save each record for every channel.
       *
       * Every non-blank line must be a JSON object with an 'id' of the form
       * book-paragraph-start-end and a 'content' key. A line that fails this
       * check is counted as one error and skipped.
       *
       * @param string $filePath
       * @param \Illuminate\Database\Eloquent\Collection $channels
       * @return array 'processed' (records saved, counted per channel) and 'errors'.
       */
      protected function processJsonlFile(string $filePath, $channels): array
      {
          $stats = [
              'processed' => 0,
              'errors' => 0,
          ];

          $robotUid = config('mint.admin.robot_uuid');

          if (!$robotUid) {
              $this->error('robot_uuid not configured in mint.admin.robot_uuid');
              $stats['errors']++;
              return $stats;
          }

          $handle = fopen($filePath, 'r');

          if (!$handle) {
              $this->error("Failed to open file: {$filePath}");
              $stats['errors']++;
              return $stats;
          }

          $lineNumber = 0;

          try {
              while (($line = fgets($handle)) !== false) {
                  $lineNumber++;
                  $line = trim($line);

                  if ($line === '') {
                      continue;
                  }

                  $data = json_decode($line, true);

                  // Scalars and null decode without error but are not usable records.
                  if (!is_array($data)) {
                      $this->error("Failed to parse JSON at line {$lineNumber} in file: {$filePath}");
                      $stats['errors']++;
                      continue;
                  }

                  // The id does not depend on the channel, so validate and split it once per line.
                  $idParts = is_string($data['id'] ?? null) ? explode('-', $data['id']) : [];

                  if (count($idParts) < 4 || !array_key_exists('content', $data)) {
                      $this->error("Invalid record (expected 'id' as book-para-start-end and 'content') at line {$lineNumber} in file: {$filePath}");
                      $stats['errors']++;
                      continue;
                  }

                  [$book, $para, $start, $end] = $idParts;

                  foreach ($channels as $channel) {
                      try {
                          $saveData = [
                              'book_id' => $book,
                              'paragraph' => $para,
                              'word_start' => $start,
                              'word_end' => $end,
                              'content' => $data['content'],
                              'channel_uid' => $channel->uid,
                              'editor_uid' => $robotUid,
                          ];

                          DB::transaction(function () use ($saveData): void {
                              $this->sentenceService->save($saveData);
                          });

                          $stats['processed']++;
                      } catch (\Exception $e) {
                          $this->error("Failed to save record for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
                          $stats['errors']++;
                      }
                  }
              }
          } finally {
              // Release the handle on every exit path, including exceptions.
              fclose($handle);
          }

          $this->line("$lineNumber lines write");

          return $stats;
      }
  }