|
|
@@ -2,13 +2,20 @@
|
|
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
|
|
+use Illuminate\Support\Facades\Log;
|
|
|
+
|
|
|
+
|
|
|
use App\Services\SentenceService;
|
|
|
+use App\Services\TermService;
|
|
|
use Illuminate\Console\Attributes\Description;
|
|
|
use Illuminate\Console\Attributes\Signature;
|
|
|
use Illuminate\Console\Command;
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
use App\Models\Channel;
|
|
|
|
|
|
+use App\Http\Api\UserApi;
|
|
|
+
|
|
|
+
|
|
|
#[Signature('app:update-corpus')]
|
|
|
#[Description('Update corpus from JSONL files in corpus directory')]
|
|
|
class UpdateCorpus extends Command
|
|
|
@@ -27,8 +34,6 @@ class UpdateCorpus extends Command
|
|
|
*
|
|
|
* @param SentenceService $sentenceService
|
|
|
*/
|
|
|
- public function __construct(SentenceService $sentenceService)
|
|
|
- {
|
|
|
public function __construct(SentenceService $sentenceService, TermService $termService)
|
|
|
{
|
|
|
parent::__construct();
|
|
|
@@ -54,34 +59,55 @@ class UpdateCorpus extends Command
|
|
|
}
|
|
|
|
|
|
// Scan subdirectories of the corpus path
|
|
|
- $subdirectories = $this->getSubdirectories($corpusBasePath);
|
|
|
+ $stores = $this->getSubdirectories($corpusBasePath);
|
|
|
|
|
|
- if (empty($subdirectories)) {
|
|
|
+ if (empty($stores)) {
|
|
|
$this->warn('No subdirectories found in corpus path.');
|
|
|
return self::SUCCESS;
|
|
|
}
|
|
|
|
|
|
- $this->info("Found " . count($subdirectories) . " subdirectories to process.");
|
|
|
+ $this->info("Found " . count($stores) . " subdirectories to process.");
|
|
|
|
|
|
$totalProcessed = 0;
|
|
|
$totalErrors = 0;
|
|
|
|
|
|
- foreach ($subdirectories as $subdir) {
|
|
|
- $this->info("Processing directory: {$subdir}");
|
|
|
+ foreach ($stores as $store) {
|
|
|
+ $this->info("Processing directory: {$store}");
|
|
|
|
|
|
try {
|
|
|
- $stats = $this->processCorpusDirectory($subdir);
|
|
|
+ $stats = $this->processCorpusDirectory($store);
|
|
|
$totalProcessed += $stats['processed'];
|
|
|
$totalErrors += $stats['errors'];
|
|
|
$this->info("Directory processed: {$stats['processed']} records saved, {$stats['errors']} errors");
|
|
|
+ if (isset($stats['channels'])) {
|
|
|
+ foreach ($stats['channels'] as $key => $channelId) {
|
|
|
+ $this->call('upgrade:progress', ['--channel' => $channelId]);
|
|
|
+ $this->call('upgrade:progress.chapter', ['--channel' => $channelId]);
|
|
|
+ $this->call('opensearch:index-tipitaka', [
|
|
|
+ 'book' => 0,
|
|
|
+ '--channel' => $channelId,
|
|
|
+ '--granularity' => 'chapter',
|
|
|
+ '--summary' => 'off'
|
|
|
+ ]);
|
|
|
+ }
|
|
|
+ }
|
|
|
} catch (\Exception $e) {
|
|
|
- $this->error("Failed to process directory {$subdir}: {$e->getMessage()}");
|
|
|
+ $this->error("Failed to process directory {$store}: {$e->getMessage()}");
|
|
|
+ Log::error("Failed to process directory", [
|
|
|
+ 'dir' => $store,
|
|
|
+ 'message' => $e->getMessage(),
|
|
|
+ 'file' => $e->getFile(),
|
|
|
+ 'line' => $e->getLine(),
|
|
|
+ 'trace' => $e->getTraceAsString(),
|
|
|
+ ]);
|
|
|
$totalErrors++;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
$this->info("Corpus update completed. Total processed: {$totalProcessed}, Total errors: {$totalErrors}");
|
|
|
|
|
|
+
|
|
|
+
|
|
|
return $totalErrors > 0 ? self::FAILURE : self::SUCCESS;
|
|
|
}
|
|
|
|
|
|
@@ -152,6 +178,13 @@ class UpdateCorpus extends Command
|
|
|
|
|
|
$this->info("Found {$channels->count()} channel(s) for source ID: {$sourceId}");
|
|
|
|
|
|
+ $glossaryFile = $directoryPath . DIRECTORY_SEPARATOR . 'glossary.csv';
|
|
|
+
|
|
|
+ if (file_exists($glossaryFile)) {
|
|
|
+ $status = $this->processGlossary($glossaryFile, $channels);
|
|
|
+ $this->line('glossary load');
|
|
|
+ }
|
|
|
+
|
|
|
// Scan subdirectories of the current directory for JSONL files
|
|
|
$childDirectories = $this->getSubdirectories($directoryPath);
|
|
|
|
|
|
@@ -166,10 +199,89 @@ class UpdateCorpus extends Command
|
|
|
$stats['errors'] += $fileStats['errors'];
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ $stats['channels'] = array_map(fn($item) => $item['uid'], $channels->toArray());
|
|
|
return $stats;
|
|
|
}
|
|
|
/**
 * Import a glossary CSV file and save each row as a term for every channel.
 *
 * The first CSV record is the header row. The `pali_word` and `meaning`
 * columns are required; `tag`, `redirect`, `meaning2` and `note` are
 * optional and are stored as null when absent or empty. Each row is saved
 * once per channel through the term service, inside its own transaction,
 * so a failure for one channel does not stop the remaining ones.
 *
 * @param string $filePath Path of the glossary CSV file.
 * @param \Illuminate\Database\Eloquent\Collection $channels Channels that receive the terms.
 * @return array Counters with the keys 'processed' and 'errors'.
 */
protected function processGlossary(string $filePath, $channels): array
{
    $stats = [
        'processed' => 0,
        'errors' => 0,
    ];

    $handle = fopen($filePath, 'r');

    if ($handle === false) {
        $this->error("Failed to open file: {$filePath}");
        return $stats;
    }

    // try/finally guarantees the handle is released on every exit path,
    // including exceptions thrown while resolving the editor id.
    try {
        $robotUid = config('mint.admin.robot_uuid');

        if (!$robotUid) {
            // The message names the configuration key that is actually read above.
            $this->error('robot_uuid not configured in mint.admin.robot_uuid');
            return $stats;
        }

        // Read the header row. The escape character is passed explicitly
        // (the historical default) because omitting it is deprecated in PHP 8.4.
        $headers = fgetcsv($handle, escape: '\\');

        if ($headers === false) {
            $this->error("Failed to read CSV headers from: {$filePath}");
            return $stats;
        }

        // Spreadsheet exports often start with a UTF-8 BOM, which would
        // otherwise become part of the first column name.
        if (is_string($headers[0] ?? null) && str_starts_with($headers[0], "\xEF\xBB\xBF")) {
            $headers[0] = substr($headers[0], 3);
        }

        // Fail once, up front, instead of once per row and channel.
        $missingColumns = array_diff(['pali_word', 'meaning'], $headers);

        if ($missingColumns !== []) {
            $this->error('Missing required column(s) [' . implode(', ', $missingColumns) . "] in file: {$filePath}");
            $stats['errors']++;
            return $stats;
        }

        // The editor is the same for every row, so resolve it a single time.
        $editorId = UserApi::getIdByUuid($robotUid);

        // Optional text columns: absent or empty becomes null, while a
        // literal '0' is kept as a real value.
        $blankToNull = static fn ($value) => ($value === null || $value === '') ? null : $value;

        $lineNumber = 1; // the header occupies the first line of the file
        $rowCount = 0;   // number of data rows seen

        while (($row = fgetcsv($handle, escape: '\\')) !== false) {
            $lineNumber++;

            // fgetcsv() reports a blank line as a single null field; it is not an error.
            if ($row === [null]) {
                continue;
            }

            $rowCount++;

            if (count($row) !== count($headers)) {
                $this->error("Column count mismatch at line {$lineNumber} in file: {$filePath}");
                $stats['errors']++;
                continue;
            }

            $data = array_combine($headers, $row);

            foreach ($channels as $channel) {
                try {
                    $saveData = [
                        'word' => $data['pali_word'],
                        'tag' => $data['tag'] ?? null,
                        'channel_id' => $channel->uid,
                        'meaning' => $data['meaning'],
                        'redirect' => $data['redirect'] ?? null,
                        'other_meaning' => $blankToNull($data['meaning2'] ?? null),
                        'note' => $blankToNull($data['note'] ?? null),
                        'editor_id' => $editorId,
                    ];

                    DB::transaction(function () use ($saveData) {
                        $this->termService->updateOrCreateByWord($saveData);
                    });

                    $stats['processed']++;
                } catch (\Exception $e) {
                    $this->error("Failed to save glossary for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
                    $stats['errors']++;
                }
            }
        }

        $this->line("glossary {$rowCount} lines processed");

        return $stats;
    } finally {
        fclose($handle);
    }
}
|
|
|
/**
|
|
|
* Process a single JSONL file and save records for each channel.
|
|
|
*
|
|
|
@@ -220,11 +332,12 @@ class UpdateCorpus extends Command
|
|
|
// Save for each channel
|
|
|
foreach ($channels as $channel) {
|
|
|
try {
|
|
|
+ [$book, $para, $start, $end] = explode('-', $data['id']);
|
|
|
$saveData = [
|
|
|
- 'book_id' => $data['book'],
|
|
|
- 'paragraph' => $data['paragraph'],
|
|
|
- 'word_start' => $data['start'],
|
|
|
- 'word_end' => $data['end'],
|
|
|
+ 'book_id' => $book,
|
|
|
+ 'paragraph' => $para,
|
|
|
+ 'word_start' => $start,
|
|
|
+ 'word_end' => $end,
|
|
|
'content' => $data['content'],
|
|
|
'channel_uid' => $channel->uid,
|
|
|
'editor_uid' => $robotUid,
|