visuddhinanda 1 Minggu lalu
induk
melakukan
58ebae49c4
1 mengubah file dengan 245 tambahan dan 0 penghapusan
  1. 245 0
      api-v13/app/Console/Commands/UpdateCorpus.php

+ 245 - 0
api-v13/app/Console/Commands/UpdateCorpus.php

@@ -0,0 +1,245 @@
+<?php
+
+namespace App\Console\Commands;
+
+use App\Services\SentenceService;
+use Illuminate\Console\Attributes\Description;
+use Illuminate\Console\Attributes\Signature;
+use Illuminate\Console\Command;
+use Illuminate\Support\Facades\DB;
+use App\Models\Channel;
+
+#[Signature('app:update-corpus')]
+#[Description('Update corpus from JSONL files in corpus directory')]
+class UpdateCorpus extends Command
+{
+    /**
+     * Keys every JSONL record must contain before it is passed to the sentence service.
+     *
+     * @var list<string>
+     */
+    private const REQUIRED_FIELDS = ['book', 'paragraph', 'start', 'end', 'content'];
+
+    /**
+     * Service used to persist each record read from the corpus files.
+     *
+     * @var SentenceService
+     */
+    protected SentenceService $sentenceService;
+
+    /**
+     * Create a new command instance.
+     *
+     * @param SentenceService $sentenceService Service whose save() method receives every record.
+     */
+    public function __construct(SentenceService $sentenceService)
+    {
+        parent::__construct();
+        $this->sentenceService = $sentenceService;
+    }
+
+    /**
+     * Execute the console command.
+     *
+     * Reads the corpus base path from the `mint.path.corpus` config key, processes
+     * each of its immediate subdirectories and prints a running summary.
+     *
+     * @return int self::SUCCESS when no error was counted, self::FAILURE otherwise.
+     */
+    public function handle(): int
+    {
+        $this->info('Starting corpus update process...');
+
+        // Get the corpus base path from config
+        $corpusBasePath = config('mint.path.corpus');
+
+        // Reject a missing or non-string setting before it reaches the filesystem functions.
+        if (!is_string($corpusBasePath) || $corpusBasePath === '') {
+            $this->error('Corpus path not configured in mint.path.corpus');
+            return self::FAILURE;
+        }
+
+        if (!is_dir($corpusBasePath)) {
+            $this->error("Corpus directory not found: {$corpusBasePath}");
+            return self::FAILURE;
+        }
+
+        // Scan subdirectories of the corpus path
+        try {
+            $subdirectories = $this->getSubdirectories($corpusBasePath);
+        } catch (\Exception $e) {
+            $this->error("Failed to scan corpus directory {$corpusBasePath}: {$e->getMessage()}");
+            return self::FAILURE;
+        }
+
+        if (empty($subdirectories)) {
+            $this->warn('No subdirectories found in corpus path.');
+            return self::SUCCESS;
+        }
+
+        $this->info("Found " . count($subdirectories) . " subdirectories to process.");
+
+        $totalProcessed = 0;
+        $totalErrors = 0;
+
+        foreach ($subdirectories as $subdir) {
+            $this->info("Processing directory: {$subdir}");
+
+            try {
+                $stats = $this->processCorpusDirectory($subdir);
+                $totalProcessed += $stats['processed'];
+                $totalErrors += $stats['errors'];
+                $this->info("Directory processed: {$stats['processed']} records saved, {$stats['errors']} errors");
+            } catch (\Exception $e) {
+                // A failing directory counts as one error; the remaining directories are still processed.
+                $this->error("Failed to process directory {$subdir}: {$e->getMessage()}");
+                $totalErrors++;
+            }
+        }
+
+        $this->info("Corpus update completed. Total processed: {$totalProcessed}, Total errors: {$totalErrors}");
+
+        return $totalErrors > 0 ? self::FAILURE : self::SUCCESS;
+    }
+
+    /**
+     * Get all immediate subdirectories of a given directory.
+     *
+     * @param string $path Directory to scan.
+     * @return list<string> Full paths ($path joined with each entry name) of the entries that are directories.
+     * @throws \RuntimeException When the directory listing cannot be read.
+     */
+    protected function getSubdirectories(string $path): array
+    {
+        $items = scandir($path);
+
+        // scandir() returns false on failure; iterating that value would hide the real problem.
+        if ($items === false) {
+            throw new \RuntimeException("Unable to read directory: {$path}");
+        }
+
+        $directories = [];
+
+        foreach ($items as $item) {
+            if ($item === '.' || $item === '..') {
+                continue;
+            }
+
+            $fullPath = $path . DIRECTORY_SEPARATOR . $item;
+            if (is_dir($fullPath)) {
+                $directories[] = $fullPath;
+            }
+        }
+
+        return $directories;
+    }
+
+    /**
+     * Process a single corpus directory.
+     *
+     * Reads `meta.json` in the directory, looks up every Channel whose `source_id`
+     * equals the `id` found there, then processes each `*.jsonl` file located in the
+     * directory's immediate subdirectories for those channels.
+     *
+     * A directory without `meta.json`, or without matching channels, is skipped with a
+     * warning. An unreadable or invalid `meta.json` is counted as one error so the
+     * command does not report success for it.
+     *
+     * @param string $directoryPath
+     * @return array{processed: int, errors: int}
+     * @throws \Exception
+     */
+    protected function processCorpusDirectory(string $directoryPath): array
+    {
+        $stats = [
+            'processed' => 0,
+            'errors' => 0,
+        ];
+
+        // Read meta.json file
+        $metaFile = $directoryPath . DIRECTORY_SEPARATOR . 'meta.json';
+
+        if (!file_exists($metaFile)) {
+            $this->warn("meta.json not found in directory: {$directoryPath}");
+            return $stats;
+        }
+
+        $rawMeta = file_get_contents($metaFile);
+
+        if ($rawMeta === false) {
+            $this->error("Failed to read meta.json in {$directoryPath}");
+            $stats['errors']++;
+            return $stats;
+        }
+
+        $metaData = json_decode($rawMeta, true);
+
+        // is_array() also rejects malformed JSON (null) and scalar documents.
+        if (!is_array($metaData) || !isset($metaData['id'])) {
+            $this->error("Invalid meta.json: missing 'id' field in {$directoryPath}");
+            $stats['errors']++;
+            return $stats;
+        }
+
+        $sourceId = $metaData['id'];
+        $this->info("Processing {$directoryPath} source ID: {$sourceId}");
+
+        // Find all channel records with matching source_id
+        $channels = Channel::where('source_id', $sourceId)->get();
+
+        if ($channels->isEmpty()) {
+            $this->warn("No channels found with source_id: {$sourceId}");
+            return $stats;
+        }
+
+        $this->info("Found {$channels->count()} channel(s) for source ID: {$sourceId}");
+
+        // Scan subdirectories of the current directory for JSONL files
+        $childDirectories = $this->getSubdirectories($directoryPath);
+
+        foreach ($childDirectories as $childDir) {
+            $this->info("Scanning directory for JSONL files: {$childDir}");
+
+            // glob() returns false on error; treat that the same as "no files".
+            $jsonlFiles = glob($childDir . DIRECTORY_SEPARATOR . '*.jsonl') ?: [];
+
+            foreach ($jsonlFiles as $jsonlFile) {
+                $this->line("Processing file: {$jsonlFile}");
+                $fileStats = $this->processJsonlFile($jsonlFile, $channels);
+                $stats['processed'] += $fileStats['processed'];
+                $stats['errors'] += $fileStats['errors'];
+            }
+        }
+
+        return $stats;
+    }
+
+    /**
+     * Process a single JSONL file and save records for each channel.
+     *
+     * Each non-blank line is decoded as JSON and must be an object containing the
+     * keys listed in self::REQUIRED_FIELDS. A valid record is passed to
+     * SentenceService::save() once per channel, each call wrapped in its own database
+     * transaction, with `editor_uid` set to the `mint.admin.robot_uuid` config value.
+     *
+     * Setup failures (missing robot UUID, unopenable file) and malformed lines each
+     * count as one error; a failed save counts as one error per channel.
+     *
+     * @param string $filePath
+     * @param iterable $channels Channel models (an Eloquent collection in this command); each must expose `uid`.
+     * @return array{processed: int, errors: int}
+     */
+    protected function processJsonlFile(string $filePath, iterable $channels): array
+    {
+        $stats = [
+            'processed' => 0,
+            'errors' => 0,
+        ];
+
+        // Check configuration first so the file is not opened when nothing can be saved.
+        $robotUid = config('mint.admin.robot_uuid');
+
+        if (!$robotUid) {
+            $this->error('robot_uuid not configured in mint.admin.robot_uuid');
+            $stats['errors']++;
+            return $stats;
+        }
+
+        $handle = fopen($filePath, 'r');
+
+        if ($handle === false) {
+            $this->error("Failed to open file: {$filePath}");
+            $stats['errors']++;
+            return $stats;
+        }
+
+        $lineNumber = 0;
+
+        try {
+            while (($line = fgets($handle)) !== false) {
+                $lineNumber++;
+                $line = trim($line);
+
+                // Strict comparison: empty() would also discard a line consisting of "0".
+                if ($line === '') {
+                    continue;
+                }
+
+                // Parse JSON line
+                $data = json_decode($line, true);
+
+                if ($data === null) {
+                    $this->error("Failed to parse JSON at line {$lineNumber} in file: {$filePath}");
+                    $stats['errors']++;
+                    continue;
+                }
+
+                if (!is_array($data)) {
+                    $this->error("Expected a JSON object at line {$lineNumber} in file: {$filePath}");
+                    $stats['errors']++;
+                    continue;
+                }
+
+                // Validate once per line instead of failing once per channel inside the save loop.
+                // array_keys() is used so that a key present with a null value still counts as present.
+                $missing = array_diff(self::REQUIRED_FIELDS, array_keys($data));
+
+                if ($missing !== []) {
+                    $this->error(
+                        "Missing field(s) " . implode(', ', $missing)
+                        . " at line {$lineNumber} in file: {$filePath}"
+                    );
+                    $stats['errors']++;
+                    continue;
+                }
+
+                // Channel-independent part of the payload, built once per line.
+                $record = [
+                    'book_id' => $data['book'],
+                    'paragraph' => $data['paragraph'],
+                    'word_start' => $data['start'],
+                    'word_end' => $data['end'],
+                    'content' => $data['content'],
+                    'editor_uid' => $robotUid,
+                ];
+
+                // Save for each channel
+                foreach ($channels as $channel) {
+                    try {
+                        $saveData = [...$record, 'channel_uid' => $channel->uid];
+
+                        DB::transaction(function () use ($saveData) {
+                            $this->sentenceService->save($saveData);
+                        });
+
+                        $stats['processed']++;
+                    } catch (\Exception $e) {
+                        $this->error("Failed to save record for channel {$channel->uid} at line {$lineNumber}: {$e->getMessage()}");
+                        $stats['errors']++;
+                    }
+                }
+            }
+        } finally {
+            // Release the handle even if something other than \Exception escapes the loop.
+            fclose($handle);
+        }
+
+        $this->line("{$lineNumber} lines read, {$stats['processed']} records saved, {$stats['errors']} errors");
+
+        return $stats;
+    }
+}