visuddhinanda před 4 měsíci
rodič
revize
b51003a9cb

+ 214 - 0
api-v8/app/Services/SummaryService.php

@@ -0,0 +1,214 @@
+<?php
+
+namespace App\Services;
+
+use Illuminate\Support\Facades\Http;
+use App\Services\AIModelService;
+use Illuminate\Support\Facades\Cache;
+use Illuminate\Support\Facades\Log;
+
+/**
+ * Generates summaries of long texts through the AI proxy.
+ *
+ * Long input is split into paragraph-aligned chunks, each chunk is summarized
+ * separately, and the partial summaries are then merged by one more model call.
+ * Non-empty results are cached under a key derived from the input text.
+ */
+class SummaryService
+{
+    /** Identifier (`uid`) of the first system model registered for the 'summarize' task. */
+    protected string $modelId;
+
+    /** Proxy endpoint that every request is posted to. */
+    protected string $apiUrl = '';
+
+    /** Default model name. Not read anywhere inside this class. */
+    protected string $apiModel = 'deepseek-v3';
+
+    /** Maximum number of request attempts made by callOpenAI(). */
+    protected int $maxRetries = 3;
+
+    /** Maximum number of characters per chunk; tune it to the model's context window. */
+    protected int $chunkSize = 20000;
+
+    /** System prompt sent with every request. */
+    private string $system_prompt = '你是一个摘要写作助手.请根据用户的输入文本生成中文的摘要,直接输出摘要,无需解释说明。';
+
+    /**
+     * Create the service: pick the summarization model and build the proxy URL.
+     *
+     * @param  AIModelService  $aiModels  Provider of the system models available for a task
+     *
+     * @throws \RuntimeException When no system model is registered for the 'summarize' task
+     */
+    public function __construct(AIModelService $aiModels)
+    {
+        $models = $aiModels->getSysModels('summarize');
+        $modelId = $models[0]['uid'] ?? null;
+        if ($modelId === null || $modelId === '') {
+            // Fail with an explicit message instead of an "undefined array key" error.
+            throw new \RuntimeException("No system AI model is configured for the 'summarize' task.");
+        }
+
+        $this->modelId = $modelId;
+        $this->apiUrl = config('mint.ai.proxy') . '/api/openai';
+    }
+
+    /**
+     * Summarize the given text, with caching and optional forced refresh.
+     *
+     * The text is split into chunks of at most $this->chunkSize characters.
+     * Each chunk is summarized on its own; when more than one partial summary
+     * exists they are joined and summarized once more to get the final result.
+     *
+     * Caching:
+     * - The cache key is 'summary_' followed by the md5 of the text
+     *   (it does not depend on $maxTokens).
+     * - Non-empty summaries are cached for one week.
+     * - Empty results (failed requests) are never cached, and an empty cached
+     *   value is treated as a miss, so a transient failure does not stick.
+     * - Pass $forceRefresh = true to ignore any cached value.
+     *
+     * @param  string  $text          Input Markdown text
+     * @param  int     $maxTokens     Maximum number of tokens allowed per request
+     * @param  bool    $forceRefresh  Whether to bypass the cache and regenerate
+     * @return string                 Final summary, or '' when nothing could be generated
+     */
+    public function summarize(string $text, int $maxTokens = 500, bool $forceRefresh = false): string
+    {
+        // 1. Compute the cache key.
+        $cacheKey = 'summary_' . md5($text);
+
+        // 2. Serve from cache when possible (single lookup, no has()/get() race).
+        if (!$forceRefresh) {
+            $cached = Cache::get($cacheKey);
+            if (is_string($cached) && $cached !== '') {
+                Log::debug("SummaryService cache hit", ['key' => $cacheKey]);
+                return $cached;
+            }
+        }
+
+        Log::debug("SummaryService generating new summary", [
+            'key' => $cacheKey,
+            'forceRefresh' => $forceRefresh
+        ]);
+
+        // 3. Summarize every chunk, keeping only non-empty answers.
+        $partialSummaries = [];
+        foreach ($this->splitText($text, $this->chunkSize) as $chunk) {
+            $summary = $this->callOpenAI($chunk, $maxTokens);
+            if ($summary !== '') {
+                $partialSummaries[] = $summary;
+            }
+        }
+
+        if (count($partialSummaries) === 0) {
+            Log::warning("SummaryService no partial summaries", ['key' => $cacheKey]);
+            return '';
+        }
+
+        // A single partial summary is already the final one; otherwise merge them.
+        $finalSummary = count($partialSummaries) === 1
+            ? $partialSummaries[0]
+            : $this->callOpenAI(implode("\n\n", $partialSummaries), $maxTokens);
+
+        if ($finalSummary === '') {
+            // The merge request failed: do not poison the cache with an empty value.
+            Log::warning("SummaryService final summary is empty, not caching", ['key' => $cacheKey]);
+            return '';
+        }
+
+        // 4. Store in cache for one week.
+        Cache::put($cacheKey, $finalSummary, now()->addWeek());
+
+        Log::debug("SummaryService cached new summary", [
+            'key' => $cacheKey,
+            'summary' => mb_substr($finalSummary, 0, 10, 'UTF-8')
+        ]);
+
+        return $finalSummary;
+    }
+
+    /**
+     * Split text into chunks along paragraph boundaries.
+     *
+     * Paragraphs are separated by an empty line. Consecutive paragraphs are
+     * packed into one chunk (joined by a blank line) as long as the chunk stays
+     * within $chunkSize characters. A paragraph longer than $chunkSize is cut
+     * into pieces of $chunkSize characters. Chunks are returned in the same
+     * order as the text, and no empty chunk is ever produced.
+     *
+     * @param  string  $text       Input Markdown text
+     * @param  int     $chunkSize  Maximum number of characters per chunk (values below 1 are treated as 1)
+     * @return array               List of text chunks
+     */
+    protected function splitText(string $text, int $chunkSize): array
+    {
+        // A non-positive size would never advance the cutting loop below.
+        $chunkSize = max(1, $chunkSize);
+
+        $paragraphs = preg_split("/\r?\n\r?\n/", $text) ?: [];
+        $chunks = [];
+        $currentChunk = '';
+
+        foreach ($paragraphs as $para) {
+            $para = trim($para);
+            if ($para === '') {
+                continue;
+            }
+
+            $paraLength = mb_strlen($para);
+
+            // Oversized paragraph: cut it into fixed-size pieces.
+            if ($paraLength > $chunkSize) {
+                // Flush the pending chunk first so the output keeps the text order.
+                if ($currentChunk !== '') {
+                    $chunks[] = $currentChunk;
+                    $currentChunk = '';
+                }
+                for ($offset = 0; $offset < $paraLength; $offset += $chunkSize) {
+                    $chunks[] = mb_substr($para, $offset, $chunkSize);
+                }
+                continue;
+            }
+
+            if ($currentChunk === '') {
+                // Nothing pending: start a new chunk (never emit an empty one).
+                $currentChunk = $para;
+            } elseif (mb_strlen($currentChunk) + $paraLength + 2 > $chunkSize) {
+                // +2 accounts for the blank line that would join the paragraphs.
+                $chunks[] = $currentChunk;
+                $currentChunk = $para;
+            } else {
+                $currentChunk .= "\n\n" . $para;
+            }
+        }
+
+        if ($currentChunk !== '') {
+            $chunks[] = $currentChunk;
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Ask the model (through the AI proxy) to summarize one piece of text.
+     *
+     * The request is attempted up to $this->maxRetries times. Connection
+     * errors and HTTP 429/500/502/503/504 responses are retried, waiting
+     * 1 second before the second attempt and ten times longer before each
+     * following one. Any other HTTP error aborts immediately. Every failure
+     * path is logged and yields an empty string.
+     *
+     * @param  string  $text       Input text
+     * @param  int     $maxTokens  Maximum number of tokens allowed for the answer
+     * @return string              Summary returned by the model, or '' on failure
+     */
+    protected function callOpenAI(string $text, int $maxTokens = 200): string
+    {
+        $payload = [
+            'model' => $this->modelId,
+            'messages' => [
+                [
+                    'role' => 'system',
+                    'content' => $this->system_prompt
+                ],
+                [
+                    'role' => 'user',
+                    'content' => $text
+                ],
+            ],
+            'max_tokens' => $maxTokens,
+        ];
+        $retryableStatuses = [429, 500, 502, 503, 504];
+        $delay = 1;
+        $reason = 'no attempt was made';
+
+        for ($attempt = 1; $attempt <= $this->maxRetries; $attempt++) {
+            try {
+                // NOTE(review): the bearer token is empty; presumably the proxy
+                // resolves credentials from model_id - confirm.
+                $response = Http::timeout(100)
+                    ->withHeaders([
+                        'Authorization' => 'Bearer ',
+                        'Content-Type' => 'application/json',
+                    ])->post($this->apiUrl, [
+                        'model_id' => $this->modelId,
+                        'payload' => $payload
+                    ]);
+
+                if ($response->successful()) {
+                    $data = $response->json();
+                    $content = $data['choices'][0]['message']['content'] ?? '';
+                    // Guard the string return type against unexpected payload shapes.
+                    return is_string($content) ? $content : '';
+                }
+
+                $status = $response->status();
+                if (!in_array($status, $retryableStatuses, true)) {
+                    Log::warning("SummaryService request rejected", ['status' => $status]);
+                    return '';
+                }
+
+                $reason = "Temporary server error: " . $status;
+            } catch (\Exception $e) {
+                // Typically a connection failure or timeout: treat it as retryable.
+                $reason = $e->getMessage();
+            }
+
+            if ($attempt < $this->maxRetries) {
+                sleep($delay);
+                $delay *= 10;
+            }
+        }
+
+        Log::warning("SummaryService request failed after retries", [
+            'attempts' => $this->maxRetries,
+            'reason' => $reason,
+        ]);
+
+        return '';
+    }
+}

+ 60 - 0
dashboard-v4/dashboard/src/components/chat/SearchResults.tsx

@@ -0,0 +1,60 @@
+import React, { useState } from "react";
+import { List, Pagination, Typography, Card, Space } from "antd";
+import { CalendarOutlined, FileTextOutlined } from "@ant-design/icons";
+import { ElasticsearchResponse, WikipaliDocument } from "../../types/search";
+
+// Alias for antd's Typography.Paragraph, used to render the preview text of each hit.
+const { Paragraph } = Typography;
+
+/** Props accepted by the SearchResults component. */
+interface SearchResultsProps {
+  /** Search response; `hits.hits` feeds the list and `hits.total.value` the pager total. */
+  data: ElasticsearchResponse<WikipaliDocument>;
+  /** Called by the pager both when the page changes and when the page size changes. */
+  onPageChange?: (page: number, pageSize: number) => void;
+  /** Number of items per page (the component falls back to 20). */
+  pageSize?: number;
+}
+
+/**
+ * Paginated list of search hits.
+ *
+ * Each hit shows its title, a two-row expandable preview of the content
+ * and the document path.
+ */
+const SearchResults: React.FC<SearchResultsProps> = (props) => {
+  const { data, onPageChange, pageSize = 20 } = props;
+  const { hits } = data;
+
+  return (
+    <div className="search-results">
+      {/* Search result list */}
+      <List
+        size="small"
+        dataSource={hits.hits}
+        pagination={{
+          // The same callback handles page changes and page-size changes.
+          onChange: onPageChange,
+          onShowSizeChange: onPageChange,
+          pageSize,
+          total: hits.total.value,
+          showQuickJumper: true,
+          showTotal: (total, range) =>
+            `第 ${range[0]}-${range[1]} 条,共 ${total} 条`,
+        }}
+        renderItem={(hit) => (
+          <List.Item key={hit._id}>
+            <List.Item.Meta
+              title={hit._source.title.text}
+              description={
+                <>
+                  <Paragraph
+                    type="secondary"
+                    ellipsis={{ rows: 2, expandable: true }}
+                  >
+                    {hit._source.content.text}
+                  </Paragraph>
+                  <div>{hit._source.path}</div>
+                </>
+              }
+            />
+          </List.Item>
+        )}
+      />
+    </div>
+  );
+};
+
+export default SearchResults;

+ 53 - 0
dashboard-v4/dashboard/src/components/chat/ToolMessage.tsx

@@ -0,0 +1,53 @@
+import { Collapse } from "antd";
+import { SessionInfo } from "../../types/chat";
+import { getArgs, WikipaliSearchResponse } from "../../types/search";
+import SearchResults from "./SearchResults";
+
+// Alias for antd's Collapse.Panel; one panel is rendered per tool call.
+const { Panel } = Collapse;
+
+/** Props of the ToolMessage component. */
+interface IWidget {
+  /** Chat session whose messages are scanned for tool calls and their replies. */
+  session?: SessionInfo;
+}
+/**
+ * Renders the tool calls of a chat session together with their results.
+ *
+ * Every message that carries `tool_calls` becomes one Collapse; every tool
+ * call inside it becomes one Panel keyed by the tool call id. The reply is the
+ * session message whose `tool_call_id` matches that id. When there is no reply,
+ * the reply has no content, or the parsed reply has no `hits`, the panel body
+ * shows the "no result" text instead of a result list.
+ */
+const ToolMessage = ({ session }: IWidget) => {
+  console.debug("ai chat render", session);
+  // Messages in which the LLM requested tool calls (there may be several).
+  const toolCallMessages = session?.messages.filter((msg) => msg.tool_calls);
+  return (
+    <div key={"tool_calls"} className="tool-result">
+      {toolCallMessages?.map((msg, index) => {
+        return (
+          <Collapse key={index} style={{ borderRadius: 12 }}>
+            {msg.tool_calls?.map((tool) => {
+              const header = `${tool.function.name} ${tool.function.arguments}`;
+              // Find the message that answers this tool call.
+              const reply = session?.messages.find(
+                (candidate) => candidate.tool_call_id === tool.id
+              );
+              const searchResult = reply?.content
+                ? getArgs<WikipaliSearchResponse>(reply.content)
+                : undefined;
+
+              // Collapse expects Panel children with unique keys, so the
+              // "no result" case is a Panel too rather than a bare fragment.
+              if (!searchResult?.hits) {
+                return (
+                  <Panel header={header} key={tool.id}>
+                    没有结果
+                  </Panel>
+                );
+              }
+
+              return (
+                <Panel
+                  header={header}
+                  key={tool.id}
+                  extra={<>{`${searchResult.hits.total.value}个结果`}</>}
+                >
+                  <div className="tool-content">
+                    <SearchResults data={searchResult} />
+                  </div>
+                </Panel>
+              );
+            })}
+          </Collapse>
+        );
+      })}
+    </div>
+  );
+};
+
+export default ToolMessage;

+ 339 - 0
dashboard-v4/dashboard/src/components/corpus/NissayaAligner.tsx

@@ -0,0 +1,339 @@
+import React, { useEffect, useState } from "react";
+import {
+  Steps,
+  Upload,
+  Button,
+  Table,
+  Input,
+  message,
+  Typography,
+  Space,
+} from "antd";
+import type { UploadChangeParam, UploadFile } from "antd/es/upload/interface";
+import {
+  InboxOutlined,
+  CopyOutlined,
+  UpSquareOutlined,
+  DownSquareOutlined,
+} from "@ant-design/icons";
+import { post } from "../../request";
+import { ISentenceDiffRequest, ISentenceDiffResponse } from "../api/Corpus";
+
+// Aliases for the antd sub-components used by the wizard below.
+const { Step } = Steps;
+const { Dragger } = Upload;
+const { TextArea } = Input;
+const { Title } = Typography;
+
+/** One row of the uploaded word-by-word (nissaya) table. */
+interface WordData {
+  /** 1-based position of the row among the data rows of the uploaded file. */
+  id: number;
+  /** Cell of the column whose header contains "pali". */
+  pali: string;
+  /** Cell of the column whose header contains "nissaya". */
+  nissaya: string;
+  /** Cell of the column whose header contains "note". */
+  note?: string;
+}
+
+/** One original sentence fetched from the server. */
+interface SentenceData {
+  /** Built as `book_id-paragraph-word_start-word_end`. */
+  id: string;
+  /** Sentence text (empty string when the server returned none). */
+  content: string;
+}
+
+/** One line of the alignment result pasted by the user. */
+interface AlignResult {
+  /** Sentence id; matched against SentenceData.id in the preview. */
+  id: string;
+  /** Comma separated list of WordData ids assigned to the sentence. */
+  words: string;
+}
+
+/** Props of the NissayaAligner component. */
+interface IWidget {
+  /** Ids of the sentences to load; nothing is fetched when omitted. */
+  sentencesId?: string[];
+}
+
+/**
+ * Split one line of delimited text into trimmed cells.
+ *
+ * Double-quoted fields may contain the delimiter, and a doubled quote inside
+ * a quoted field stands for one literal quote. Quote characters themselves
+ * are not kept. Fields spanning several lines are not supported because the
+ * caller splits the text into lines first.
+ */
+const splitDelimitedLine = (line: string, delimiter: string): string[] => {
+  const cells: string[] = [];
+  let cell = "";
+  let inQuotes = false;
+  for (let i = 0; i < line.length; i++) {
+    const ch = line[i];
+    if (ch === '"') {
+      if (inQuotes && line[i + 1] === '"') {
+        cell += '"';
+        i++;
+      } else {
+        inQuotes = !inQuotes;
+      }
+    } else if (ch === delimiter && !inQuotes) {
+      cells.push(cell.trim());
+      cell = "";
+    } else {
+      cell += ch;
+    }
+  }
+  cells.push(cell.trim());
+  return cells;
+};
+
+/** Quote a value for CSV output, doubling embedded double quotes. */
+const csvQuote = (value: string): string => `"${value.replace(/"/g, '""')}"`;
+
+/** Turn a `words` value (comma separated string or array) into a clean id list. */
+const splitWordIds = (words: unknown): string[] =>
+  (Array.isArray(words) ? words.map(String) : String(words ?? "").split(","))
+    .map((w) => w.trim())
+    .filter((w) => w !== "");
+
+/**
+ * Four-step wizard that aligns word-by-word (nissaya) data with sentences:
+ * 1. upload a CSV/TSV file, 2. copy the generated LLM prompt,
+ * 3. paste the JSONL answer of the LLM, 4. preview and adjust the alignment.
+ */
+const NissayaAligner = ({ sentencesId }: IWidget) => {
+  const [current, setCurrent] = useState<number>(0);
+  const [csvData, setCsvData] = useState<WordData[]>([]);
+  const [jsonlInput, setJsonlInput] = useState<string>("");
+  const [alignResults, setAlignResults] = useState<AlignResult[]>([]);
+  const [original, setOriginal] = useState<SentenceData[]>([]);
+
+  // Load the original sentences, ordered by book, paragraph and word position.
+  useEffect(() => {
+    if (!sentencesId) {
+      return;
+    }
+    // Ignore the answer when the ids changed or the component unmounted meanwhile.
+    let cancelled = false;
+    post<ISentenceDiffRequest, ISentenceDiffResponse>(`/v2/sent-in-channel`, {
+      sentences: sentencesId,
+      channels: ["_System_Pali_VRI_"],
+    })
+      .then((json) => {
+        if (cancelled || !json.ok) {
+          return;
+        }
+        const rows = [...json.data.rows].sort((a, b) => {
+          if (a.book_id !== b.book_id) {
+            return a.book_id - b.book_id;
+          }
+          if (a.paragraph !== b.paragraph) {
+            return a.paragraph - b.paragraph;
+          }
+          return a.word_start - b.word_start;
+        });
+        setOriginal(
+          rows.map((item) => ({
+            id: `${item.book_id}-${item.paragraph}-${item.word_start}-${item.word_end}`,
+            content: item.content ?? "",
+          }))
+        );
+      })
+      .catch((err) => {
+        console.error("Failed to load sentences:", err);
+      });
+    return () => {
+      cancelled = true;
+    };
+  }, [sentencesId]);
+
+  /** Read the most recently selected file as UTF-8 text and parse it. */
+  const handleUpload = (info: UploadChangeParam<UploadFile<any>>) => {
+    console.log("Upload change event:", info);
+    // fileList accumulates every selection, so its first entry is the OLDEST
+    // file. Use the file of this event: its originFileObj, the file itself
+    // (a raw File when beforeUpload returns false), or the newest list entry.
+    const newest = info.fileList?.[info.fileList.length - 1];
+    const file =
+      info.file.originFileObj ??
+      (info.file instanceof Blob ? info.file : undefined) ??
+      newest?.originFileObj;
+    if (!file) {
+      console.error("No valid file found in upload event:", info);
+      message.error("未检测到文件,请重新选择");
+      return;
+    }
+    console.log("Selected file:", info.file.name, file);
+    const reader = new FileReader();
+    reader.onload = (e) => {
+      const text = e.target?.result as string;
+      console.log("✅ File read complete. Length:", text?.length || 0);
+      parseCSV(text);
+    };
+    reader.onerror = (err) => {
+      console.error("❌ File read error:", err);
+      message.error("读取文件失败");
+    };
+    console.log("📖 Start reading file as text...");
+    reader.readAsText(file as Blob, "utf-8");
+  };
+
+  /**
+   * Parse comma or TAB separated text into word rows.
+   * Columns are located by header name ("pali", "nissaya", "note",
+   * case-insensitive substring match); a missing column yields empty cells.
+   */
+  const parseCSV = (text: string) => {
+    console.log(
+      "Parsing CSV... Raw preview (first 300 chars):",
+      text.slice(0, 300)
+    );
+    const lines = text.trim().split(/\r?\n/);
+    // Detect the delimiter on the header line only, so a TAB inside a data
+    // cell of a comma separated file does not flip the detection.
+    const delimiter = lines[0].includes("\t") ? "\t" : ",";
+    console.log("Detected delimiter:", delimiter === "\t" ? "TAB" : "COMMA");
+    const headers = splitDelimitedLine(lines[0], delimiter);
+    console.log("Detected headers:", headers);
+
+    // Resolve the column positions once instead of once per row.
+    const columnOf = (name: string) =>
+      headers.findIndex((h) => h.toLowerCase().includes(name));
+    const paliCol = columnOf("pali");
+    const nissayaCol = columnOf("nissaya");
+    const noteCol = columnOf("note");
+    const cellAt = (cols: string[], col: number) =>
+      (col >= 0 ? cols[col] : undefined) || "";
+
+    const data: WordData[] = lines.slice(1).map((line, i) => {
+      const cols = splitDelimitedLine(line, delimiter);
+      return {
+        id: i + 1,
+        pali: cellAt(cols, paliCol),
+        nissaya: cellAt(cols, nissayaCol),
+        note: cellAt(cols, noteCol),
+      };
+    });
+
+    console.log("✅ Parsed CSV rows count:", data.length);
+    console.table(data.slice(0, 5));
+    setCsvData(data);
+    message.success(`CSV 文件解析成功,共 ${data.length} 行`);
+  };
+
+  /**
+   * Build the LLM prompt: sentences as JSONL, words as CSV, then the task
+   * description. Values are serialized with JSON.stringify / CSV quoting so
+   * quotes, backslashes and line breaks in the data cannot corrupt the format.
+   */
+  const generatePrompt = (): string => {
+    const sentenceJsonl = original
+      .map((s) => JSON.stringify({ id: s.id, content: s.content }))
+      .join("\n");
+    const csvText = ["id,pali,nissaya,note"]
+      .concat(
+        csvData.map(
+          (r) =>
+            `${r.id},${csvQuote(r.pali)},${csvQuote(r.nissaya)},${csvQuote(
+              r.note || ""
+            )}`
+        )
+      )
+      .join("\n");
+    const prompt =
+      "将逐词解析数据与句子对应,一个句子对多个逐词解析数据。不是每个单词都有逐词解析数据,保持逐词解析数据的顺序不变,不可以有遗漏。对齐结果jsonl格式,每行一个句子,每个句子有三个字段 , id, content,words. 前两个字段与句子数据相同。逐词解析数据放在words字段中 , words字段里的数据为逐词解析数据的id字段,多个单词之间用逗号隔开";
+    return `# 句子数据\n\n\n\`\`\`jsonl\n${sentenceJsonl}\n\`\`\`\n\n# 逐词解析数据\n\n\`\`\`csv\n${csvText}\n\`\`\`\n\n${prompt}`;
+  };
+
+  /**
+   * Parse the pasted JSONL answer and move on to the preview step.
+   * Blank lines and Markdown code-fence lines are skipped; `words` is
+   * normalized to a comma separated string whatever shape the LLM used.
+   */
+  const parseJsonlResults = () => {
+    try {
+      console.log("Parsing JSONL input...");
+      const lines = jsonlInput
+        .split(/\r?\n/)
+        .map((line) => line.trim())
+        .filter((line) => line !== "" && !line.startsWith("```"));
+      if (lines.length === 0) {
+        throw new Error("No JSONL lines found");
+      }
+      const results: AlignResult[] = lines.map((line) => {
+        const row = JSON.parse(line);
+        return {
+          ...row,
+          id: String(row.id),
+          words: splitWordIds(row.words).join(","),
+        };
+      });
+      console.log("Parsed results:", results);
+      setAlignResults(results);
+      message.success("结果解析成功");
+      setCurrent(3);
+    } catch (err) {
+      console.error("❌ JSONL parse error:", err);
+      message.error("JSONL 格式错误");
+    }
+  };
+
+  /**
+   * Move a boundary word of a sentence to its neighbour: "prev" moves the
+   * first word to the end of the previous sentence, "next" moves the last
+   * word to the start of the next one. State is updated immutably through a
+   * functional update, so consecutive calls never work on stale data.
+   */
+  const moveWord = (sentenceIndex: number, direction: "prev" | "next") => {
+    console.log(
+      `Moving word: sentenceIndex=${sentenceIndex}, direction=${direction}`
+    );
+    const targetIndex =
+      direction === "prev" ? sentenceIndex - 1 : sentenceIndex + 1;
+
+    setAlignResults((prev) => {
+      if (
+        sentenceIndex < 0 ||
+        sentenceIndex >= prev.length ||
+        targetIndex < 0 ||
+        targetIndex >= prev.length
+      ) {
+        return prev;
+      }
+
+      const currentWords = splitWordIds(prev[sentenceIndex].words);
+      const movingWord =
+        direction === "prev" ? currentWords.shift() : currentWords.pop();
+      if (!movingWord) {
+        return prev;
+      }
+
+      const targetWords = splitWordIds(prev[targetIndex].words);
+      if (direction === "prev") {
+        targetWords.push(movingWord);
+      } else {
+        targetWords.unshift(movingWord);
+      }
+
+      return prev.map((item, i) => {
+        if (i === sentenceIndex) {
+          return { ...item, words: currentWords.join(",") };
+        }
+        if (i === targetIndex) {
+          return { ...item, words: targetWords.join(",") };
+        }
+        return item;
+      });
+    });
+  };
+
+  // Computed once per render; shown in step 2 and copied by the button.
+  const promptText = generatePrompt();
+
+  /** Copy the prompt and report success only once the copy really happened. */
+  const copyPrompt = () => {
+    const clipboard = navigator.clipboard;
+    if (!clipboard) {
+      // The Clipboard API is unavailable (e.g. on a non-secure origin).
+      message.error("复制失败");
+      return;
+    }
+    clipboard
+      .writeText(promptText)
+      .then(() => message.success("提示词已复制"))
+      .catch((err) => {
+        console.error("Clipboard write failed:", err);
+        message.error("复制失败");
+      });
+  };
+
+  // Index of the uploaded rows by id for the preview lookups.
+  const wordById = new Map<number, WordData>();
+  csvData.forEach((row) => wordById.set(row.id, row));
+
+  const steps = [
+    {
+      title: "上传 CSV",
+      content: (
+        <>
+          <Dragger
+            accept=".csv,.tsv,.txt"
+            showUploadList={false}
+            beforeUpload={() => false}
+            onChange={handleUpload}
+          >
+            <p className="ant-upload-drag-icon">
+              <InboxOutlined />
+            </p>
+            <p className="ant-upload-text">点击或拖拽上传 CSV 文件</p>
+          </Dragger>
+          {csvData.length > 0 && (
+            <Table
+              dataSource={csvData}
+              rowKey="id"
+              pagination={{ pageSize: 50 }}
+              scroll={{ y: 340 }}
+              columns={[
+                { title: "行号", dataIndex: "id", width: 120 },
+                { title: "Pali", dataIndex: "pali", width: 420 },
+                { title: "Nissaya", dataIndex: "nissaya" },
+              ]}
+            />
+          )}
+        </>
+      ),
+    },
+    {
+      title: "生成提示词",
+      content: (
+        <>
+          <Title level={5}>生成的提示词:</Title>
+          <TextArea rows={20} value={promptText} readOnly />
+          <Button icon={<CopyOutlined />} onClick={copyPrompt}>
+            复制提示词
+          </Button>
+        </>
+      ),
+    },
+    {
+      title: "粘贴 LLM 结果",
+      content: (
+        <>
+          <TextArea
+            rows={12}
+            placeholder="粘贴 LLM 输出的 JSONL 结果"
+            value={jsonlInput}
+            onChange={(e) => setJsonlInput(e.target.value)}
+          />
+          <Button type="primary" onClick={parseJsonlResults}>
+            解析结果
+          </Button>
+        </>
+      ),
+    },
+    {
+      title: "对齐预览",
+      content: (
+        <>
+          {alignResults.map((res, idx) => {
+            const sentence = original.find((s) => s.id === res.id);
+            // Ids that match no uploaded row are silently skipped.
+            const wordList = splitWordIds(res.words)
+              .map((id) => wordById.get(Number(id)))
+              .filter((w): w is WordData => w !== undefined);
+
+            return (
+              <div key={res.id} style={{ marginBottom: 24 }}>
+                <Title level={5}>
+                  {res.id} — {sentence?.content}
+                </Title>
+                <Space wrap>
+                  {wordList.map((w, i) => {
+                    const isFirst = i === 0;
+                    const isLast = i === wordList.length - 1;
+                    return (
+                      <Button
+                        key={w.id}
+                        type={isFirst || isLast ? "primary" : "default"}
+                        icon={isFirst ? <UpSquareOutlined /> : undefined}
+                        onClick={() => {
+                          // A single word is both first and last: it moves up.
+                          if (isFirst) moveWord(idx, "prev");
+                          else if (isLast) moveWord(idx, "next");
+                        }}
+                      >
+                        {`${w.pali} (${w.nissaya})`}
+                        {isLast && (
+                          <DownSquareOutlined style={{ marginLeft: 4 }} />
+                        )}
+                      </Button>
+                    );
+                  })}
+                </Space>
+              </div>
+            );
+          })}
+        </>
+      ),
+    },
+  ];
+
+  return (
+    <div style={{ padding: 24 }}>
+      <Steps current={current}>
+        {steps.map((item) => (
+          <Step key={item.title} title={item.title} />
+        ))}
+      </Steps>
+      <div style={{ marginTop: 24 }}>{steps[current].content}</div>
+      <div style={{ marginTop: 24 }}>
+        {current > 0 && (
+          <Button onClick={() => setCurrent(current - 1)}>上一步</Button>
+        )}
+        {current < steps.length - 1 && (
+          <Button
+            type="primary"
+            style={{ marginLeft: 8 }}
+            onClick={() => setCurrent(current + 1)}
+          >
+            下一步
+          </Button>
+        )}
+      </div>
+    </div>
+  );
+};
+
+export default NissayaAligner;

+ 60 - 0
dashboard-v4/dashboard/src/components/corpus/NissayaAlignerModal.tsx

@@ -0,0 +1,60 @@
+import { Modal } from "antd";
+import NissayaAligner from "./NissayaAligner";
+import { useEffect, useState } from "react";
+import { IChannel } from "../channel/Channel";
+
+/** Props of the NissayaAlignerModal component. */
+interface IWidget {
+  /** Element rendered inline; clicking it opens the modal. */
+  trigger?: JSX.Element | string;
+  /** Sentence ids forwarded to the embedded NissayaAligner. */
+  sentencesId?: string[];
+  /** Accepted but not used by the component in this file. */
+  channel?: IChannel;
+  /** Externally controlled visibility; changes are mirrored into local state. */
+  open?: boolean;
+  /** Called (without arguments) after the modal has been closed. */
+  onClose?: Function;
+}
+
+/**
+ * Modal wrapper around NissayaAligner.
+ *
+ * The modal opens when the trigger is clicked or when the `open` prop turns
+ * true, and it closes on OK / cancel, after which `onClose` is invoked.
+ * Its content is destroyed on close.
+ */
+const NissayaAlignerModal = (props: IWidget) => {
+  const { trigger, sentencesId, open, onClose } = props;
+  const [isModalOpen, setIsModalOpen] = useState(open);
+
+  // Keep the local visibility in sync with the controlling prop.
+  useEffect(() => {
+    setIsModalOpen(open);
+  }, [open]);
+
+  // OK and cancel behave the same: hide the modal, then notify the parent.
+  const closeModal = () => {
+    setIsModalOpen(false);
+    if (onClose) {
+      onClose();
+    }
+  };
+
+  return (
+    <>
+      <span onClick={() => setIsModalOpen(true)}>{trigger}</span>
+      {/* NOTE(review): the title reads "copy between versions" and looks
+          carried over from another modal - confirm the intended wording. */}
+      <Modal
+        width={"95%"}
+        style={{ maxWidth: 1500 }}
+        title="版本间复制"
+        open={isModalOpen}
+        onOk={closeModal}
+        onCancel={closeModal}
+        destroyOnClose={true}
+        footer={[]}
+      >
+        <NissayaAligner sentencesId={sentencesId} />
+      </Modal>
+    </>
+  );
+};
+
+export default NissayaAlignerModal;