// NissayaAligner.tsx
  1. import _React, { useEffect, useState } from "react";
  2. import {
  3. Steps,
  4. Upload,
  5. Button,
  6. Table,
  7. Input,
  8. message,
  9. Typography,
  10. Space,
  11. } from "antd";
  12. import type { UploadChangeParam, UploadFile } from "antd/es/upload/interface";
  13. import {
  14. InboxOutlined,
  15. CopyOutlined,
  16. UpSquareOutlined,
  17. DownSquareOutlined,
  18. } from "@ant-design/icons";
  19. import { post } from "../../request";
  20. import type {
  21. ISentenceDiffRequest,
  22. ISentenceDiffResponse,
  23. } from "../../api/Corpus";
  // Short aliases for the antd sub-components used in the JSX below; each one
  // is a static member of its parent component.
  const Step = Steps.Step;
  const Dragger = Upload.Dragger;
  const TextArea = Input.TextArea;
  const Title = Typography.Title;
  /** One row of the uploaded word-by-word (nissaya) table. */
  interface WordData {
    /** 1-based row number assigned when the table is parsed. */
    id: number;
    /** Cell taken from the column whose header contains "pali". */
    pali: string;
    /** Cell taken from the column whose header contains "nissaya". */
    nissaya: string;
    /** Cell taken from the column whose header contains "note", when present. */
    note?: string;
  }

  /** A source sentence fetched from the server. */
  interface SentenceData {
    /** Built as `${book_id}-${paragraph}-${word_start}-${word_end}`. */
    id: string;
    /** Sentence text; empty string when the server row has no content. */
    content: string;
  }

  /** One line of the alignment result pasted back from the LLM. */
  interface AlignResult {
    /** Sentence id; matched against `SentenceData.id`. */
    id: string;
    /**
     * Sentence text echoed back by the LLM. The generated prompt asks for this
     * field, so it is declared here; nothing in the component requires it.
     */
    content?: string;
    /** Comma-separated `WordData.id` values assigned to the sentence. */
    words: string;
  }

  /** Props of the `NissayaAligner` component. */
  interface IWidget {
    /** Ids of the sentences to load; nothing is fetched when omitted. */
    sentencesId?: string[];
  }
  45. const NissayaAligner = ({ sentencesId }: IWidget) => {
  // Index of the wizard step currently shown.
  const [current, setCurrent] = useState(0);
  // Rows parsed from the uploaded word table.
  const [csvData, setCsvData] = useState<WordData[]>([]);
  // Raw text pasted into the "LLM result" text area.
  const [jsonlInput, setJsonlInput] = useState("");
  // Alignment records parsed from `jsonlInput`.
  const [alignResults, setAlignResults] = useState<AlignResult[]>([]);
  // Source sentences loaded from the server for `sentencesId`.
  const [original, setOriginal] = useState<SentenceData[]>([]);
  // Load the source sentences for `sentencesId` from the "_System_Pali_VRI_"
  // channel whenever the prop changes, and store them ordered by book,
  // paragraph and word position.
  useEffect(() => {
    if (!sentencesId) {
      return;
    }
    // Flipped by the cleanup below so that a response arriving after the prop
    // changed (or after unmount) cannot overwrite newer state.
    let cancelled = false;
    post<ISentenceDiffRequest, ISentenceDiffResponse>(`/v2/sent-in-channel`, {
      sentences: sentencesId,
      channels: ["_System_Pali_VRI_"],
    })
      .then((json) => {
        if (cancelled) {
          return;
        }
        if (!json.ok) {
          console.error("Loading sentences failed:", json);
          return;
        }
        // Sort a copy: Array.prototype.sort reorders in place and the response
        // object is not ours to mutate.
        const rows = [...json.data.rows].sort(
          (a, b) =>
            a.book_id - b.book_id ||
            a.paragraph - b.paragraph ||
            a.word_start - b.word_start
        );
        setOriginal(
          rows.map((item) => ({
            id: `${item.book_id}-${item.paragraph}-${item.word_start}-${item.word_end}`,
            content: item.content ?? "",
          }))
        );
      })
      .catch((err: unknown) => {
        // Without a handler a failed request is an unhandled rejection.
        if (!cancelled) {
          console.error("Loading sentences failed:", err);
        }
      });
    return () => {
      cancelled = true;
    };
  }, [sentencesId]);
  /**
   * antd Upload `onChange` handler: reads the file that triggered the event as
   * UTF-8 text and passes the text to `parseCSV`.
   *
   * @param info Change event from the antd Upload/Dragger component.
   */
  const handleUpload = (info: UploadChangeParam<UploadFile<any>>) => {
    console.log("Upload change event:", info);
    const changed = info.file;
    // `info.fileList` keeps every file chosen so far, so its first entry is
    // the oldest selection, not the one just picked. Resolve the file that
    // caused this event instead, matching list entries by uid.
    // NOTE(review): with `beforeUpload` returning false antd appears to pass
    // the raw File as `info.file` (no `originFileObj`); the last fallback
    // covers that case. Confirm against the antd version in use.
    const file =
      changed.originFileObj ??
      info.fileList?.find((entry) => entry.uid === changed.uid)
        ?.originFileObj ??
      (changed instanceof Blob ? changed : undefined);
    if (!file) {
      console.error("No valid file found in upload event:", info);
      message.error("未检测到文件,请重新选择");
      return;
    }
    console.log("Selected file:", file.name, file);
    const reader = new FileReader();
    reader.onload = () => {
      const text = reader.result;
      // readAsText yields a string; anything else means the read did not
      // produce usable text.
      if (typeof text !== "string") {
        console.error("❌ File read error: result is not text");
        message.error("读取文件失败");
        return;
      }
      console.log("✅ File read complete. Length:", text.length);
      parseCSV(text);
    };
    reader.onerror = (err) => {
      console.error("❌ File read error:", err);
      message.error("读取文件失败");
    };
    console.log("📖 Start reading file as text...");
    reader.readAsText(file, "utf-8");
  };
  /**
   * Splits delimited text into rows of trimmed cells.
   *
   * A cell that starts with a double quote is read up to its closing quote, so
   * it may contain the delimiter, line breaks and doubled quotes (`""` is one
   * literal quote). A quote elsewhere in a cell is dropped. Rows whose cells
   * are all empty are skipped.
   */
  const splitDelimitedRows = (text: string, delimiter: string): string[][] => {
    const rows: string[][] = [];
    let row: string[] = [];
    let cell = "";
    let inQuotes = false;
    const endCell = () => {
      row.push(cell.trim());
      cell = "";
    };
    for (let i = 0; i < text.length; i++) {
      const ch = text.charAt(i);
      if (inQuotes) {
        if (ch !== '"') {
          cell += ch;
        } else if (text.charAt(i + 1) === '"') {
          // Doubled quote inside a quoted cell is an escaped literal quote.
          cell += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else if (ch === '"') {
        // Only a quote at the start of a cell opens a quoted section.
        if (cell.trim() === "") {
          inQuotes = true;
          cell = "";
        }
      } else if (ch === delimiter) {
        endCell();
      } else if (ch === "\n" || ch === "\r") {
        if (ch === "\r" && text.charAt(i + 1) === "\n") {
          i++;
        }
        endCell();
        rows.push(row);
        row = [];
      } else {
        cell += ch;
      }
    }
    // Flush the last row when the text does not end with a line break.
    endCell();
    rows.push(row);
    return rows.filter((cells) => cells.some((value) => value !== ""));
  };

  /**
   * Parses uploaded CSV/TSV text into `WordData` rows and stores them in state.
   *
   * The delimiter is TAB when the header line contains a tab, otherwise comma.
   * Columns are located by a case-insensitive substring match of the header
   * against "pali", "nissaya" and "note"; `id` is the 1-based data-row number.
   * When neither a pali nor a nissaya column exists an error is shown and the
   * state is left untouched.
   *
   * @param text Full text of the uploaded file.
   */
  const parseCSV = (text: string) => {
    console.log(
      "Parsing CSV... Raw preview (first 300 chars):",
      text.slice(0, 300)
    );
    // Drop a leading byte-order mark so it cannot end up in the first header.
    const content = text.replace(/^\uFEFF/, "").trim();
    // Decide from the header line only: a tab inside some data cell must not
    // switch a comma-separated file into TAB mode.
    const headerLine = content.split(/\r?\n/, 1)[0] ?? "";
    const delimiter = headerLine.includes("\t") ? "\t" : ",";
    console.log("Detected delimiter:", delimiter === "\t" ? "TAB" : "COMMA");
    const [headers = [], ...bodyRows] = splitDelimitedRows(content, delimiter);
    console.log("Detected headers:", headers);
    // Resolve each column once instead of once per row.
    const columnOf = (keyword: string): number =>
      headers.findIndex((h) => h.toLowerCase().includes(keyword));
    const paliCol = columnOf("pali");
    const nissayaCol = columnOf("nissaya");
    const noteCol = columnOf("note");
    if (paliCol === -1 && nissayaCol === -1) {
      console.error("No pali/nissaya column in headers:", headers);
      message.error("CSV 缺少 pali / nissaya 列");
      return;
    }
    const cellAt = (cols: string[], index: number): string =>
      index >= 0 ? (cols[index] ?? "") : "";
    const data: WordData[] = bodyRows.map((cols, i) => ({
      id: i + 1,
      pali: cellAt(cols, paliCol),
      nissaya: cellAt(cols, nissayaCol),
      note: cellAt(cols, noteCol),
    }));
    console.log("✅ Parsed CSV rows count:", data.length);
    console.table(data.slice(0, 5));
    setCsvData(data);
    message.success(`CSV 文件解析成功,共 ${data.length} 行`);
  };
  /**
   * Builds the LLM prompt: the loaded sentences as JSONL, the parsed word
   * table as CSV, followed by the fixed alignment instructions.
   *
   * @returns The complete prompt text.
   */
  const generatePrompt = (): string => {
    // JSON.stringify escapes quotes, backslashes and line breaks, so every
    // sentence stays one valid JSON line whatever its text contains.
    const sentenceJsonl = original
      .map(
        (s) =>
          `{"id": ${JSON.stringify(s.id)}, "content": ${JSON.stringify(
            s.content
          )}}`
      )
      .join("\n");
    // CSV escaping: wrap the value in quotes and double any embedded quote.
    const csvField = (value: string): string =>
      `"${value.replace(/"/g, '""')}"`;
    const csvText = [
      "id,pali,nissaya,note",
      ...csvData.map((r) =>
        [
          String(r.id),
          csvField(r.pali),
          csvField(r.nissaya),
          csvField(r.note ?? ""),
        ].join(",")
      ),
    ].join("\n");
    const prompt =
      "将逐词解析数据与句子对应,一个句子对多个逐词解析数据。不是每个单词都有逐词解析数据,保持逐词解析数据的顺序不变,不可以有遗漏。对齐结果jsonl格式,每行一个句子,每个句子有三个字段 , id, content,words. 前两个字段与句子数据相同。逐词解析数据放在words字段中 , words字段里的数据为逐词解析数据的id字段,多个单词之间用逗号隔开";
    return `# 句子数据\n\n\n\`\`\`jsonl\n${sentenceJsonl}\n\`\`\`\n\n# 逐词解析数据\n\n\`\`\`csv\n${csvText}\n\`\`\`\n\n${prompt}`;
  };
  /**
   * Parses the pasted LLM output (`jsonlInput`) into alignment records, stores
   * them and advances to the preview step.
   *
   * Blank lines and Markdown code-fence lines are ignored. Every remaining
   * line must be a JSON object with an `id` (string or number) and a `words`
   * value that is a string, a number, an array of strings/numbers, or absent;
   * `words` is normalised to a comma-separated string. Any other input shows
   * the format error and leaves the state unchanged.
   */
  const parseJsonlResults = () => {
    try {
      console.log("Parsing JSONL input...");
      const lines = jsonlInput
        .split(/\r?\n/)
        .map((line) => line.trim())
        // LLM answers are commonly wrapped in ``` fences and padded with
        // empty lines; neither is a record.
        .filter((line) => line !== "" && !line.startsWith("```"));
      if (lines.length === 0) {
        throw new Error("no JSONL records found");
      }
      const isScalar = (value: unknown): value is string | number =>
        typeof value === "string" || typeof value === "number";
      const results: AlignResult[] = lines.map((line, index) => {
        const record: unknown = JSON.parse(line);
        if (
          typeof record !== "object" ||
          record === null ||
          Array.isArray(record)
        ) {
          throw new Error(`line ${index + 1} is not a JSON object`);
        }
        const { id, words } = record as { id?: unknown; words?: unknown };
        if (!isScalar(id)) {
          throw new Error(`line ${index + 1} has no usable "id"`);
        }
        // Normalise here so later code can rely on `words` being a string.
        let wordList: string;
        if (words === undefined || words === null) {
          wordList = "";
        } else if (isScalar(words)) {
          wordList = String(words);
        } else if (Array.isArray(words) && words.every(isScalar)) {
          wordList = words.join(",");
        } else {
          throw new Error(`line ${index + 1} has an unsupported "words" value`);
        }
        return { id: String(id), words: wordList };
      });
      console.log("Parsed results:", results);
      setAlignResults(results);
      message.success("结果解析成功");
      setCurrent(3);
    } catch (err) {
      console.error("❌ JSONL parse error:", err);
      message.error("JSONL 格式错误");
    }
  };
  /**
   * Moves one word id across a sentence boundary.
   *
   * "prev" takes the first word of the sentence and appends it to the previous
   * sentence; "next" takes the last word and prepends it to the following
   * sentence. Nothing changes when the neighbour does not exist or the
   * sentence has no words.
   *
   * @param sentenceIndex Index in `alignResults` of the sentence giving up a word.
   * @param direction Which neighbouring sentence receives the word.
   */
  const moveWord = (sentenceIndex: number, direction: "prev" | "next") => {
    console.log(
      `Moving word: sentenceIndex=${sentenceIndex}, direction=${direction}`
    );
    const targetIndex =
      direction === "prev" ? sentenceIndex - 1 : sentenceIndex + 1;
    // "".split(",") is [""]: drop empty tokens so an empty sentence never
    // contributes a stray comma (",5") or a phantom word.
    const splitIds = (words: string): string[] =>
      words
        .split(",")
        .map((token) => token.trim())
        .filter((token) => token !== "");
    // Functional update: works from the latest state and returns new objects
    // instead of mutating the ones already held in state.
    setAlignResults((previous) => {
      const source = previous[sentenceIndex];
      const target = previous[targetIndex];
      if (!source || !target) {
        return previous;
      }
      const sourceIds = splitIds(source.words);
      const moving =
        direction === "prev" ? sourceIds.shift() : sourceIds.pop();
      if (moving === undefined) {
        return previous;
      }
      const targetIds = splitIds(target.words);
      if (direction === "prev") {
        targetIds.push(moving);
      } else {
        targetIds.unshift(moving);
      }
      return previous.map((entry, index) => {
        if (index === sentenceIndex) {
          return { ...entry, words: sourceIds.join(",") };
        }
        if (index === targetIndex) {
          return { ...entry, words: targetIds.join(",") };
        }
        return entry;
      });
    });
  };
  // The prompt can be large; build it only while its step (index 1) is shown
  // rather than on every render, e.g. on each keystroke in the paste box.
  const promptText = current === 1 ? generatePrompt() : "";

  // Row lookup by id for the preview, instead of scanning `csvData` per word.
  const wordsById = new Map(csvData.map((row) => [row.id, row] as const));

  /** Copies the prompt to the clipboard and reports the actual outcome. */
  const copyPrompt = async (): Promise<void> => {
    try {
      // Rejects when permission is denied, and throws when
      // `navigator.clipboard` is unavailable; both end up in the catch.
      await navigator.clipboard.writeText(promptText);
      message.success("提示词已复制");
    } catch (err) {
      console.error("Copying prompt failed:", err);
      message.error("复制失败");
    }
  };

  // Title and body of each wizard step, indexed by `current`.
  const steps = [
    {
      title: "上传 CSV",
      content: (
        <>
          <Dragger
            accept=".csv,.tsv,.txt"
            showUploadList={false}
            beforeUpload={() => false}
            onChange={handleUpload}
          >
            <p className="ant-upload-drag-icon">
              <InboxOutlined />
            </p>
            <p className="ant-upload-text">点击或拖拽上传 CSV 文件</p>
          </Dragger>
          {csvData.length > 0 && (
            <Table
              dataSource={csvData}
              rowKey="id"
              pagination={{ pageSize: 50 }}
              scroll={{ y: 340 }}
              columns={[
                { title: "行号", dataIndex: "id", width: 120 },
                { title: "Pali", dataIndex: "pali", width: 420 },
                { title: "Nissaya", dataIndex: "nissaya" },
              ]}
            />
          )}
        </>
      ),
    },
    {
      title: "生成提示词",
      content: (
        <>
          <Title level={5}>生成的提示词:</Title>
          <TextArea rows={20} value={promptText} readOnly />
          <Button
            icon={<CopyOutlined />}
            onClick={() => {
              void copyPrompt();
            }}
          >
            复制提示词
          </Button>
        </>
      ),
    },
    {
      title: "粘贴 LLM 结果",
      content: (
        <>
          <TextArea
            rows={12}
            placeholder="粘贴 LLM 输出的 JSONL 结果"
            value={jsonlInput}
            onChange={(e) => setJsonlInput(e.target.value)}
          />
          <Button type="primary" onClick={parseJsonlResults}>
            解析结果
          </Button>
        </>
      ),
    },
    {
      title: "对齐预览",
      content: (
        <>
          {alignResults.map((res, idx) => {
            const sentence = original.find((s) => s.id === res.id);
            // `words` comes from pasted JSON, so coerce it before splitting;
            // empty tokens and ids with no matching row are dropped.
            const wordList = String(res.words ?? "")
              .split(",")
              .map((token) => token.trim())
              .filter((token) => token !== "")
              .map((token) => wordsById.get(Number(token)))
              .filter((w): w is WordData => w !== undefined);
            return (
              // The index keeps keys unique if the pasted result repeats an id.
              <div key={`${res.id}-${idx}`} style={{ marginBottom: 24 }}>
                <Title level={5}>
                  {res.id} — {sentence?.content}
                </Title>
                <Space wrap>
                  {wordList.map((w, i) => {
                    const isFirst = i === 0;
                    const isLast = i === wordList.length - 1;
                    return (
                      <Button
                        key={`${w.id}-${i}`}
                        type={isFirst || isLast ? "primary" : "default"}
                        icon={isFirst ? <UpSquareOutlined /> : undefined}
                        onClick={() => {
                          // Exactly one move per click. A word that is both
                          // first and last goes to the previous sentence,
                          // or to the next one when there is no previous.
                          if (isFirst && (idx > 0 || !isLast)) {
                            moveWord(idx, "prev");
                          } else if (isLast) {
                            moveWord(idx, "next");
                          }
                        }}
                      >
                        {`${w.pali} (${w.nissaya})`}
                        {isLast && (
                          <DownSquareOutlined style={{ marginLeft: 4 }} />
                        )}
                      </Button>
                    );
                  })}
                </Space>
              </div>
            );
          })}
        </>
      ),
    },
  ];
  // Navigation state and handlers for the wizard footer.
  const hasPrevious = current > 0;
  const hasNext = current < steps.length - 1;
  const goToPrevious = () => setCurrent(current - 1);
  const goToNext = () => setCurrent(current + 1);

  // Step indicator, the body of the active step, then the back/next buttons.
  return (
    <div style={{ padding: 24 }}>
      <Steps current={current}>
        {steps.map(({ title }) => (
          <Step key={title} title={title} />
        ))}
      </Steps>
      <div style={{ marginTop: 24 }}>{steps[current].content}</div>
      <div style={{ marginTop: 24 }}>
        {hasPrevious ? <Button onClick={goToPrevious}>上一步</Button> : null}
        {hasNext ? (
          <Button type="primary" style={{ marginLeft: 8 }} onClick={goToNext}>
            下一步
          </Button>
        ) : null}
      </div>
    </div>
  );
  319. };
  // The component is this module's default export.
  export { NissayaAligner as default };