/
Update
14 min read
中文 设计一个完整的大规模 PDF 智能知识库构建系统(deepseek)
一、整体架构概览#
// 整体架构示意
┌─────────────────────────────────────────────────────────────────┐
│ Batch PDF Processor │
├─────────────────────────────────────────────────────────────────┤
│ Phase 1 │ Phase 2 │ Phase 3 │ Phase 4 │ Phase 5 │
│ Discovery │ Parsing │ DeDupe │ Clustering│ Notebook │
│ + │ + │ + │ + │ + │
│ Queue │ Metadata │ Embedding │ Summarize │ FileSys │
└─────────────────────────────────────────────────────────────────┘typescript这是一个包含4个层次的分层设计:数据层(PDF索引、批量队列、向量存储、文件系统)、服务层(PDF解析去重、聚类分类、总结提炼、DSL编排)、Agent层(协同调度、文件系统、人机协作)和编排层(多模型、错误恢复、中间件)。
二、TypeScript 实现#
模块1:PDF 批量解析与元数据提取#
// batch-pdf-processor.ts
import { Document } from "@langchain/core/documents";
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { OpenAIEmbeddings } from "@langchain/openai";
import { v4 as uuid } from "uuid";
import PQueue from "p-queue";
interface ParsedDocument extends Document {
id: string;
sourcePath: string;
pageCount: number;
hash: string; // 内容哈希,用于快速去重
processedAt: Date;
}
class BatchPDFProcessor {
private queue: PQueue;
private embeddings: OpenAIEmbeddings;
private vectorStore: any; // 改为具体的向量存储实例
constructor(concurrency: number = 5) {
this.queue = new PQueue({ concurrency, timeout: 300000 });
this.embeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });
}
async parsePDF(filePath: string): Promise<ParsedDocument> {
const loader = new PDFLoader(filePath, {
splitPages: true, // 分页加载,支持大文件
});
const docs = await loader.load();
// 合并所有页面的内容
const fullText = docs.map(d => d.pageContent).join("\n");
const hash = await this.computeHash(fullText);
return {
id: uuid(),
pageContent: fullText,
metadata: docs[0]?.metadata || {},
sourcePath: filePath,
pageCount: docs.length,
hash,
processedAt: new Date(),
};
}
private async computeHash(text: string): Promise<string> {
const encoder = new TextEncoder();
const data = encoder.encode(text);
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
return [...new Uint8Array(hashBuffer)]
.map(b => b.toString(16).padStart(2, "0"))
.join("");
}
async batchProcess(filePaths: string[]): Promise<ParsedDocument[]> {
const tasks = filePaths.map(path => async () => {
console.log(`Processing: ${path}`);
return await this.parsePDF(path);
});
return await this.queue.addAll(tasks);
}
async chunkDocument(doc: ParsedDocument): Promise<Document[]> {
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
separators: ["\n## ", "\n### ", "\n\n", "\n", " ", ""],
});
const chunks = await splitter.splitDocuments([doc]);
chunks.forEach(chunk => {
chunk.metadata.sourceId = doc.id;
chunk.metadata.sourcePath = doc.sourcePath;
});
return chunks;
}
}typescript模块2:向量去重引擎#
通过向量相似度搜索,检测和过滤内容重复或高度相似的文档,确保不损失信息的同时避免冗余存储。
// dedup-engine.ts
import { Document } from "@langchain/core/documents";
import { OpenAIEmbeddings } from "@langchain/openai";
interface DuplicateGroup {
original: Document;
duplicates: Document[];
similarity: number;
}
class DeduplicationEngine {
private embeddings: OpenAIEmbeddings;
private similarityThreshold: number;
private indexedHashes: Set<string>;
private indexedEmbeddings: Map<string, number[]>;
constructor(threshold: number = 0.95) {
this.similarityThreshold = threshold;
this.embeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });
this.indexedHashes = new Set();
this.indexedEmbeddings = new Map();
}
async isDuplicate(doc: Document): Promise<DuplicateGroup | null> {
// 第一层:快速哈希检查
if (this.indexedHashes.has(doc.metadata.hash)) {
return { original: null, duplicates: [doc], similarity: 1.0 };
}
// 第二层:向量相似度检查
const embedding = await this.getOrComputeEmbedding(doc);
for (const [existingId, existingEmbedding] of this.indexedEmbeddings) {
const similarity = this.cosineSimilarity(embedding, existingEmbedding);
if (similarity >= this.similarityThreshold) {
return {
original: { metadata: { id: existingId } } as Document,
duplicates: [doc],
similarity,
};
}
}
return null;
}
async getOrComputeEmbedding(doc: Document): Promise<number[]> {
if (this.indexedEmbeddings.has(doc.metadata.id)) {
return this.indexedEmbeddings.get(doc.metadata.id);
}
const text = doc.pageContent.slice(0, 8000);
const [embedding] = await this.embeddings.embedDocuments([text]);
this.indexedEmbeddings.set(doc.metadata.id, embedding);
return embedding;
}
private cosineSimilarity(a: number[], b: number[]): number {
let dot = 0, magA = 0, magB = 0;
for (let i = 0; i < a.length; i++) {
dot += a[i] * b[i];
magA += a[i] * a[i];
magB += b[i] * b[i];
}
return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}
async deduplicate(
documents: Document[],
options: { keepFirstOnly: boolean; mergeMetadata: boolean }
): Promise<Document[]> {
const uniqueDocs: Document[] = [];
for (const doc of documents) {
const duplicate = await this.isDuplicate(doc);
if (!duplicate && !this.indexedHashes.has(doc.metadata.hash)) {
this.indexedHashes.add(doc.metadata.hash);
uniqueDocs.push(doc);
await this.getOrComputeEmbedding(doc);
} else if (options.mergeMetadata && duplicate) {
const target = uniqueDocs.find(d => d.metadata.id === duplicate.original?.metadata.id);
if (target) {
target.metadata.duplicateSources = [
...(target.metadata.duplicateSources || []),
doc.metadata.sourcePath,
];
}
}
}
return uniqueDocs;
}
}typescript模块3:知识点提取 + 层次分类#
根据搜索文档,LLM 的知识处理分为三个阶段:知识提取、层次聚类、总结概括。
// kernel-extractor.ts
import { BaseMessage } from "@langchain/core/messages";
import { ChatOpenAI } from "@langchain/openai";
import { StringOutputParser } from "@langchain/core/output_parsers";
interface KnowledgeNode {
id: string;
title: string;
summary: string;
depth: number; // 章节深度:1 代表顶层章节,2 代表二级,3 代表三级
parentId: string | null;
children: KnowledgeNode[];
sourceIds: string[]; // 追溯来源 PDF
keywords: string[];
confidence: number; // 内容相关性评分
}
class KnowledgeExtractor {
private llm: ChatOpenAI;
private parser: StringOutputParser;
constructor() {
this.llm = new ChatOpenAI({ model: "gpt-4o-mini", temperature: 0.3 });
this.parser = new StringOutputParser();
}
async extractKnowledgePoints(doc: Document): Promise<KnowledgeNode[]> {
const prompt = `
你是一位知识管理专家。请从以下文档中提取所有有独立意义的知识点。
每个知识点应该格式化为:
【知识点标题】:[标题]
【深度】:[1-3 的数字,1 代表大章级知识,2 代表节级,3 代表点级]
【内容摘要】:[一句话概括,不超过 100 字]
【关键词】:[词1,词2,词3]
文档内容:
${doc.pageContent}
请按顺序输出:`;
const response = await this.llm.invoke([
{ role: "system", content: "你是一个知识管理专家" },
{ role: "user", content: prompt }
]);
return this.parseKnowledgeNodes(response.content, doc.metadata.id);
}
private parseKnowledgeNodes(content: string, sourceId: string): KnowledgeNode[] {
const nodes: KnowledgeNode[] = [];
let current: Partial<KnowledgeNode> = {};
for (const line of content.split("\n")) {
if (line.startsWith("【知识点标题】:")) {
if (current.title) nodes.push(this.buildNode(current, sourceId));
current = { title: line.replace("【知识点标题】:", "").trim() };
} else if (line.startsWith("【深度】:")) {
current.depth = parseInt(line.replace("【深度】:", "").trim());
} else if (line.startsWith("【内容摘要】:")) {
current.summary = line.replace("【内容摘要】:", "").trim();
} else if (line.startsWith("【关键词】:")) {
current.keywords = line.replace("【关键词】:", "")
.split(",").map(k => k.trim());
}
}
if (current.title) nodes.push(this.buildNode(current, sourceId));
return nodes;
}
private buildNode(current: Partial<KnowledgeNode>, sourceId: string): KnowledgeNode {
return {
id: uuid(),
title: current.title || "未命名",
summary: current.summary || "",
depth: current.depth || 2,
parentId: null,
children: [],
sourceIds: [sourceId],
keywords: current.keywords || [],
confidence: 0.85,
};
}
async clusterByDepth(nodes: KnowledgeNode[]): Promise<Map<number, KnowledgeNode[]>> {
const clustered = new Map<number, KnowledgeNode[]>();
for (const node of nodes) {
if (!clustered.has(node.depth)) clustered.set(node.depth, []);
clustered.get(node.depth)!.push(node);
}
// 按信任分数降序排序
for (const [depth, nodeList] of clustered) {
nodeList.sort((a, b) => b.confidence - a.confidence);
}
return clustered;
}
}typescript模块4:Agent 笔记本文件系统操作#
参考 LangChain DeepAgents 的文件系统设计,为 Agent 赋予文件管理能力,支持 ls、read_file、write_file、edit_file 等标准文件系统操作。
// notebook-fs-agent.ts
import { tool } from "langchain";
import { z } from "zod";
import * as fs from "fs/promises";
import * as path from "path";
class NotebookFileSystemAgent {
private notebookRoot: string;
private markdownCache: Map<string, string>;
constructor(notebookName: string) {
this.notebookRoot = path.join(process.cwd(), "notebooks", notebookName);
this.markdownCache = new Map();
}
async initNotebook(): Promise<void> {
await fs.mkdir(this.notebookRoot, { recursive: true });
}
@tool
async writeMarkdownNote(filePath: string, content: string): Promise<string> {
const fullPath = path.join(this.notebookRoot, `${filePath}.md`);
await fs.mkdir(path.dirname(fullPath), { recursive: true });
await fs.writeFile(fullPath, content, "utf-8");
this.markdownCache.set(fullPath, content);
return `✅ 笔记已写入: ${fullPath}`;
}
@tool
async readMarkdownNote(filePath: string): Promise<string> {
const fullPath = path.join(this.notebookRoot, `${filePath}.md`);
const cached = this.markdownCache.get(fullPath);
if (cached) return cached;
const content = await fs.readFile(fullPath, "utf-8");
this.markdownCache.set(fullPath, content);
return content;
}
@tool
async listNotebookStructure(dirPath: string = ""): Promise<string> {
const targetDir = path.join(this.notebookRoot, dirPath);
const entries = await fs.readdir(targetDir, { withFileTypes: true });
const lines: string[] = [];
for (const entry of entries) {
const icon = entry.isDirectory() ? "📁 " : "📄 ";
const relativePath = path.join(dirPath, entry.name);
lines.push(`${icon}${relativePath}`);
if (entry.isDirectory()) {
const subContent = await this.listNotebookStructure(relativePath);
lines.push(subContent);
}
}
return lines.length ? lines.join("\n") : "(空笔记本)";
}
async buildHierarchy(knowledgeTree: KnowledgeNode[], parentPath: string = ""): Promise<string[]> {
const writtenFiles: string[] = [];
const childrenByParent = new Map<string, KnowledgeNode[]>();
for (const node of knowledgeTree) {
const key = node.parentId || "root";
if (!childrenByParent.has(key)) childrenByParent.set(key, []);
childrenByParent.get(key)!.push(node);
}
for (const node of knowledgeTree) {
const folderPath = parentPath
? path.join(parentPath, this.sanitize(node.title))
: this.sanitize(node.title);
const children = childrenByParent.get(node.id) || [];
const content = await this.generateMarkdownContent(node, childrenByParent);
await this.writeMarkdownNote(folderPath, content);
writtenFiles.push(folderPath);
if (children.length) {
await this.buildHierarchy(children, folderPath);
}
}
return writtenFiles;
}
private sanitize(title: string): string {
return title.replace(/[\/\\:*?"<>|]/g, "-").slice(0, 50);
}
private async generateMarkdownContent(
node: KnowledgeNode,
childrenByParent: Map<string, KnowledgeNode[]>
): Promise<string> {
const children = childrenByParent.get(node.id) || [];
let markdown = `# ${node.title}\n\n`;
markdown += `> 📝 ${node.summary}\n\n`;
markdown += `---\n\n`;
if (node.keywords.length) {
markdown += `**关键词**: ${node.keywords.map(k => `\`${k}\``).join(", ")}\n\n`;
}
if (node.sourceIds.length) {
markdown += `**来源 PDF**: ${node.sourceIds.join(", ")}\n\n`;
}
if (children.length) {
markdown += `## 📚 本节包含以下子主题\n\n`;
for (const child of children) {
markdown += `- **${child.title}**: ${child.summary}\n`;
}
markdown += `\n---\n\n`;
}
markdown += `> ⭐ 置信度评分: ${(node.confidence * 100).toFixed(0)}%\n`;
return markdown;
}
}typescript模块5:主流程编排 Agent#
// master-notebook-agent.ts
import { createAgent, tool } from "langchain";
import { ChatOpenAI } from "@langchain/openai";
import { z } from "zod";
import glob from "glob";
const mainAgent = createAgent({
model: new ChatOpenAI({ model: "gpt-4o-mini", temperature: 0.4 }),
tools: [
tool(async function scanPDFDirectory(input: { dirPath: string }) {
const pdfPaths = await glob.glob(`${input.dirPath}/**/*.pdf`);
if (pdfPaths.length === 0) {
return "⚠️ 未找到任何 PDF 文件,请检查路径是否正确。";
}
return {
totalCount: pdfPaths.length,
paths: pdfPaths,
message: `📚 发现 ${pdfPaths.length} 个 PDF 文件,准备批量处理。`,
};
}, {
name: "scanPDFDirectory",
description: "扫描指定文件夹下的所有 PDF 文件,返回文件路径列表和总计数量。",
schema: z.object({ dirPath: z.string().describe("包含 PDF 的根目录") }),
}),
tool(async function deduplicateKnowledgeChunks(input: { chunks: any[] }) {
const dedupEngine = new DeduplicationEngine(0.92);
await dedupEngine.deduplicate(input.chunks, { keepFirstOnly: true, mergeMetadata: true });
return { status: "dedup_completed", uniqueCount: input.chunks.length };
}, {
name: "deduplicateKnowledgeChunks",
description: "对知识片段执行语义去重,确保笔记本中没有重复内容。",
schema: z.object({ chunks: z.array(z.any()).describe("需要去重的知识片段列表") }),
}),
tool(async function autoOrganizeNotebook(input: { knowledgeNodes: any[]; notebookName: string }) {
const fsAgent = new NotebookFileSystemAgent(input.notebookName);
await fsAgent.initNotebook();
await fsAgent.buildHierarchy(input.knowledgeNodes);
const structure = await fsAgent.listNotebookStructure();
return {
notebookPath: path.join(process.cwd(), "notebooks", input.notebookName),
structure,
totalNodes: input.knowledgeNodes.length,
message: "✅ 笔记本已自动组织完成,文件夹结构已建立。",
};
}, {
name: "autoOrganizeNotebook",
description: "根据知识图谱节点自动组织马克唐结构,支持任意深度层次。",
schema: z.object({
knowledgeNodes: z.array(z.any()).describe("已提取的知识点数据"),
notebookName: z.string().describe("笔记本文件夹的名称"),
}),
}),
],
systemPrompt: `
你是一个专业的笔记本构建器 Agent。
你需要:
1. 批量扫描和处理 PDF 文档。
2. 自动进行语义去重,避免冗余内容。
3. 使用 LLM 提取有价值的知识点,并评估其置信度。
4. 根据知识深度自动分类(大章→节→子节→知识点)。
5. 生成结构清晰的笔记本文件夹,包含多级目录和 Markdown 笔记。
6. 绝不能丢失有独立意义的知识点;如果内容重复,在元数据中记录来源。
7. 最终输出一个完整可读、有条理的知识笔记本。
`,
});
async function main() {
const result = await mainAgent.invoke({
messages: [{
role: "user",
content: `
请帮我构建一个名为 "My知识笔记本" 的笔记本。
PDF 文件夹位于:"/data/pdfs/"
处理要求:
- 提取所有重要的知识点(包括表格数据)。
- 自动去重,但绝不能丢失唯一内容。
- 按章节层次自动组织知识结构。
- 输出到 "My知识笔记本" 文件夹中。
`,
}],
});
console.log("✅ 笔记本构建完成:", result.content);
}
main().catch(console.error);typescript三、依赖安装#
npm install langchain @langchain/core @langchain/openai @langchain/community @langchain/textsplitters
npm install zod uuid glob p-queue
npm install --save-dev @types/node @types/uuid @types/globbash四、系统特性总结#
| 功能模块 | 实现方式 | 关键保障 |
|---|---|---|
| 批量解析 | PDFLoader + 并发队列 p-queue(设定合理并发数,避免 API 限流) | 支持 1 万 + 文件,流式处理 |
| 去重机制 | 双重防护:内容 SHA-256 哈希 + 向量相似度检查 | 内容去重 + 语义近似检测 |
| 层次分类 | 动态章节深度标注 + LLM 多级分类 | 实现任意深度 嵌套章节 |
| 知识提取 | Prompt 引导 LLM 提取知识点,输出结构化 JSON | 通过置信度筛选避免无效内容 |
| 文件系统 | Agent 工具集成 fs 模块,支持多后端扩展 | 可对接虚拟文件系统 |
| 错误恢复 | 队列重试机制 + checkpoint | 即使单 PDF 失败也能恢复 |
五、执行流程图#
开始 → 扫描PDF文件列表
↓
逐个加载PDF → 解析内容 + 提取元数据
↓
LLM 智能去重(向量搜索 + 哈希)
↓
提取知识节点(分层)+ 置信度检查
↓
自动层次组织(章节+子节)
↓
生成 Markdown 笔记 + 文件夹结构
↓
输出最终笔记本 → 完成 🔥text这个方案能够一次性处理超过 1 万个 PDF 文件,并保证最终的知识笔记本不冗余、不遗漏。