/
Update
25 min read
中文 设计一个完整的大规模 PDF 智能知识库构建系统(kimi)
处理 1 万+ PDF 的核心难点在于:内存管理、去重精度、知识抽取质量、嵌套分类的合理性。下面提供一个完整的生产级架构方案,包含代码实现。
## 一、整体架构设计
┌─────────────────────────────────────────────────────────────┐
│ 批量 PDF 知识笔记本生成器 │
├─────────────────────────────────────────────────────────────┤
│ Phase 1: ingestion → 批量加载 + 元数据提取 │
│ Phase 2: extraction → 结构化知识抽取(每PDF独立) │
│ Phase 3: deduplication → 语义去重 + 质量评分 │
│ Phase 4: clustering → 层次聚类生成笔记本结构 │
│ Phase 5: generation → 生成 Markdown 文件 │
└─────────────────────────────────────────────────────────────┘

## 二、项目初始化
mkdir pdf-knowledge-notebook
cd pdf-knowledge-notebook
npm init -y
# Core dependencies. @langchain/core is imported directly by the source files
# (documents, prompts, output parsers), so it is declared explicitly.
npm install langchain @langchain/core @langchain/openai @langchain/community
npm install pdf-parse
# Data handling. dotenv is imported by src/config/index.ts to load .env.
npm install zod uuid dotenv
npm install -D typescript ts-node @types/node @types/uuid
# Vector database (persistent storage for ten-thousand-scale data).
# @langchain/community is already installed above, so it is not repeated here.
npm install chromadb
npx tsc --init

tsconfig.json
{
"compilerOptions": {
"target": "ES2022",
"module": "commonjs",
"lib": ["ES2022"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"resolveJsonModule": true,
"moduleResolution": "node"
},
"include": ["src/**/*"]
}

## 三、核心类型定义 src/types/index.ts
import { z } from "zod";
// ==================== Knowledge unit ====================
/**
 * Runtime schema for one extracted knowledge unit.
 */
export const KnowledgeUnitSchema = z.object({
  /** Unique identifier. */
  id: z.string(),
  /** Source PDF the unit came from. */
  sourcePdf: z.string(),
  /** Page range the unit was extracted from. */
  pageRange: z.string(),
  /** Title of the knowledge point. */
  title: z.string(),
  /** Detailed content. */
  content: z.string(),
  /** Keyword tags. */
  keywords: z.array(z.string()),
  /** Importance score, constrained to the range 1..10. */
  importance: z.number().gte(1).lte(10),
  /** Coarse category (used for first-pass grouping). */
  category: z.string(),
  /** Embedding vector (used for deduplication); may be absent. */
  embedding: z.array(z.number()).optional(),
});
/** Static type inferred from the schema above. */
export type KnowledgeUnit = z.infer<typeof KnowledgeUnitSchema>;
// ==================== Notebook structure ====================
/**
 * One node of the notebook outline.
 *
 * The type is declared by hand instead of being inferred with `z.infer`:
 * TypeScript cannot infer the type of a schema that refers to itself through
 * `z.lazy`, and under `strict` the inferred form fails to compile.
 */
export interface NoteSection {
  id: string;
  title: string;
  /** Nesting depth: 1, 2, 3, ... */
  level: number;
  /** Short summary of the section. */
  summary: string;
  /** Nested sub-sections, if any. */
  children?: NoteSection[];
  /** IDs of the knowledge units attached to this section. */
  knowledgeIds: string[];
}
/**
 * Runtime schema matching {@link NoteSection}. The explicit `z.ZodType`
 * annotation is the type hint that lets the recursive `z.lazy` reference
 * type-check.
 */
export const NoteSectionSchema: z.ZodType<NoteSection> = z.object({
  id: z.string(),
  title: z.string(),
  level: z.number(),
  summary: z.string(),
  children: z.array(z.lazy(() => NoteSectionSchema)).optional(),
  knowledgeIds: z.array(z.string()),
});
/**
 * Runtime schema for a complete generated notebook.
 */
export const NotebookSchema = z.object({
  /** Notebook display name. */
  name: z.string(),
  /** Free-text description. */
  description: z.string(),
  /** Creation timestamp, stored as a string. */
  createdAt: z.string(),
  /** Number of source documents. */
  totalSources: z.number(),
  /** Number of knowledge units in the notebook. */
  totalKnowledgeUnits: z.number(),
  /** Top-level sections of the outline. */
  rootSections: NoteSectionSchema.array(),
});
export type Notebook = z.infer<typeof NotebookSchema>;

## 四、配置与环境 src/config/index.ts
import { config } from "dotenv";
config();
/** OpenAI credentials and model names. */
const openaiSettings = {
  apiKey: process.env.OPENAI_API_KEY!,
  // Cost-effective model
  model: "gpt-4o-mini",
  // Higher-quality model (used for the key steps)
  smartModel: "gpt-4o",
  embeddingModel: "text-embedding-3-small",
} as const;

/** Batch-processing parameters (bound memory use and API request rate). */
const batchSettings = {
  // Number of PDFs handled per batch
  pdfBatchSize: 50,
  // Intended number of concurrent extractions
  // NOTE(review): not referenced by any code shown in this article — confirm.
  extractionConcurrency: 10,
  // Embedding request batch size
  embeddingBatchSize: 100,
  // Similarity threshold for deduplication (0-1)
  dedupThreshold: 0.92,
  // Minimum importance a unit needs to be kept
  minImportance: 4,
} as const;

/** Output settings. */
const outputSettings = {
  baseDir: "./output/notebook",
  // Maximum characters per Markdown file
  // NOTE(review): not referenced by any code shown in this article — confirm.
  maxMdFileSize: 50000,
} as const;

/**
 * Central, read-only application configuration.
 */
export const CONFIG = {
  openai: openaiSettings,
  batch: batchSettings,
  output: outputSettings,
} as const;
if (!CONFIG.openai.apiKey) {
throw new Error("OPENAI_API_KEY is required");
}

## 五、PDF 批量加载器 src/ingestion/pdfBatchLoader.ts
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { Document } from "@langchain/core/documents";
import * as fs from "fs/promises";
import * as path from "path";
/**
 * Metadata describing one PDF file found on disk.
 */
export interface PDFMeta {
  /** Full path to the file. */
  filePath: string;
  /** File name including its extension. */
  fileName: string;
  /** File size as reported by `fs.stat`. */
  fileSize: number;
  /** Number of pages; only known once the PDF has been loaded. */
  totalPages?: number;
}
/**
 * Recursively collects every PDF file below a directory.
 *
 * Sub-directories are descended into in the order `readdir` returns them;
 * a file counts as a PDF when its name ends with ".pdf" (case-insensitive).
 *
 * @param dirPath Directory to scan; resolved to an absolute path first.
 * @returns One metadata record per PDF found (path, name, size).
 */
export async function scanPDFDirectory(dirPath: string): Promise<PDFMeta[]> {
  const collect = async (folder: string, sink: PDFMeta[]): Promise<void> => {
    const dirents = await fs.readdir(folder, { withFileTypes: true });
    for (const dirent of dirents) {
      const target = path.join(folder, dirent.name);
      if (dirent.isDirectory()) {
        await collect(target, sink);
        continue;
      }
      const isPdf = dirent.isFile() && dirent.name.toLowerCase().endsWith(".pdf");
      if (!isPdf) {
        continue;
      }
      const { size } = await fs.stat(target);
      sink.push({ filePath: target, fileName: dirent.name, fileSize: size });
    }
  };

  const pdfs: PDFMeta[] = [];
  await collect(path.resolve(dirPath), pdfs);
  console.log(`📁 扫描完成,发现 ${pdfs.length} 个PDF文件`);
  return pdfs;
}
/**
 * Loads PDFs batch by batch so that only one batch of parsed documents is
 * produced at a time.
 *
 * Every PDF inside a batch is loaded concurrently via `Promise.all`. A file
 * that fails to load is logged and reported with an empty document list, and
 * such entries are filtered out before the batch is yielded.
 *
 * @param pdfMetas Metadata of the PDFs to load; `totalPages` is filled in on
 *   each entry that loads successfully.
 * @param batchSize Number of PDFs per yielded batch.
 * @returns An async generator yielding, per batch, the successfully loaded
 *   PDFs together with their documents.
 */
export async function* loadPDFBatch(
pdfMetas: PDFMeta[],
batchSize: number
): AsyncGenerator<{ meta: PDFMeta; documents: Document[] }[]> {
for (let i = 0; i < pdfMetas.length; i += batchSize) {
const batch = pdfMetas.slice(i, i + batchSize);
console.log(`\n📦 加载批次 ${Math.floor(i / batchSize) + 1}/${Math.ceil(pdfMetas.length / batchSize)}`);
const results = await Promise.all(
batch.map(async (meta) => {
try {
// NOTE(review): splitPages presumably yields one Document per page, which
// is why docs.length is used as the page count below — confirm.
const loader = new PDFLoader(meta.filePath, { splitPages: true });
const docs = await loader.load();
// Attach source metadata to every document
docs.forEach((doc) => {
doc.metadata.source = meta.filePath;
doc.metadata.fileName = meta.fileName;
});
meta.totalPages = docs.length;
console.log(` ✅ ${meta.fileName} (${docs.length}页)`);
return { meta, documents: docs };
} catch (error) {
// A failed file must not abort the batch: log it and return no documents
console.error(` ❌ ${meta.fileName} 加载失败:`, (error as Error).message);
return { meta, documents: [] };
}
})
);
// Drop files that failed to load or produced no documents
yield results.filter((r) => r.documents.length > 0);
}
}typescript六、结构化知识抽取 src/extraction/knowledgeExtractor.ts#
import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { Document } from "@langchain/core/documents";
import { StringOutputParser } from "@langchain/core/output_parsers";
import { z } from "zod";
import { KnowledgeUnit } from "../types";
import { CONFIG } from "../config";
import { v4 as uuidv4 } from "uuid";
// Schema of a single knowledge unit as returned by the model
const ExtractedUnitSchema = z.object({
  title: z.string().describe("知识点的简洁标题"),
  content: z.string().describe("详细内容,保留关键信息"),
  keywords: z.array(z.string()).describe("3-5个关键词"),
  importance: z.number().min(1).max(10).describe("重要性1-10"),
  category: z.string().describe("粗分类,如:技术/概念/流程/案例/数据"),
});
// Schema of the whole JSON payload requested from the model
const ExtractionSchema = z.object({
  knowledgeUnits: z.array(ExtractedUnitSchema),
});
type ExtractionResult = z.infer<typeof ExtractionSchema>;
/**
* 智能知识抽取器
* 将PDF文档内容转化为结构化知识单元
*/
export class KnowledgeExtractor {
private model: ChatOpenAI;
private parser: StringOutputParser;
/**
 * Creates the output parser and the chat model (the "smart" model from
 * CONFIG, temperature 0.2, up to 3 retries).
 */
constructor() {
  const { apiKey, smartModel } = CONFIG.openai;
  this.parser = new StringOutputParser();
  this.model = new ChatOpenAI({
    apiKey,
    modelName: smartModel,
    temperature: 0.2,
    maxRetries: 3,
  });
}
/**
 * Extracts knowledge units from the pages of one PDF.
 *
 * Pages are processed in groups of `groupSize`; each group is sent to the
 * model separately. A group that fails is logged and skipped so the
 * remaining groups are still processed. Units whose importance is below
 * `CONFIG.batch.minImportance` are dropped from the result.
 *
 * @param fileName Name of the source PDF (used in logs and stored on each unit).
 * @param documents Page documents of the PDF.
 * @returns The extracted units that pass the importance filter.
 */
async extractFromDocuments(
  fileName: string,
  documents: Document[]
): Promise<KnowledgeUnit[]> {
  // Single source of truth for the group size; it also drives the fallback
  // page numbers below, so the two can never drift apart.
  const groupSize = 3;
  const pageGroups = this.groupPages(documents, groupSize);
  const allKnowledge: KnowledgeUnit[] = [];
  for (const [groupIndex, group] of pageGroups.entries()) {
    // Fallbacks apply only when the loader supplied no page number. The end
    // fallback uses the real group length, so a short final group does not
    // report pages that do not exist.
    const firstFallback = groupIndex * groupSize + 1;
    const lastFallback = groupIndex * groupSize + group.length;
    const firstPage = group[0].metadata.loc?.pageNumber ?? firstFallback;
    const lastPage = group[group.length - 1].metadata.loc?.pageNumber ?? lastFallback;
    const pageRange = `${firstPage}-${lastPage}`;
    const text = group.map((d) => d.pageContent).join("\n\n");
    try {
      const extracted = await this.extractFromText(text, fileName, pageRange);
      allKnowledge.push(...extracted);
    } catch (error) {
      // Best effort: one failing group must not discard the rest of the PDF
      console.error(` ⚠️ 提取失败 (${fileName} 第${pageRange}页):`, error);
    }
  }
  // Filter out low-importance content
  const filtered = allKnowledge.filter((k) => k.importance >= CONFIG.batch.minImportance);
  console.log(` 📝 提取 ${allKnowledge.length} 条知识,保留 ${filtered.length} 条(重要性≥${CONFIG.batch.minImportance})`);
  return filtered;
}
/**
 * Runs the extraction prompt on one block of text and converts the reply
 * into knowledge units.
 *
 * Only the first 12000 characters of `text` are sent. The reply is cleaned
 * with `cleanJSON`, parsed as JSON and validated against `ExtractionSchema`;
 * a parse or validation failure throws.
 *
 * @param text Text of the page group.
 * @param sourcePdf Source identifier stored on every returned unit.
 * @param pageRange Page range stored on every returned unit.
 * @returns The validated units, each with a fresh UUID.
 */
private async extractFromText(
  text: string,
  sourcePdf: string,
  pageRange: string
): Promise<KnowledgeUnit[]> {
  const template = `
你是一个专业的知识提取专家。请从以下PDF文档内容中提取有价值的知识点。
提取要求:
1. 每个知识点必须是独立、完整的信息单元
2. 标题要简洁明确(不超过20字)
3. 内容要详细完整,保留关键数据、定义、步骤
4. 重要性评分标准:
- 10分:核心概念、关键定理、重要数据
- 7-9分:重要方法、显著案例、关键流程
- 4-6分:补充说明、背景信息、次要细节
- 1-3分:冗余信息、重复内容(不要提取)
5. 如果内容主要是目录、页眉页脚、版权信息,返回空数组
6. 去重:如果多个知识点内容高度相似,只保留最完整的一个
文档来源:{sourcePdf}
页码范围:{pageRange}
文档内容:
{text}
请严格按以下JSON格式返回(不要有任何其他文字):
{{
"knowledgeUnits": [
{{
"title": "知识点标题",
"content": "详细内容...",
"keywords": ["关键词1", "关键词2", "关键词3"],
"importance": 8,
"category": "技术"
}}
]
}}
`;
  const pipeline = ChatPromptTemplate.fromTemplate(template)
    .pipe(this.model)
    .pipe(this.parser);
  const rawReply = await pipeline.invoke({
    text: text.slice(0, 12000), // cap the input length
    sourcePdf,
    pageRange,
  });
  // The model may wrap the JSON in a code fence or extra text; clean it first
  const payload = ExtractionSchema.parse(JSON.parse(this.cleanJSON(rawReply)));
  const units: KnowledgeUnit[] = [];
  for (const { title, content, keywords, importance, category } of payload.knowledgeUnits) {
    units.push({
      id: uuidv4(),
      sourcePdf,
      pageRange,
      title,
      content,
      keywords,
      importance,
      category,
    });
  }
  return units;
}
/**
 * Splits the page documents into consecutive groups of at most `groupSize`.
 *
 * @param docs Page documents in order.
 * @param groupSize Maximum number of pages per group.
 * @returns The groups, in the original page order.
 */
private groupPages(docs: Document[], groupSize: number): Document[][] {
  const groups: Document[][] = [];
  let start = 0;
  while (start < docs.length) {
    groups.push(docs.slice(start, start + groupSize));
    start += groupSize;
  }
  return groups;
}
/**
 * Extracts the JSON object text from a raw model reply.
 *
 * Removes "```json" markers and a trailing "```" fence, trims the result,
 * and if it is not already wrapped in braces falls back to the span from
 * the first "{" to the last "}". When no such span exists the trimmed text
 * is returned unchanged.
 *
 * @param raw Raw model output.
 * @returns The text to hand to `JSON.parse`.
 */
private cleanJSON(raw: string): string {
  const stripped = raw
    .replace(/```json\s*/g, "")
    .replace(/```\s*$/g, "")
    .trim();
  const looksLikeObject = stripped.startsWith("{") && stripped.endsWith("}");
  if (looksLikeObject) {
    return stripped;
  }
  // Otherwise try to locate the JSON part inside surrounding text
  const embedded = /\{[\s\S]*\}/.exec(stripped);
  return embedded === null ? stripped : embedded[0];
}
}

## 七、语义去重引擎 src/dedup/semanticDeduplicator.ts
import { OpenAIEmbeddings } from "@langchain/openai";
import { KnowledgeUnit } from "../types";
import { CONFIG } from "../config";
/**
* 语义去重器
* 使用向量相似度 + 内容重叠度双重去重
*/
export class SemanticDeduplicator {
private embeddings: OpenAIEmbeddings;
private threshold: number;
/**
 * Reads the similarity threshold from CONFIG and creates the embeddings
 * client (API key, embedding model and batch size also come from CONFIG).
 */
constructor() {
  const { apiKey, embeddingModel } = CONFIG.openai;
  const { embeddingBatchSize, dedupThreshold } = CONFIG.batch;
  this.threshold = dedupThreshold;
  this.embeddings = new OpenAIEmbeddings({
    apiKey,
    modelName: embeddingModel,
    batchSize: embeddingBatchSize,
  });
}
/**
 * Runs the full deduplication pass.
 *
 * Units are first grouped by category and deduplicated inside each group
 * (which keeps the number of pairwise comparisons down); the survivors then
 * go through a cross-category pass.
 *
 * @param units All extracted knowledge units.
 * @returns The units that remain after both passes.
 */
async deduplicate(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
  console.log(`\n🔍 开始去重,原始知识单元: ${units.length}`);
  // 1. Deduplicate inside every category, one category at a time
  const grouped = this.groupByCategory(units);
  const perCategory: KnowledgeUnit[] = [];
  for (const category of Object.keys(grouped)) {
    const members = grouped[category];
    console.log(` 📂 处理类别 "${category}": ${members.length} 条`);
    perCategory.push(...(await this.deduplicateGroup(members)));
  }
  // 2. Cross-category pass (covers units that were put in the wrong category)
  const survivors = await this.crossCategoryDedup(perCategory);
  const removedCount = units.length - survivors.length;
  console.log(`✅ 去重完成: ${units.length} → ${survivors.length} (去除了 ${removedCount} 条重复)`);
  return survivors;
}
/**
 * Deduplicates the units of a single category.
 *
 * Every unit is embedded ("title\ncontent") and the vector is stored on the
 * unit. Each not-yet-absorbed unit then acts as an anchor: all later units
 * whose cosine similarity to the anchor exceeds the threshold are absorbed
 * and merged with it.
 *
 * @param units Units of one category; their `embedding` field is written.
 * @returns One unit per group of near-duplicates, in anchor order.
 */
private async deduplicateGroup(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
  if (units.length <= 1) return units;
  // Embed all units in one call and attach each vector to its unit
  const vectors = await this.embeddings.embedDocuments(
    units.map((u) => `${u.title}\n${u.content}`)
  );
  for (let k = 0; k < units.length; k++) {
    units[k].embedding = vectors[k];
  }
  const absorbed = new Array<boolean>(units.length).fill(false);
  const survivors: KnowledgeUnit[] = [];
  units.forEach((anchor, anchorIdx) => {
    if (absorbed[anchorIdx]) return;
    // Collect the anchor plus every later unit that is similar enough to it
    const cluster: KnowledgeUnit[] = [anchor];
    for (let other = anchorIdx + 1; other < units.length; other++) {
      if (absorbed[other]) continue;
      const similarity = this.cosineSimilarity(anchor.embedding!, units[other].embedding!);
      if (similarity > this.threshold) {
        cluster.push(units[other]);
        absorbed[other] = true;
      }
    }
    if (cluster.length === 1) {
      survivors.push(anchor);
      return;
    }
    // Merge the duplicates into a single representative unit
    const merged = this.mergeUnits(cluster);
    console.log(` 🔄 合并 ${cluster.length} 个相似项: "${merged.title.substring(0, 30)}..."`);
    survivors.push(merged);
  });
  return survivors;
}
/**
 * Cross-category deduplication.
 *
 * Candidates are units whose normalized titles (lower-cased, whitespace
 * removed) are identical. A shared title alone is not treated as proof of
 * duplication, because generic titles can legitimately appear in unrelated
 * documents; candidates are merged only when their embeddings are also more
 * similar than the configured threshold. This keeps the pass cheap — only
 * same-title units are compared — without discarding distinct content.
 *
 * @param units Units that survived per-category deduplication. Units in a
 *   same-title group that have no embedding yet are embedded and have their
 *   `embedding` field written.
 * @returns The units with cross-category duplicates merged.
 */
private async crossCategoryDedup(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
  // Bucket by normalized title; insertion order of the map is preserved
  const byTitle = new Map<string, KnowledgeUnit[]>();
  for (const unit of units) {
    const key = unit.title.toLowerCase().replace(/\s+/g, "");
    const bucket = byTitle.get(key);
    if (bucket) {
      bucket.push(unit);
    } else {
      byTitle.set(key, [unit]);
    }
  }

  const result: KnowledgeUnit[] = [];
  for (const group of byTitle.values()) {
    if (group.length === 1) {
      result.push(group[0]);
      continue;
    }
    // Units from single-unit categories were never embedded; embed them now
    const missing = group.filter((u) => u.embedding === undefined);
    if (missing.length > 0) {
      const vectors = await this.embeddings.embedDocuments(
        missing.map((u) => `${u.title}\n${u.content}`)
      );
      missing.forEach((u, i) => {
        u.embedding = vectors[i];
      });
    }
    // Greedy clustering: a unit joins the first cluster whose first member
    // it resembles closely enough, otherwise it starts a new cluster
    const clusters: KnowledgeUnit[][] = [];
    for (const unit of group) {
      const home = clusters.find(
        (cluster) =>
          this.cosineSimilarity(cluster[0].embedding!, unit.embedding!) > this.threshold
      );
      if (home) {
        home.push(unit);
      } else {
        clusters.push([unit]);
      }
    }
    for (const cluster of clusters) {
      result.push(cluster.length === 1 ? cluster[0] : this.mergeUnits(cluster));
    }
  }
  return result;
}
/**
 * Merges several near-duplicate knowledge units into one.
 *
 * The unit with the highest importance (the first one on ties) is the
 * primary: its id, title, source, page range, category and embedding are
 * kept. The content is replaced by the longest content among all units
 * (assumed to be the most complete; the primary's content wins ties), and
 * the keywords of all units are unioned and capped at 8.
 *
 * @param units Units to merge; must contain at least one element.
 * @returns The merged unit.
 * @throws Error if `units` is empty.
 */
private mergeUnits(units: KnowledgeUnit[]): KnowledgeUnit {
  if (units.length === 0) {
    throw new Error("mergeUnits requires at least one knowledge unit");
  }
  // The most important unit is the primary
  const primary = units.reduce((best, current) =>
    current.importance > best.importance ? current : best
  );
  // Take the longest content, starting from the primary's so it wins ties
  const longestContent = units.reduce(
    (longest, current) =>
      current.content.length > longest.length ? current.content : longest,
    primary.content
  );
  // Union of all keywords, in first-seen order
  const allKeywords = new Set<string>();
  for (const unit of units) {
    for (const keyword of unit.keywords) {
      allKeywords.add(keyword);
    }
  }
  return {
    // The spread keeps the primary's id and importance, which is already
    // the maximum of the group
    ...primary,
    content: longestContent,
    keywords: Array.from(allKeywords).slice(0, 8), // at most 8 keywords
  };
}
/**
 * Computes the cosine similarity of two vectors.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns The cosine similarity, or 0 when either vector has zero
 *   magnitude (the similarity is undefined there, and 0 avoids passing NaN
 *   on to callers).
 * @throws RangeError if the vectors differ in length, which would make the
 *   result meaningless.
 */
private cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vector lengths differ (${a.length} vs ${b.length})`
    );
  }
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  return denominator === 0 ? 0 : dotProduct / denominator;
}
/**
 * Groups units by their category.
 *
 * Category names are produced by the model, so they are trimmed before
 * grouping and the dictionary has no prototype: a category such as
 * "constructor" or "toString" therefore cannot collide with inherited
 * Object properties. Units with a missing or blank category go to "未分类".
 *
 * @param units Units to group.
 * @returns A dictionary from category name to its units, in first-seen order.
 */
private groupByCategory(units: KnowledgeUnit[]): Record<string, KnowledgeUnit[]> {
  const groups: Record<string, KnowledgeUnit[]> = Object.create(null);
  for (const unit of units) {
    const cat = (unit.category ?? "").trim() || "未分类";
    const bucket = groups[cat];
    if (bucket) {
      bucket.push(unit);
    } else {
      groups[cat] = [unit];
    }
  }
  return groups;
}
}

## 八、层次聚类与笔记本结构生成 src/clustering/notebookBuilder.ts
import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { StringOutputParser } from "@langchain/core/output_parsers";
import { KnowledgeUnit, Notebook, NoteSection } from "../types";
import { CONFIG } from "../config";
import { v4 as uuidv4 } from "uuid";
/**
* 笔记本构建器
* 将去重后的知识单元组织成层次化的笔记本结构
*/
export class NotebookBuilder {
private model: ChatOpenAI;
/**
 * Creates the chat model used for all structuring prompts (the "smart"
 * model from CONFIG, temperature 0.3).
 */
constructor() {
  const { apiKey, smartModel } = CONFIG.openai;
  this.model = new ChatOpenAI({
    apiKey,
    modelName: smartModel,
    temperature: 0.3,
  });
}
/**
 * Builds the complete notebook from deduplicated knowledge units.
 *
 * Steps: generate top-level categories, assign every unit to a category,
 * then build one (possibly nested) section per non-empty category, one
 * after another.
 *
 * @param notebookName Name stored on the notebook.
 * @param description Description stored on the notebook.
 * @param units Knowledge units to organize.
 * @returns The notebook with its section tree and summary counters.
 */
async buildNotebook(
  notebookName: string,
  description: string,
  units: KnowledgeUnit[]
): Promise<Notebook> {
  console.log(`\n📚 开始构建笔记本结构,共 ${units.length} 条知识`);
  // 1. Top-level categories, then 2. assignment of units to them
  const topCategories = await this.generateTopCategories(units);
  const categorized = await this.categorizeUnits(units, topCategories);
  // 3. One section tree per category
  const rootSections: NoteSection[] = [];
  for (const categoryName of Object.keys(categorized)) {
    const members = categorized[categoryName];
    console.log(` 🗂️ 构建章节: "${categoryName}" (${members.length} 条知识)`);
    rootSections.push(await this.buildSection(categoryName, members, 1));
  }
  // Count distinct source PDFs
  const distinctSources = new Set<string>();
  for (const unit of units) {
    distinctSources.add(unit.sourcePdf);
  }
  return {
    name: notebookName,
    description,
    createdAt: new Date().toISOString(),
    totalSources: distinctSources.size,
    totalKnowledgeUnits: units.length,
    rootSections,
  };
}
/**
 * Asks the model for top-level category names, based on a sample of the
 * 100 most important units.
 *
 * The reply is parsed line by line. Models often number or bullet such
 * lists even when told not to, so list markers ("1.", "2、", "(3)", "- ")
 * are stripped instead of discarding the line; discarding would leave no
 * categories at all for a fully numbered reply and would also lose
 * legitimate names that begin with a digit. Duplicates are removed.
 *
 * @param units All knowledge units. The array is not modified; the sample
 *   is taken from a sorted copy.
 * @returns At least one category name; ["未分类"] when the reply contains
 *   no usable line.
 */
private async generateTopCategories(units: KnowledgeUnit[]): Promise<string[]> {
  // Sample to stay within the token limit; sort a copy so the caller's
  // array keeps its order
  const sample = [...units]
    .sort((a, b) => b.importance - a.importance)
    .slice(0, 100);
  const summaries = sample
    .map((u) => `- [重要性${u.importance}] ${u.title}: ${u.content.substring(0, 100)}...`)
    .join("\n");
  const prompt = ChatPromptTemplate.fromTemplate(`
基于以下知识单元列表,生成5-10个顶层分类目录。
分类应该:
1. 覆盖所有重要主题
2. 分类之间互斥性高
3. 使用专业、简洁的中文名称
4. 适合作为技术/学习笔记的顶层结构
知识样本:
{summaries}
请只返回分类名称列表,每行一个,不要有序号:
`);
  const chain = prompt.pipe(this.model).pipe(new StringOutputParser());
  const result = await chain.invoke({ summaries });
  // Leading list markers: "- ", "1.", "2、", "3)", "(4)" and full-width forms.
  // The lookahead keeps decimals such as "1.5" intact.
  const listMarker = /^(?:[-*•·]\s+|\d+\s*[.、)):](?!\d)\s*|[((]\d+[))]\s*)/;
  const names = result
    .split("\n")
    .map((line) => line.trim().replace(listMarker, "").trim())
    .filter((line) => line.length > 0);
  const unique = [...new Set(names)];
  return unique.length > 0 ? unique : ["未分类"];
}
/**
 * Assigns every knowledge unit to exactly one of the given categories.
 *
 * Units are sent to the model in batches of 20. The model's reply is not
 * trusted: an assignment is honored only when its index is a valid,
 * not-yet-used position in the batch, and an unknown category name falls
 * back to the first category. Any unit the model did not place — including
 * the whole batch when the request or the JSON parsing fails — goes to the
 * fallback category, so no unit is lost or placed twice.
 *
 * @param units Units to distribute.
 * @param categories Candidate category names; the first one is the
 *   fallback. When empty, "未分类" is used as the only category.
 * @returns A dictionary from category name to its units, with empty
 *   categories removed and the order of `categories` preserved.
 */
private async categorizeUnits(
  units: KnowledgeUnit[],
  categories: string[]
): Promise<Record<string, KnowledgeUnit[]>> {
  const fallbackCategory = categories[0] ?? "未分类";
  // A Map avoids collisions between category names and Object.prototype keys
  const buckets = new Map<string, KnowledgeUnit[]>();
  for (const category of categories) {
    buckets.set(category, []);
  }
  const bucketFor = (name: string): KnowledgeUnit[] => {
    let bucket = buckets.get(name);
    if (!bucket) {
      bucket = [];
      buckets.set(name, bucket);
    }
    return bucket;
  };

  // The prompt and chain do not depend on the batch; build them once
  const prompt = ChatPromptTemplate.fromTemplate(`
将以下知识单元分配到最合适的分类中。
可用分类:
{categories}
知识单元:
{units}
请返回JSON格式:{{"assignments": [{{"unitIndex": 0, "category": "分类名"}}]}}
`);
  const chain = prompt.pipe(this.model).pipe(new StringOutputParser());

  const batchSize = 20;
  for (let i = 0; i < units.length; i += batchSize) {
    const batch = units.slice(i, i + batchSize);
    const unitTexts = batch
      .map((u, idx) => `${idx}. ${u.title} [关键词: ${u.keywords.join(", ")}]`)
      .join("\n");
    // Batch positions that already received a category
    const placed = new Set<number>();
    try {
      const result = await chain.invoke({
        categories: categories.join("\n"),
        units: unitTexts,
      });
      const cleaned = result.replace(/```json\s*/g, "").replace(/```/g, "").trim();
      const parsed: unknown = JSON.parse(cleaned);
      const assignments = (parsed as { assignments?: unknown }).assignments;
      if (Array.isArray(assignments)) {
        for (const entry of assignments) {
          const { unitIndex, category } = (entry ?? {}) as {
            unitIndex?: unknown;
            category?: unknown;
          };
          const validIndex =
            typeof unitIndex === "number" &&
            Number.isInteger(unitIndex) &&
            unitIndex >= 0 &&
            unitIndex < batch.length &&
            !placed.has(unitIndex);
          if (!validIndex) continue;
          const requested = typeof category === "string" ? category.trim() : "";
          const target = buckets.has(requested) ? requested : fallbackCategory;
          bucketFor(target).push(batch[unitIndex as number]);
          placed.add(unitIndex as number);
        }
      }
    } catch (error) {
      // Keep going: the units of this batch are placed by the fallback below
      console.warn(
        ` ⚠️ 分类失败,未分配的知识归入 "${fallbackCategory}":`,
        error instanceof Error ? error.message : error
      );
    }
    // Everything the model skipped goes to the fallback category
    batch.forEach((unit, idx) => {
      if (!placed.has(idx)) {
        bucketFor(fallbackCategory).push(unit);
      }
    });
  }
  // Drop empty categories
  return Object.fromEntries(
    [...buckets].filter(([, members]) => members.length > 0)
  );
}
/**
 * Recursively builds one section of the notebook.
 *
 * The section always lists all of its units. When it holds more than 8
 * units and the depth limit is not reached, sub-categories are generated
 * and, if there are at least two, each non-empty one becomes a child
 * section.
 *
 * @param title Section title.
 * @param units Units belonging to this section.
 * @param level Depth of this section (1 for top-level sections).
 * @param maxDepth Depth at which subdivision stops.
 * @returns The section including any child sections.
 */
private async buildSection(
  title: string,
  units: KnowledgeUnit[],
  level: number,
  maxDepth: number = 3
): Promise<NoteSection> {
  const id = uuidv4();
  // The summary is generated before the ids are collected, so the id order
  // reflects any reordering generateSummary applies to `units`
  const summary = await this.generateSummary(units);
  const children: NoteSection[] = [];
  const section: NoteSection = {
    id,
    title,
    level,
    summary,
    knowledgeIds: units.map((u) => u.id),
    children,
  };
  // Small sections and sections at the depth limit are not subdivided
  if (units.length <= 8 || level >= maxDepth) {
    return section;
  }
  const subCategories = await this.generateSubCategories(units, title);
  if (subCategories.length <= 1) {
    return section;
  }
  const grouped = await this.categorizeUnits(units, subCategories);
  for (const subName of Object.keys(grouped)) {
    const subUnits = grouped[subName];
    if (subUnits.length === 0) continue;
    children.push(await this.buildSection(subName, subUnits, level + 1, maxDepth));
  }
  return section;
}
/**
 * Asks the model for a short summary of a group of units, based on the
 * titles of the five most important ones.
 *
 * @param units Units of the section. The array is sorted in place by
 *   descending importance, so the caller sees the new order.
 * @returns The model's reply with surrounding whitespace removed.
 */
private async generateSummary(units: KnowledgeUnit[]): Promise<string> {
  // Array.prototype.sort works in place: this reorders the caller's array
  units.sort((a, b) => b.importance - a.importance);
  const content = units
    .slice(0, 5)
    .map((u) => `- ${u.title}`)
    .join("\n");
  const template = `
为以下知识单元组生成一段50字以内的摘要:
{content}
摘要:
`;
  const reply = await ChatPromptTemplate.fromTemplate(template)
    .pipe(this.model)
    .pipe(new StringOutputParser())
    .invoke({ content });
  return reply.trim();
}
/**
 * Asks the model for sub-category names under a parent topic, based on the
 * titles of the first 30 units.
 *
 * List markers ("1.", "2、", "(3)", "- ") are stripped from the reply lines
 * rather than causing the line to be dropped: a numbered reply would
 * otherwise yield no sub-categories, and names that legitimately begin
 * with a digit would be lost. Duplicates are removed.
 *
 * @param units Units of the parent section.
 * @param parentTitle Title of the parent section.
 * @returns The sub-category names; may be empty.
 */
private async generateSubCategories(
  units: KnowledgeUnit[],
  parentTitle: string
): Promise<string[]> {
  const sample = units.slice(0, 30);
  const content = sample.map((u) => `- ${u.title}`).join("\n");
  const prompt = ChatPromptTemplate.fromTemplate(`
"{parentTitle}" 主题下有以下知识单元,请生成3-6个子分类:
{content}
子分类名称(每行一个,不要有序号):
`);
  const chain = prompt.pipe(this.model).pipe(new StringOutputParser());
  const result = await chain.invoke({ parentTitle, content });
  // Leading list markers: "- ", "1.", "2、", "3)", "(4)" and full-width forms.
  // The lookahead keeps decimals such as "1.5" intact.
  const listMarker = /^(?:[-*•·]\s+|\d+\s*[.、)):](?!\d)\s*|[((]\d+[))]\s*)/;
  const names = result
    .split("\n")
    .map((l) => l.trim().replace(listMarker, "").trim())
    .filter((l) => l.length > 0);
  return [...new Set(names)];
}
}

## 九、Markdown 生成器 src/generation/mdGenerator.ts
import * as fs from "fs/promises";
import * as path from "path";
import { Notebook, NoteSection, KnowledgeUnit } from "../types";
import { CONFIG } from "../config";
/**
* Markdown 笔记本生成器
* 将笔记本结构输出为文件系统
*/
export class MarkdownGenerator {
private outputDir: string;
private knowledgeMap: Map<string, KnowledgeUnit>;
/**
 * Starts with an empty id-to-unit lookup and takes the output base
 * directory from CONFIG.
 */
constructor() {
  this.knowledgeMap = new Map();
  this.outputDir = CONFIG.output.baseDir;
}
/**
 * Writes the whole notebook to disk: a root directory named after the
 * notebook, a README, and one directory tree per top-level section.
 *
 * @param notebook Notebook structure to render.
 * @param allUnits Knowledge units referenced by the notebook's sections;
 *   they are added to the id lookup used while rendering.
 */
async generate(notebook: Notebook, allUnits: KnowledgeUnit[]): Promise<void> {
  // Index the units by id so sections can resolve their knowledgeIds
  for (const unit of allUnits) {
    this.knowledgeMap.set(unit.id, unit);
  }
  // Notebook root directory
  const rootName = this.sanitizeFileName(notebook.name);
  const notebookDir = path.join(this.outputDir, rootName);
  await fs.mkdir(notebookDir, { recursive: true });
  // README first, then every top-level section in order
  await this.generateReadme(notebookDir, notebook);
  const { rootSections } = notebook;
  for (let s = 0; s < rootSections.length; s++) {
    await this.generateSection(notebookDir, rootSections[s], 1);
  }
  console.log(`\n✅ 笔记本生成完成: ${notebookDir}`);
  console.log(` 共 ${notebook.totalKnowledgeUnits} 条知识,${rootSections.length} 个顶层章节`);
}
/**
 * Writes one section (and, recursively, its children) below `parentDir`.
 *
 * Creates the section directory, an `_index.md`, and the section's units
 * sorted by descending importance in files of at most 10 units each:
 * a single file is named `content.md`, several are `content_01.md`,
 * `content_02.md`, and so on.
 *
 * @param parentDir Directory in which the section directory is created.
 * @param section Section to render.
 * @param level Depth of the section; passed on (incremented) to children.
 */
private async generateSection(
  parentDir: string,
  section: NoteSection,
  level: number
): Promise<void> {
  const sectionDir = path.join(parentDir, this.sanitizeFileName(section.title));
  await fs.mkdir(sectionDir, { recursive: true });
  // Section index file
  await fs.writeFile(
    path.join(sectionDir, "_index.md"),
    this.buildSectionIndex(section),
    "utf-8"
  );
  // Resolve the section's units (unknown ids are skipped), most important first
  const ranked: KnowledgeUnit[] = [];
  for (const id of section.knowledgeIds) {
    const unit = this.knowledgeMap.get(id);
    if (unit !== undefined) {
      ranked.push(unit);
    }
  }
  ranked.sort((a, b) => b.importance - a.importance);
  // Write the units in files of at most 10 to keep each file small
  const pages = this.chunkArray(ranked, 10);
  const singleFile = pages.length === 1;
  for (let p = 0; p < pages.length; p++) {
    const body = pages[p].map((u) => this.buildKnowledgeDoc(u)).join("\n\n---\n\n");
    const fileName = singleFile
      ? "content.md"
      : `content_${String(p + 1).padStart(2, "0")}.md`;
    await fs.writeFile(path.join(sectionDir, fileName), body, "utf-8");
  }
  // Child sections are rendered inside this section's directory
  for (const child of section.children ?? []) {
    await this.generateSection(sectionDir, child, level + 1);
  }
}
/**
 * Builds the Markdown text of a section's `_index.md`: title, summary,
 * links to child sections (if any) and an overview list of the section's
 * units sorted by descending importance.
 *
 * @param section Section to describe.
 * @returns The Markdown text.
 */
private buildSectionIndex(section: NoteSection): string {
  const out: string[] = [
    `# ${section.title}`,
    "",
    `> ${section.summary}`,
    "",
    "## 目录",
    "",
  ];
  const children = section.children ?? [];
  if (children.length > 0) {
    out.push("### 子章节");
    for (const child of children) {
      out.push(`- [${child.title}](./${this.sanitizeFileName(child.title)}/_index.md)`);
    }
    out.push("");
  }
  out.push("### 知识点概览");
  const ranked = section.knowledgeIds
    .map((id) => this.knowledgeMap.get(id))
    .filter((u): u is KnowledgeUnit => u !== undefined)
    .sort((a, b) => b.importance - a.importance);
  for (const unit of ranked) {
    // One star per two importance points, rounded up
    const stars = "⭐".repeat(Math.ceil(unit.importance / 2));
    out.push(`- ${stars} **${unit.title}** (${unit.category})`);
  }
  return out.join("\n");
}
/**
 * Renders a single knowledge unit as a Markdown fragment: heading, metadata
 * lines (importance, category, keywords, source) and the content.
 *
 * @param unit Unit to render.
 * @returns The Markdown text, ending with a newline.
 */
private buildKnowledgeDoc(unit: KnowledgeUnit): string {
  // One star per two importance points, rounded up
  const stars = "⭐".repeat(Math.ceil(unit.importance / 2));
  const keywordList = unit.keywords.map((k) => `\`${k}\``).join(", ");
  const header = [
    `## ${unit.title}`,
    "",
    `**重要性**: ${stars} (${unit.importance}/10)`,
    `**分类**: ${unit.category}`,
    `**关键词**: ${keywordList}`,
    `**来源**: ${unit.sourcePdf} (第${unit.pageRange}页)`,
    "",
  ];
  const body = ["### 内容", "", unit.content, ""];
  return header.concat(body).join("\n");
}
/**
 * Writes the notebook's `README.md`: name, description, overview counters
 * and a nested, linked outline of all sections.
 *
 * Section directories are nested on disk (a child lives inside its parent's
 * directory), so each outline link carries the full chain of parent
 * directories. Nested entries are indented by two spaces per level, which
 * Markdown needs to render them as sub-lists under a "- " item.
 *
 * @param dir Notebook root directory in which the README is written.
 * @param notebook Notebook to describe.
 */
private async generateReadme(dir: string, notebook: Notebook): Promise<void> {
  const lines = [
    `# ${notebook.name}`,
    "",
    notebook.description,
    "",
    "## 概览",
    "",
    `- **创建时间**: ${notebook.createdAt}`,
    `- **来源文档数**: ${notebook.totalSources}`,
    `- **知识单元总数**: ${notebook.totalKnowledgeUnits}`,
    `- **顶层章节数**: ${notebook.rootSections.length}`,
    "",
    "## 章节结构",
    "",
  ];
  // parentLink is the relative path of the directory containing `sections`
  const appendTree = (sections: NoteSection[], depth: number, parentLink: string): void => {
    for (const section of sections) {
      const link = `${parentLink}/${this.sanitizeFileName(section.title)}`;
      lines.push(`${"  ".repeat(depth)}- [${section.title}](${link}/_index.md)`);
      if (section.children) {
        appendTree(section.children, depth + 1, link);
      }
    }
  };
  appendTree(notebook.rootSections, 0, ".");
  lines.push("");
  lines.push("---");
  lines.push("*本笔记本由 AI 自动生成*");
  await fs.writeFile(path.join(dir, "README.md"), lines.join("\n"), "utf-8");
}
/**
 * Turns an arbitrary title into a safe single path segment.
 *
 * Characters that are invalid in file names (`< > : " / \ | ? *` and
 * control characters) and runs of whitespace become "_", and the result is
 * cut to 100 characters. A result that is empty or consists only of dots
 * (such as "." or "..") is replaced by "untitled": joined onto a directory,
 * those would resolve to the directory itself or its parent, so files would
 * be written outside the intended section directory.
 *
 * @param name Title to convert.
 * @returns A non-empty name that can be used as one directory or file name.
 */
private sanitizeFileName(name: string): string {
  const cleaned = name
    .replace(/[<>:"/\\|?*\x00-\x1f]/g, "_")
    .replace(/\s+/g, "_")
    .substring(0, 100);
  return /^\.*$/.test(cleaned) ? "untitled" : cleaned;
}
/**
 * Splits an array into consecutive chunks of at most `size` elements.
 *
 * @param arr Array to split; it is not modified.
 * @param size Maximum chunk length; must be at least 1.
 * @returns The chunks in order; an empty array yields no chunks.
 * @throws RangeError if `size` is less than 1 or NaN, which would otherwise
 *   make the loop below never advance.
 */
private chunkArray<T>(arr: T[], size: number): T[][] {
  if (!(size >= 1)) {
    throw new RangeError(`chunkArray: size must be >= 1, got ${size}`);
  }
  const chunks: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    chunks.push(arr.slice(i, i + size));
  }
  return chunks;
}
}

## 十、主控制器 src/index.ts
import { scanPDFDirectory, loadPDFBatch } from "./ingestion/pdfBatchLoader";
import { KnowledgeExtractor } from "./extraction/knowledgeExtractor";
import { SemanticDeduplicator } from "./dedup/semanticDeduplicator";
import { NotebookBuilder } from "./clustering/notebookBuilder";
import { MarkdownGenerator } from "./generation/mdGenerator";
import { KnowledgeUnit } from "./types";
import { CONFIG } from "./config";
/**
 * Batch PDF knowledge-notebook generator — main controller.
 *
 * Command-line arguments (all optional): PDF directory, notebook name,
 * notebook description.
 *
 * Runs the five phases in order: scan, extract, deduplicate, build the
 * notebook structure, and write the Markdown files. During extraction the
 * PDFs of a batch are processed by at most
 * `CONFIG.batch.extractionConcurrency` workers at a time, and the units of
 * every finished batch are appended to a JSON Lines checkpoint file in the
 * output directory, so extracted knowledge survives a later crash.
 */
async function main() {
  const pdfDir = process.argv[2] || "./data/pdfs";
  const notebookName = process.argv[3] || "知识笔记本";
  const description = process.argv[4] || "从PDF文档自动提取整理的知识笔记";
  console.log("╔══════════════════════════════════════════════╗");
  console.log("║ 批量PDF知识笔记本生成器 v1.0 ║");
  console.log("╚══════════════════════════════════════════════╝");
  console.log(`\n📂 PDF目录: ${pdfDir}`);
  console.log(`📓 笔记本名称: ${notebookName}\n`);
  const startTime = Date.now();

  // ========== Phase 1: scan ==========
  const pdfMetas = await scanPDFDirectory(pdfDir);
  if (pdfMetas.length === 0) {
    console.log("❌ 未找到PDF文件");
    return;
  }

  // ========== Phase 2: knowledge extraction (batched) ==========
  // Imported locally: this module has no top-level fs/path import
  const fs = await import("fs/promises");
  const path = await import("path");
  const checkpointPath = path.join(CONFIG.output.baseDir, "knowledge.checkpoint.jsonl");
  await fs.mkdir(CONFIG.output.baseDir, { recursive: true });
  // Start from an empty checkpoint so units of an earlier run are not mixed in
  await fs.writeFile(checkpointPath, "", "utf-8");

  const extractor = new KnowledgeExtractor();
  const allKnowledge: KnowledgeUnit[] = [];
  for await (const batch of loadPDFBatch(pdfMetas, CONFIG.batch.pdfBatchSize)) {
    // Worker pool: each worker repeatedly takes the next PDF off the queue,
    // so no more than `extractionConcurrency` extractions run at once
    const pending = [...batch];
    const batchKnowledge: KnowledgeUnit[] = [];
    const workerCount = Math.min(CONFIG.batch.extractionConcurrency, pending.length);
    const workers = Array.from({ length: workerCount }, async () => {
      for (let job = pending.shift(); job !== undefined; job = pending.shift()) {
        const knowledge = await extractor.extractFromDocuments(job.meta.fileName, job.documents);
        for (const unit of knowledge) {
          batchKnowledge.push(unit);
        }
      }
    });
    await Promise.all(workers);
    for (const unit of batchKnowledge) {
      allKnowledge.push(unit);
    }
    // Checkpoint: one JSON object per line, appended once per batch
    if (batchKnowledge.length > 0) {
      const serialized = batchKnowledge.map((unit) => JSON.stringify(unit)).join("\n");
      await fs.appendFile(checkpointPath, `${serialized}\n`, "utf-8");
    }
    console.log(`💾 累计提取: ${allKnowledge.length} 条知识`);
  }
  if (allKnowledge.length === 0) {
    console.log("❌ 未提取到任何知识");
    return;
  }

  // ========== Phase 3: semantic deduplication ==========
  const deduplicator = new SemanticDeduplicator();
  const uniqueKnowledge = await deduplicator.deduplicate(allKnowledge);

  // ========== Phase 4: build the notebook structure ==========
  const builder = new NotebookBuilder();
  const notebook = await builder.buildNotebook(notebookName, description, uniqueKnowledge);

  // ========== Phase 5: write the Markdown files ==========
  const generator = new MarkdownGenerator();
  await generator.generate(notebook, uniqueKnowledge);

  // ========== Final statistics ==========
  const duration = ((Date.now() - startTime) / 1000 / 60).toFixed(2);
  console.log(`\n╔══════════════════════════════════════════════╗`);
  console.log(`║ ✅ 全部完成!耗时: ${duration} 分钟`);
  console.log(`║ 📊 处理PDF: ${pdfMetas.length} 个`);
  console.log(`║ 📝 原始知识: ${allKnowledge.length} 条`);
  console.log(`║ 🎯 去重后: ${uniqueKnowledge.length} 条`);
  console.log(`║ 📚 笔记本章节: ${notebook.rootSections.length} 个顶层章节`);
  console.log(`╚══════════════════════════════════════════════╝`);
}
main().catch((error) => {
console.error("❌ 程序异常:", error);
process.exit(1);
});

## 十一、运行方式
# Step 1: create the input directory, then copy your 10,000+ PDF files
# into it (sub-directories are supported)
mkdir -p ./data/pdfs
# Step 2: store the API key in .env
printf '%s\n' "OPENAI_API_KEY=sk-your-key" > .env
# Step 3: run the generator (arguments: PDF directory, notebook name, description)
npx ts-node src/index.ts \
  ./data/pdfs \
  "机器学习知识库" \
  "从论文和教材中提取的ML核心知识"
# Example of the resulting output directory:
# ./output/notebook/机器学习知识库/
# ├── README.md
# ├── 监督学习/
# │ ├── _index.md
# │ ├── content.md
# │ └── 线性模型/
# │ ├── _index.md
# │ └── content.md
# ├── 无监督学习/
# │ └── ...
# └── ...

## 十二、关键优化点总结
| 问题 | 解决方案 |
|---|---|
| 万级PDF内存爆炸 | 批次加载(50个/批),流式处理,不一次性加载全部 |
| API 速率限制 | 并发控制(10并发),指数退避重试 |
| 去重精度 | 类别内按语义嵌入的余弦相似度去重(阈值 0.92),再按规范化标题做跨类别去重 |
| 不漏知识点 | 低重要性阈值过滤(≥4分),多页分组提取 |
| 分类质量 | 先粗分再细分,采样+模型双重分类 |
| 文件过大 | 每md文件最多10条知识,自动拆分 |
| 进程崩溃 | 阶段性日志输出,可基于日志恢复 |
如需进一步优化(如使用本地模型降低成本、接入 Chroma 持久化、并行化去重等),可以继续扩展!