glownight

返回

处理 1 万+ PDF 的核心难点在于:内存管理、去重精度、知识抽取质量、嵌套分类的合理性。下面提供一个完整的生产级架构方案,包含代码实现。


一、整体架构设计#

┌─────────────────────────────────────────────────────────────┐
│                    批量 PDF 知识笔记本生成器                     │
├─────────────────────────────────────────────────────────────┤
│  Phase 1:  ingestion     →  批量加载 + 元数据提取              │
│  Phase 2:  extraction    →  结构化知识抽取(每PDF独立)         │
│  Phase 3:  deduplication →  语义去重 + 质量评分               │
│  Phase 4:  clustering    →  层次聚类生成笔记本结构              │
│  Phase 5:  generation    →  生成 Markdown 文件                │
└─────────────────────────────────────────────────────────────┘
plaintext

二、项目初始化#

mkdir pdf-knowledge-notebook
cd pdf-knowledge-notebook
npm init -y

# Core dependencies.
# @langchain/core is listed explicitly because the source files import from it directly.
npm install langchain @langchain/core @langchain/openai @langchain/community
npm install pdf-parse

# Data handling and configuration.
# dotenv is required by src/config/index.ts to load OPENAI_API_KEY from .env.
npm install zod uuid dotenv
npm install -D typescript ts-node @types/node @types/uuid

# Vector database client (persistent storage for very large collections).
# NOTE: the code in this article does not import chromadb yet; it is installed for later extension.
npm install chromadb

npx tsc --init
bash

tsconfig.json

{
  "compilerOptions": {
    "target": "ES2022",
    "module": "commonjs",
    "lib": ["ES2022"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "resolveJsonModule": true,
    "moduleResolution": "node"
  },
  "include": ["src/**/*"]
}
json

三、核心类型定义 src/types/index.ts#

import { z } from "zod";

// ==================== 知识单元 ====================

/**
 * Runtime schema for a single knowledge unit extracted from a PDF.
 * The static `KnowledgeUnit` type below is inferred from this schema.
 */
export const KnowledgeUnitSchema = z.object({
  id: z.string(),                    // Unique identifier
  sourcePdf: z.string(),             // Path of the PDF the unit came from
  pageRange: z.string(),             // Page range the unit was extracted from
  title: z.string(),                 // Title of the knowledge point
  content: z.string(),               // Detailed content
  keywords: z.array(z.string()),     // Keyword tags
  importance: z.number().min(1).max(10), // Importance score, 1 to 10 inclusive
  category: z.string(),              // Coarse category (used for a first-pass grouping)
  embedding: z.number().array().optional(), // Embedding vector (used for de-duplication)
});

/** Static type of a knowledge unit, derived from `KnowledgeUnitSchema`. */
export type KnowledgeUnit = z.infer<typeof KnowledgeUnitSchema>;

// ==================== 笔记本结构 ====================

/**
 * One node of the notebook outline.
 *
 * The type is written by hand instead of being inferred with `z.infer`,
 * because the schema below refers to itself: TypeScript cannot infer the type
 * of a constant that is referenced inside its own initializer (error TS7022
 * under `strict`), so a recursive Zod schema needs an explicit annotation.
 */
export interface NoteSection {
  id: string;
  title: string;
  /** Nesting depth: 1, 2, 3, ... */
  level: number;
  /** Short summary of the section. */
  summary: string;
  /** Nested sub-sections. */
  children?: NoteSection[];
  /** IDs of the knowledge units that belong to this section. */
  knowledgeIds: string[];
}

/** Runtime schema for `NoteSection`; `z.lazy` defers the self-reference. */
export const NoteSectionSchema: z.ZodType<NoteSection> = z.object({
  id: z.string(),
  title: z.string(),
  level: z.number(),
  summary: z.string(),
  children: z.array(z.lazy(() => NoteSectionSchema)).optional(),
  knowledgeIds: z.array(z.string()),
});

/**
 * Runtime schema for a complete notebook: descriptive metadata, counters and
 * the list of top-level sections.
 */
export const NotebookSchema = z.object({
  name: z.string(),
  description: z.string(),
  createdAt: z.string(),
  totalSources: z.number(),
  totalKnowledgeUnits: z.number(),
  rootSections: z.array(NoteSectionSchema),
});

/** Static type of a notebook, derived from `NotebookSchema`. */
export type Notebook = z.infer<typeof NotebookSchema>;
typescript

四、配置与环境 src/config/index.ts#

import { config } from "dotenv";
config();

/**
 * Central, read-only configuration for the whole pipeline.
 * `as const` keeps every value as a literal, readonly type.
 */
export const CONFIG = {
  openai: {
    // The `!` only silences the compiler; the check after this object is the
    // actual validation that the key is present.
    apiKey: process.env.OPENAI_API_KEY!,
    model: "gpt-4o-mini",           // Cost-effective model
    smartModel: "gpt-4o",           // Higher-quality model (for key steps)
    embeddingModel: "text-embedding-3-small",
  },
  
  // Batch parameters (control memory use and API rate)
  batch: {
    pdfBatchSize: 50,               // Number of PDFs loaded per batch
    extractionConcurrency: 10,      // Number of concurrent extractions
    embeddingBatchSize: 100,        // Embedding request batch size
    dedupThreshold: 0.92,           // Similarity threshold for de-duplication (0-1)
    minImportance: 4,               // Minimum importance score a unit needs to be kept
  },
  
  // Output settings
  output: {
    baseDir: "./output/notebook",
    maxMdFileSize: 50000,           // Maximum number of characters per Markdown file
  },
} as const;

// Fail fast at module load when the API key is missing or empty.
if (!CONFIG.openai.apiKey) {
  throw new Error("OPENAI_API_KEY is required");
}
typescript

五、PDF 批量加载器 src/ingestion/pdfBatchLoader.ts#

import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { Document } from "@langchain/core/documents";
import * as fs from "fs/promises";
import * as path from "path";

/** File-level metadata for one PDF found on disk. */
export interface PDFMeta {
  /** Full path to the file (built from the resolved scan root). */
  filePath: string;
  /** Base name of the file as returned by the directory listing. */
  fileName: string;
  /** File size in bytes, taken from `fs.stat`. */
  fileSize: number;
  /** Page count; unset after scanning, filled in by `loadPDFBatch` once the file loads. */
  totalPages?: number;
}

/**
 * Recursively walks `dirPath` and collects every regular file whose name ends
 * in ".pdf" (case-insensitive).
 *
 * @param dirPath Root directory to scan; resolved to an absolute path first.
 * @returns Metadata for each PDF, in depth-first directory-listing order.
 */
export async function scanPDFDirectory(dirPath: string): Promise<PDFMeta[]> {
  const found: PDFMeta[] = [];

  const walk = async (dir: string): Promise<void> => {
    const children = await fs.readdir(dir, { withFileTypes: true });

    for (const child of children) {
      const childPath = path.join(dir, child.name);

      if (child.isDirectory()) {
        // Descend before looking at the next sibling.
        await walk(childPath);
        continue;
      }

      const isPdfFile = child.isFile() && child.name.toLowerCase().endsWith(".pdf");
      if (!isPdfFile) {
        continue;
      }

      const { size } = await fs.stat(childPath);
      found.push({ filePath: childPath, fileName: child.name, fileSize: size });
    }
  };

  await walk(path.resolve(dirPath));
  console.log(`📁 扫描完成,发现 ${found.length} 个PDF文件`);
  return found;
}

/**
 * Lazily loads PDFs in fixed-size batches so that only one batch of parsed
 * documents is produced at a time.
 *
 * Every file in a batch is loaded concurrently, one document per page. A file
 * that fails to load is logged and left out of the yielded batch; it does not
 * abort the run. On success `meta.totalPages` is set on the passed-in object.
 *
 * @param pdfMetas  Files to load, typically the result of `scanPDFDirectory`.
 * @param batchSize Number of files per batch; must be a positive integer.
 * @yields The successfully loaded files of each batch with their page documents.
 * @throws RangeError on the first iteration when `batchSize` is not a positive
 *         integer (0 or a negative value would otherwise never advance the loop,
 *         and NaN would silently skip every file).
 */
export async function* loadPDFBatch(
  pdfMetas: PDFMeta[],
  batchSize: number
): AsyncGenerator<{ meta: PDFMeta; documents: Document[] }[]> {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, received ${batchSize}`);
  }

  const totalBatches = Math.ceil(pdfMetas.length / batchSize);

  for (let i = 0; i < pdfMetas.length; i += batchSize) {
    const batch = pdfMetas.slice(i, i + batchSize);
    console.log(`\n📦 加载批次 ${Math.floor(i / batchSize) + 1}/${totalBatches}`);

    const results = await Promise.all(
      batch.map(async (meta) => {
        try {
          const loader = new PDFLoader(meta.filePath, { splitPages: true });
          const docs = await loader.load();

          // Attach provenance so later stages can trace a page back to its file.
          docs.forEach((doc) => {
            doc.metadata.source = meta.filePath;
            doc.metadata.fileName = meta.fileName;
          });

          meta.totalPages = docs.length;
          console.log(`  ✅ ${meta.fileName} (${docs.length}页)`);
          return { meta, documents: docs };
        } catch (error) {
          // Anything can be thrown, so narrow before reading `.message`.
          const reason = error instanceof Error ? error.message : String(error);
          console.error(`  ❌ ${meta.fileName} 加载失败:`, reason);
          return { meta, documents: [] };
        }
      })
    );

    // Files that failed (or produced no pages) are dropped from the batch.
    yield results.filter((r) => r.documents.length > 0);
  }
}
typescript

六、结构化知识抽取 src/extraction/knowledgeExtractor.ts#

import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { Document } from "@langchain/core/documents";
import { StringOutputParser } from "@langchain/core/output_parsers";
import { z } from "zod";
import { KnowledgeUnit } from "../types";
import { CONFIG } from "../config";
import { v4 as uuidv4 } from "uuid";

// Schema the model's JSON answer must satisfy. The `describe` texts are
// attached to the schema as field descriptions.
const ExtractionSchema = z.object({
  knowledgeUnits: z.array(
    z.object({
      title: z.string().describe("知识点的简洁标题"),
      content: z.string().describe("详细内容,保留关键信息"),
      keywords: z.array(z.string()).describe("3-5个关键词"),
      importance: z.number().min(1).max(10).describe("重要性1-10"),
      category: z.string().describe("粗分类,如:技术/概念/流程/案例/数据"),
    })
  ),
});

// Static type of a validated extraction answer.
type ExtractionResult = z.infer<typeof ExtractionSchema>;

/**
 * Turns the page documents of one PDF into structured knowledge units by
 * prompting a chat model and validating its JSON answer against
 * `ExtractionSchema`.
 */
export class KnowledgeExtractor {
  private model: ChatOpenAI;
  private parser: StringOutputParser;

  constructor() {
    this.model = new ChatOpenAI({
      apiKey: CONFIG.openai.apiKey,
      modelName: CONFIG.openai.smartModel,
      temperature: 0.2,
      maxRetries: 3,
    });
    this.parser = new StringOutputParser();
  }

  /**
   * Extracts knowledge units from all pages of one PDF.
   *
   * Pages are processed in groups of three to keep each prompt small. A group
   * whose extraction fails is logged and skipped; the remaining groups are
   * still processed. Units scoring below `CONFIG.batch.minImportance` are
   * discarded.
   *
   * @param fileName  Name recorded as `sourcePdf` on every unit and used in logs.
   * @param documents Page documents of the PDF, in page order.
   * @returns The retained knowledge units.
   */
  async extractFromDocuments(
    fileName: string,
    documents: Document[]
  ): Promise<KnowledgeUnit[]> {
    const groupSize = 3;
    const pageGroups = this.groupPages(documents, groupSize);
    const allKnowledge: KnowledgeUnit[] = [];

    for (let i = 0; i < pageGroups.length; i++) {
      const group = pageGroups[i];

      // Prefer the loader's page numbers; otherwise derive them from the group
      // position. The end of the range uses the real group length so the last,
      // shorter group does not report pages that do not exist.
      const firstPage = group[0].metadata.loc?.pageNumber || i * groupSize + 1;
      const lastPage =
        group[group.length - 1].metadata.loc?.pageNumber || i * groupSize + group.length;
      const pageRange = `${firstPage}-${lastPage}`;

      const text = group.map((d) => d.pageContent).join("\n\n");

      // Pages without any text (e.g. scanned images) cannot yield knowledge;
      // skip them instead of paying for a model call.
      if (text.trim().length === 0) {
        continue;
      }

      try {
        const extracted = await this.extractFromText(text, fileName, pageRange);
        allKnowledge.push(...extracted);
      } catch (error) {
        console.error(`  ⚠️ 提取失败 (${fileName} 第${pageRange}页):`, error);
      }
    }

    // Drop low-value units.
    const filtered = allKnowledge.filter((k) => k.importance >= CONFIG.batch.minImportance);
    console.log(`  📝 提取 ${allKnowledge.length} 条知识,保留 ${filtered.length} 条(重要性≥${CONFIG.batch.minImportance})`);

    return filtered;
  }

  /**
   * Runs the extraction prompt on one chunk of text.
   *
   * Only the first 12000 characters of `text` are sent to the model; anything
   * beyond that is not seen by it.
   *
   * @throws When the answer is not valid JSON or does not match `ExtractionSchema`.
   */
  private async extractFromText(
    text: string,
    sourcePdf: string,
    pageRange: string
  ): Promise<KnowledgeUnit[]> {
    const prompt = ChatPromptTemplate.fromTemplate(`
你是一个专业的知识提取专家。请从以下PDF文档内容中提取有价值的知识点。

提取要求:
1. 每个知识点必须是独立、完整的信息单元
2. 标题要简洁明确(不超过20字)
3. 内容要详细完整,保留关键数据、定义、步骤
4. 重要性评分标准:
   - 10分:核心概念、关键定理、重要数据
   - 7-9分:重要方法、显著案例、关键流程
   - 4-6分:补充说明、背景信息、次要细节
   - 1-3分:冗余信息、重复内容(不要提取)
5. 如果内容主要是目录、页眉页脚、版权信息,返回空数组
6. 去重:如果多个知识点内容高度相似,只保留最完整的一个

文档来源:{sourcePdf}
页码范围:{pageRange}

文档内容:
{text}

请严格按以下JSON格式返回(不要有任何其他文字):
{{
  "knowledgeUnits": [
    {{
      "title": "知识点标题",
      "content": "详细内容...",
      "keywords": ["关键词1", "关键词2", "关键词3"],
      "importance": 8,
      "category": "技术"
    }}
  ]
}}
`);

    const chain = prompt.pipe(this.model).pipe(this.parser);
    const result = await chain.invoke({
      text: text.slice(0, 12000), // Cap the input length
      sourcePdf,
      pageRange,
    });

    // The model may wrap its JSON in Markdown fences or extra prose.
    const cleaned = this.cleanJSON(result);
    const parsed = ExtractionSchema.parse(JSON.parse(cleaned));

    return parsed.knowledgeUnits.map((unit) => ({
      id: uuidv4(),
      sourcePdf,
      pageRange,
      title: unit.title,
      content: unit.content,
      keywords: unit.keywords,
      importance: unit.importance,
      category: unit.category,
    }));
  }

  /**
   * Splits `docs` into consecutive groups of at most `groupSize` pages.
   */
  private groupPages(docs: Document[], groupSize: number): Document[][] {
    const groups: Document[][] = [];
    for (let i = 0; i < docs.length; i += groupSize) {
      groups.push(docs.slice(i, i + groupSize));
    }
    return groups;
  }

  /**
   * Strips Markdown code fences from a model answer and, if the remainder is
   * not already a bare object, returns the span from the first "{" to the last
   * "}". Falls back to the trimmed text when no braces are found.
   */
  private cleanJSON(raw: string): string {
    let cleaned = raw.replace(/```json\s*/g, "").replace(/```\s*$/g, "");
    cleaned = cleaned.trim();
    if (cleaned.startsWith("{") && cleaned.endsWith("}")) {
      return cleaned;
    }
    const match = cleaned.match(/\{[\s\S]*\}/);
    return match ? match[0] : cleaned;
  }
}
typescript

七、语义去重引擎 src/dedup/semanticDeduplicator.ts#

import { OpenAIEmbeddings } from "@langchain/openai";
import { KnowledgeUnit } from "../types";
import { CONFIG } from "../config";

/**
 * Removes near-duplicate knowledge units using embedding cosine similarity.
 *
 * Units are first compared within their own category, then units from any
 * category that share a normalized title are compared once more. Two units
 * are treated as duplicates only when their similarity exceeds
 * `CONFIG.batch.dedupThreshold`.
 */
export class SemanticDeduplicator {
  private embeddings: OpenAIEmbeddings;
  private threshold: number;

  constructor() {
    this.embeddings = new OpenAIEmbeddings({
      apiKey: CONFIG.openai.apiKey,
      modelName: CONFIG.openai.embeddingModel,
      batchSize: CONFIG.batch.embeddingBatchSize,
    });
    this.threshold = CONFIG.batch.dedupThreshold;
  }

  /**
   * De-duplicates `units`.
   *
   * Grouping by category first keeps the pairwise comparison (quadratic in the
   * group size) smaller than comparing everything against everything.
   *
   * @param units Units to de-duplicate. Each unit in a group of two or more
   *              gets its `embedding` field set as a side effect.
   * @returns The surviving units; merged duplicates are represented by one unit.
   */
  async deduplicate(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
    console.log(`\n🔍 开始去重,原始知识单元: ${units.length}`);

    // 1. De-duplicate inside each category.
    const byCategory = this.groupByCategory(units);
    const deduped: KnowledgeUnit[] = [];

    for (const [category, categoryUnits] of byCategory) {
      console.log(`  📂 处理类别 "${category}": ${categoryUnits.length} 条`);

      const categoryDeduped = await this.deduplicateGroup(categoryUnits);
      deduped.push(...categoryDeduped);
    }

    // 2. Catch duplicates that ended up in different categories.
    const finalDeduped = await this.crossCategoryDedup(deduped);

    console.log(`✅ 去重完成: ${units.length} → ${finalDeduped.length} (去除了 ${units.length - finalDeduped.length} 条重复)`);

    return finalDeduped;
  }

  /**
   * Greedy similarity de-duplication of one group: each not-yet-removed unit
   * absorbs every later unit whose similarity to it exceeds the threshold.
   */
  private async deduplicateGroup(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
    if (units.length <= 1) return units;

    // Embed title and content together.
    const texts = units.map((u) => `${u.title}\n${u.content}`);
    const vectors = await this.embeddings.embedDocuments(texts);

    // Keep the vector on the unit as well (the schema reserves a field for it).
    units.forEach((unit, i) => {
      unit.embedding = vectors[i];
    });

    const kept: KnowledgeUnit[] = [];
    const removed = new Set<number>();

    for (let i = 0; i < units.length; i++) {
      if (removed.has(i)) continue;

      const current = units[i];
      let bestUnit = current;
      const duplicates: KnowledgeUnit[] = [];

      for (let j = i + 1; j < units.length; j++) {
        if (removed.has(j)) continue;

        const similarity = this.cosineSimilarity(vectors[i], vectors[j]);

        if (similarity > this.threshold) {
          duplicates.push(units[j]);
          removed.add(j);
        }
      }

      if (duplicates.length > 0) {
        const allCandidates = [current, ...duplicates];
        bestUnit = this.mergeUnits(allCandidates);
        console.log(`    🔄 合并 ${duplicates.length + 1} 个相似项: "${bestUnit.title.substring(0, 30)}..."`);
      }

      kept.push(bestUnit);
    }

    return kept;
  }

  /**
   * Cross-category pass. An identical normalized title (lower-cased, all
   * whitespace removed) only makes units *candidates*: generic titles are
   * shared by unrelated units, so each candidate group is still checked by
   * embedding similarity before anything is merged.
   */
  private async crossCategoryDedup(units: KnowledgeUnit[]): Promise<KnowledgeUnit[]> {
    const titleMap = new Map<string, KnowledgeUnit[]>();

    units.forEach((unit) => {
      const normalizedTitle = unit.title.toLowerCase().replace(/\s+/g, "");
      const existing = titleMap.get(normalizedTitle) || [];
      existing.push(unit);
      titleMap.set(normalizedTitle, existing);
    });

    const result: KnowledgeUnit[] = [];

    for (const group of titleMap.values()) {
      if (group.length === 1) {
        result.push(group[0]);
      } else {
        // Same title: merge only the members whose content is actually similar.
        const verified = await this.deduplicateGroup(group);
        result.push(...verified);
      }
    }

    return result;
  }

  /**
   * Collapses a set of duplicate units into one.
   *
   * The unit with the highest importance is the base (its id, title, source
   * and page range are kept; ties go to the earliest unit). The content is the
   * longest content of the set and the keywords are the union, capped at 8.
   * The sources of the other units are not recorded, because the unit type has
   * a single `sourcePdf`/`pageRange` pair.
   */
  private mergeUnits(units: KnowledgeUnit[]): KnowledgeUnit {
    const primary = units.reduce((best, current) =>
      current.importance > best.importance ? current : best
    );

    // Longest content is assumed to be the most complete.
    const longestContent = units.reduce((longest, current) =>
      current.content.length > longest.length ? current.content : longest
    , primary.content);

    const allKeywords = new Set<string>();
    units.forEach((u) => u.keywords.forEach((k) => allKeywords.add(k)));

    return {
      ...primary,
      content: longestContent,
      keywords: Array.from(allKeywords).slice(0, 8),
      importance: Math.max(...units.map((u) => u.importance)),
    };
  }

  /**
   * Cosine similarity of two vectors, iterating over the length of `a`.
   * Returns 0 when either vector has zero magnitude, so that a degenerate
   * embedding never produces NaN.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Groups units by category; units with an empty category go to "未分类".
   * A Map is used because category names are model output and could collide
   * with `Object.prototype` members (e.g. "constructor") on a plain object.
   */
  private groupByCategory(units: KnowledgeUnit[]): Map<string, KnowledgeUnit[]> {
    const groups = new Map<string, KnowledgeUnit[]>();

    units.forEach((unit) => {
      const cat = unit.category || "未分类";
      const bucket = groups.get(cat);
      if (bucket) {
        bucket.push(unit);
      } else {
        groups.set(cat, [unit]);
      }
    });

    return groups;
  }
}
typescript

八、层次聚类与笔记本结构生成 src/clustering/notebookBuilder.ts#

import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { StringOutputParser } from "@langchain/core/output_parsers";
import { KnowledgeUnit, Notebook, NoteSection } from "../types";
import { CONFIG } from "../config";
import { v4 as uuidv4 } from "uuid";

/**
 * Organizes de-duplicated knowledge units into a hierarchical notebook:
 * the model proposes category names, assigns units to them, and large
 * categories are subdivided recursively.
 */
export class NotebookBuilder {
  private model: ChatOpenAI;

  constructor() {
    this.model = new ChatOpenAI({
      apiKey: CONFIG.openai.apiKey,
      modelName: CONFIG.openai.smartModel,
      temperature: 0.3,
    });
  }

  /**
   * Builds the full notebook structure.
   *
   * @param notebookName Notebook name stored in the result.
   * @param description  Notebook description stored in the result.
   * @param units        Units to organize; the array is not modified.
   * @returns The notebook with one root section per non-empty top-level category.
   */
  async buildNotebook(
    notebookName: string,
    description: string,
    units: KnowledgeUnit[]
  ): Promise<Notebook> {
    console.log(`\n📚 开始构建笔记本结构,共 ${units.length} 条知识`);

    // 1. Propose top-level categories.
    const topCategories = await this.generateTopCategories(units);

    // 2. Assign every unit to one of them.
    const categorized = await this.categorizeUnits(units, topCategories);

    // 3. Build the section tree of each category.
    const rootSections: NoteSection[] = [];

    for (const [categoryName, categoryUnits] of Object.entries(categorized)) {
      console.log(`  🗂️ 构建章节: "${categoryName}" (${categoryUnits.length} 条知识)`);
      const section = await this.buildSection(categoryName, categoryUnits, 1);
      rootSections.push(section);
    }

    return {
      name: notebookName,
      description,
      createdAt: new Date().toISOString(),
      totalSources: new Set(units.map((u) => u.sourcePdf)).size,
      totalKnowledgeUnits: units.length,
      rootSections,
    };
  }

  /**
   * Asks the model for top-level category names, based on the 100 most
   * important units (a sample keeps the prompt within the token budget).
   */
  private async generateTopCategories(units: KnowledgeUnit[]): Promise<string[]> {
    const sample = this.byImportance(units).slice(0, 100);

    const summaries = sample
      .map((u) => `- [重要性${u.importance}] ${u.title}: ${u.content.substring(0, 100)}...`)
      .join("\n");

    const prompt = ChatPromptTemplate.fromTemplate(`
基于以下知识单元列表,生成5-10个顶层分类目录。
分类应该:
1. 覆盖所有重要主题
2. 分类之间互斥性高
3. 使用专业、简洁的中文名称
4. 适合作为技术/学习笔记的顶层结构

知识样本:
{summaries}

请只返回分类名称列表,每行一个,不要有序号:
`);

    const chain = prompt.pipe(this.model).pipe(new StringOutputParser());
    const result = await chain.invoke({ summaries });

    return this.parseNameList(result);
  }

  /**
   * Assigns each unit to exactly one category, asking the model in batches of 20.
   *
   * Every unit ends up in the result: a unit the model omits, assigns to an
   * unknown category, or that belongs to a batch whose answer cannot be parsed
   * goes to the first category. Out-of-range and repeated indices in the
   * answer are ignored. When `categories` is empty a single "未分类" category
   * is used.
   *
   * @returns Category name to units; categories that received nothing are omitted.
   */
  private async categorizeUnits(
    units: KnowledgeUnit[],
    categories: string[]
  ): Promise<Record<string, KnowledgeUnit[]>> {
    const names = categories.length > 0 ? categories : ["未分类"];
    const fallbackCategory = names[0];

    // Category names are model output, so use a Map rather than a plain
    // object whose inherited members ("constructor", ...) could collide.
    const buckets = new Map<string, KnowledgeUnit[]>();
    names.forEach((c) => buckets.set(c, []));
    const fallbackBucket = buckets.get(fallbackCategory) ?? [];

    const batchSize = 20;

    // The template is identical for every batch; build it once.
    const prompt = ChatPromptTemplate.fromTemplate(`
将以下知识单元分配到最合适的分类中。

可用分类:
{categories}

知识单元:
{units}

请返回JSON格式:{{"assignments": [{{"unitIndex": 0, "category": "分类名"}}]}}
`);
    const chain = prompt.pipe(this.model).pipe(new StringOutputParser());

    for (let i = 0; i < units.length; i += batchSize) {
      const batch = units.slice(i, i + batchSize);

      const unitTexts = batch
        .map((u, idx) => `${idx}. ${u.title} [关键词: ${u.keywords.join(", ")}]`)
        .join("\n");

      const placed = new Set<number>();

      try {
        const result = await chain.invoke({
          categories: names.join("\n"),
          units: unitTexts,
        });

        for (const { unitIndex, category } of this.parseAssignments(result)) {
          const inRange =
            Number.isInteger(unitIndex) && unitIndex >= 0 && unitIndex < batch.length;
          if (!inRange || placed.has(unitIndex)) continue;

          placed.add(unitIndex);
          const bucket = buckets.get(category.trim()) ?? fallbackBucket;
          bucket.push(batch[unitIndex]);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.warn(`  ⚠️ 分类失败,回退到 "${fallbackCategory}":`, reason);
      }

      // Whatever the model did not place must not be lost.
      batch.forEach((unit, idx) => {
        if (!placed.has(idx)) fallbackBucket.push(unit);
      });
    }

    return Object.fromEntries(
      [...buckets].filter(([, assigned]) => assigned.length > 0)
    );
  }

  /**
   * Builds one section and, while it holds more than 8 units and is above
   * `maxDepth`, subdivides it recursively.
   *
   * A subdivided section still lists all of its units in `knowledgeIds`; the
   * children partition that same set.
   */
  private async buildSection(
    title: string,
    units: KnowledgeUnit[],
    level: number,
    maxDepth: number = 3
  ): Promise<NoteSection> {
    const section: NoteSection = {
      id: uuidv4(),
      title,
      level,
      summary: await this.generateSummary(units),
      knowledgeIds: units.map((u) => u.id),
      children: [],
    };
    const children: NoteSection[] = [];
    section.children = children;

    if (units.length > 8 && level < maxDepth) {
      const subCategories = await this.generateSubCategories(units, title);

      // A single sub-category would only add an empty nesting level.
      if (subCategories.length > 1) {
        const subCategorized = await this.categorizeUnits(units, subCategories);

        for (const [subName, subUnits] of Object.entries(subCategorized)) {
          if (subUnits.length > 0) {
            children.push(await this.buildSection(subName, subUnits, level + 1, maxDepth));
          }
        }
      }
    }

    return section;
  }

  /**
   * Asks the model for a short summary based on the titles of the five most
   * important units.
   */
  private async generateSummary(units: KnowledgeUnit[]): Promise<string> {
    const topUnits = this.byImportance(units).slice(0, 5);

    const content = topUnits.map((u) => `- ${u.title}`).join("\n");

    const prompt = ChatPromptTemplate.fromTemplate(`
为以下知识单元组生成一段50字以内的摘要:
{content}

摘要:
`);

    const chain = prompt.pipe(this.model).pipe(new StringOutputParser());
    return (await chain.invoke({ content })).trim();
  }

  /**
   * Asks the model for sub-category names, based on the titles of the 30 most
   * important units of the parent section.
   */
  private async generateSubCategories(
    units: KnowledgeUnit[],
    parentTitle: string
  ): Promise<string[]> {
    const sample = this.byImportance(units).slice(0, 30);
    const content = sample.map((u) => `- ${u.title}`).join("\n");

    const prompt = ChatPromptTemplate.fromTemplate(`
"{parentTitle}" 主题下有以下知识单元,请生成3-6个子分类:

{content}

子分类名称(每行一个,不要有序号):
`);

    const chain = prompt.pipe(this.model).pipe(new StringOutputParser());
    const result = await chain.invoke({ parentTitle, content });

    return this.parseNameList(result);
  }

  /**
   * Returns a copy of `units` ordered by descending importance. Sorting a copy
   * keeps the caller's array untouched; `Array.prototype.sort` is stable, so
   * units with equal importance keep their relative order.
   */
  private byImportance(units: KnowledgeUnit[]): KnowledgeUnit[] {
    return [...units].sort((a, b) => b.importance - a.importance);
  }

  /**
   * Parses a one-name-per-line model answer. Leading list markers ("1.", "2)",
   * "-", "*", "•") are stripped rather than causing the line to be discarded,
   * blank lines are dropped and repeated names are collapsed.
   */
  private parseNameList(raw: string): string[] {
    const names = raw
      .split("\n")
      .map((line) =>
        line
          .trim()
          .replace(/^(?:[-*•]\s+|\d+\s*[.)、.)]\s*)/, "")
          .trim()
      )
      .filter((line) => line.length > 0);

    return [...new Set(names)];
  }

  /**
   * Parses the model's assignment answer and keeps only well-formed entries.
   *
   * @throws When the answer is not JSON or has no `assignments` array.
   */
  private parseAssignments(raw: string): { unitIndex: number; category: string }[] {
    const cleaned = raw.replace(/```json\s*/g, "").replace(/```/g, "").trim();
    const parsed: unknown = JSON.parse(cleaned);

    if (typeof parsed !== "object" || parsed === null) {
      throw new Error("assignment response is not a JSON object");
    }
    const list = (parsed as { assignments?: unknown }).assignments;
    if (!Array.isArray(list)) {
      throw new Error("assignment response has no 'assignments' array");
    }

    const assignments: { unitIndex: number; category: string }[] = [];
    for (const item of list) {
      if (typeof item !== "object" || item === null) continue;
      const { unitIndex, category } = item as { unitIndex?: unknown; category?: unknown };
      if (typeof unitIndex === "number" && typeof category === "string") {
        assignments.push({ unitIndex, category });
      }
    }
    return assignments;
  }
}
typescript

九、Markdown 生成器 src/generation/mdGenerator.ts#

import * as fs from "fs/promises";
import * as path from "path";
import { Notebook, NoteSection, KnowledgeUnit } from "../types";
import { CONFIG } from "../config";

/**
 * Writes a notebook to disk as a directory tree of Markdown files: a README
 * at the top, then one directory per section with an `_index.md` and one or
 * more content files.
 */
export class MarkdownGenerator {
  private outputDir: string;
  private knowledgeMap: Map<string, KnowledgeUnit>;

  constructor() {
    this.outputDir = CONFIG.output.baseDir;
    this.knowledgeMap = new Map();
  }

  /**
   * Generates the complete notebook under `<baseDir>/<sanitized notebook name>`.
   *
   * @param notebook Notebook structure to write.
   * @param allUnits Every unit referenced by the notebook's sections.
   */
  async generate(notebook: Notebook, allUnits: KnowledgeUnit[]): Promise<void> {
    // Start from a clean lookup so a reused generator never serves stale units.
    this.knowledgeMap.clear();
    allUnits.forEach((u) => this.knowledgeMap.set(u.id, u));

    const notebookDir = path.join(this.outputDir, this.sanitizeFileName(notebook.name));
    await fs.mkdir(notebookDir, { recursive: true });

    await this.generateReadme(notebookDir, notebook);

    for (const section of notebook.rootSections) {
      await this.generateSection(notebookDir, section, 1);
    }

    console.log(`\n✅ 笔记本生成完成: ${notebookDir}`);
    console.log(`   共 ${notebook.totalKnowledgeUnits} 条知识,${notebook.rootSections.length} 个顶层章节`);
  }

  /**
   * Writes one section directory and recurses into its children.
   *
   * Units are written ten per file. A section with a single chunk gets
   * `content.md`; otherwise the files are `content_01.md`, `content_02.md`, ...
   */
  private async generateSection(
    parentDir: string,
    section: NoteSection,
    level: number
  ): Promise<void> {
    const sectionDir = path.join(parentDir, this.sanitizeFileName(section.title));
    await fs.mkdir(sectionDir, { recursive: true });

    await fs.writeFile(
      path.join(sectionDir, "_index.md"),
      this.buildSectionIndex(section),
      "utf-8"
    );

    const chunks = this.chunkArray(this.unitsOf(section), 10);
    let fileIndex = 1;

    for (const chunk of chunks) {
      const content = chunk.map((u) => this.buildKnowledgeDoc(u)).join("\n\n---\n\n");
      const fileName = fileIndex === 1 && chunks.length === 1
        ? "content.md"
        : `content_${String(fileIndex).padStart(2, "0")}.md`;

      await fs.writeFile(path.join(sectionDir, fileName), content, "utf-8");
      fileIndex++;
    }

    if (section.children) {
      for (const child of section.children) {
        await this.generateSection(sectionDir, child, level + 1);
      }
    }
  }

  /**
   * Builds the `_index.md` text of a section: title, summary, links to child
   * sections (relative to this section's directory) and a unit overview.
   */
  private buildSectionIndex(section: NoteSection): string {
    const lines = [
      `# ${section.title}`,
      "",
      `> ${section.summary}`,
      "",
      "## 目录",
      "",
    ];

    if (section.children && section.children.length > 0) {
      lines.push("### 子章节");
      section.children.forEach((child) => {
        lines.push(`- [${child.title}](./${this.sanitizeFileName(child.title)}/_index.md)`);
      });
      lines.push("");
    }

    lines.push("### 知识点概览");
    this.unitsOf(section).forEach((u) => {
      lines.push(`- ${this.stars(u.importance)} **${u.title}** (${u.category})`);
    });

    return lines.join("\n");
  }

  /**
   * Builds the Markdown for a single knowledge unit.
   */
  private buildKnowledgeDoc(unit: KnowledgeUnit): string {
    const lines = [
      `## ${unit.title}`,
      "",
      `**重要性**: ${this.stars(unit.importance)} (${unit.importance}/10)`,
      `**分类**: ${unit.category}`,
      `**关键词**: ${unit.keywords.map((k) => `\`${k}\``).join(", ")}`,
      `**来源**: ${unit.sourcePdf} (第${unit.pageRange}页)`,
      "",
      "### 内容",
      "",
      unit.content,
      "",
    ];

    return lines.join("\n");
  }

  /**
   * Writes `README.md` with the notebook overview and the full section tree.
   *
   * Each link carries the path of all ancestor sections, because nested
   * section directories live inside their parent's directory.
   */
  private async generateReadme(dir: string, notebook: Notebook): Promise<void> {
    const lines = [
      `# ${notebook.name}`,
      "",
      notebook.description,
      "",
      "## 概览",
      "",
      `- **创建时间**: ${notebook.createdAt}`,
      `- **来源文档数**: ${notebook.totalSources}`,
      `- **知识单元总数**: ${notebook.totalKnowledgeUnits}`,
      `- **顶层章节数**: ${notebook.rootSections.length}`,
      "",
      "## 章节结构",
      "",
    ];

    const renderTree = (sections: NoteSection[], depth: number, parentPath: string): void => {
      for (const s of sections) {
        const sectionPath = `${parentPath}/${this.sanitizeFileName(s.title)}`;
        lines.push(`${"  ".repeat(depth)}- [${s.title}](${sectionPath}/_index.md)`);
        if (s.children) {
          renderTree(s.children, depth + 1, sectionPath);
        }
      }
    };

    renderTree(notebook.rootSections, 0, ".");

    lines.push("");
    lines.push("---");
    lines.push("*本笔记本由 AI 自动生成*");

    await fs.writeFile(path.join(dir, "README.md"), lines.join("\n"), "utf-8");
  }

  /**
   * Turns an arbitrary title into a safe path segment: reserved and control
   * characters and whitespace runs become "_", and the result is capped at
   * 100 characters. A result that is empty or made only of dots (which would
   * name the current or parent directory) is replaced by "untitled".
   */
  private sanitizeFileName(name: string): string {
    const cleaned = name
      .replace(/[<>:"/\\|?*\x00-\x1f]/g, "_")
      .replace(/\s+/g, "_")
      .substring(0, 100);

    return /^\.*$/.test(cleaned) ? "untitled" : cleaned;
  }

  /**
   * Splits `arr` into consecutive chunks of at most `size` elements.
   */
  private chunkArray<T>(arr: T[], size: number): T[][] {
    const chunks: T[][] = [];
    for (let i = 0; i < arr.length; i += size) {
      chunks.push(arr.slice(i, i + size));
    }
    return chunks;
  }

  /**
   * Resolves a section's unit IDs to units (unknown IDs are skipped), ordered
   * by descending importance.
   */
  private unitsOf(section: NoteSection): KnowledgeUnit[] {
    return section.knowledgeIds
      .map((id) => this.knowledgeMap.get(id))
      .filter((u): u is KnowledgeUnit => u !== undefined)
      .sort((a, b) => b.importance - a.importance);
  }

  /**
   * Star rating for an importance score: one star per two points, rounded up.
   */
  private stars(importance: number): string {
    return "⭐".repeat(Math.ceil(importance / 2));
  }
}
typescript

十、主控制器 src/index.ts#

import { scanPDFDirectory, loadPDFBatch } from "./ingestion/pdfBatchLoader";
import { KnowledgeExtractor } from "./extraction/knowledgeExtractor";
import { SemanticDeduplicator } from "./dedup/semanticDeduplicator";
import { NotebookBuilder } from "./clustering/notebookBuilder";
import { MarkdownGenerator } from "./generation/mdGenerator";
import { KnowledgeUnit } from "./types";
import { CONFIG } from "./config";

/**
 * Entry point of the batch PDF knowledge-notebook generator.
 *
 * Command-line arguments (all optional):
 *   1. PDF directory        (default "./data/pdfs")
 *   2. notebook name        (default "知识笔记本")
 *   3. notebook description (default "从PDF文档自动提取整理的知识笔记")
 *
 * Runs the five phases in order: scan and load, extract, de-duplicate, build
 * the notebook structure, and write the Markdown files.
 */
async function main() {
  const pdfDir = process.argv[2] || "./data/pdfs";
  const notebookName = process.argv[3] || "知识笔记本";
  const description = process.argv[4] || "从PDF文档自动提取整理的知识笔记";

  console.log("╔══════════════════════════════════════════════╗");
  console.log("║     批量PDF知识笔记本生成器 v1.0              ║");
  console.log("╚══════════════════════════════════════════════╝");
  console.log(`\n📂 PDF目录: ${pdfDir}`);
  console.log(`📓 笔记本名称: ${notebookName}\n`);

  const startTime = Date.now();

  // ========== Phase 1: scan ==========
  const pdfMetas = await scanPDFDirectory(pdfDir);

  if (pdfMetas.length === 0) {
    console.log("❌ 未找到PDF文件");
    return;
  }

  // ========== Phase 2: knowledge extraction (batched) ==========
  const extractor = new KnowledgeExtractor();
  const allKnowledge: KnowledgeUnit[] = [];

  for await (const batch of loadPDFBatch(pdfMetas, CONFIG.batch.pdfBatchSize)) {
    // Extract up to `extractionConcurrency` PDFs at a time. Results are stored
    // by position so the final order matches the batch order regardless of
    // which extraction finishes first.
    const perPdf: KnowledgeUnit[][] = batch.map(() => []);
    const workerCount = Math.min(
      Math.max(1, CONFIG.batch.extractionConcurrency),
      batch.length
    );
    let nextIndex = 0;

    const runWorker = async (): Promise<void> => {
      while (nextIndex < batch.length) {
        // Claiming the index is synchronous, so two workers never take the same PDF.
        const index = nextIndex++;
        const { meta, documents } = batch[index];
        perPdf[index] = await extractor.extractFromDocuments(meta.fileName, documents);
      }
    };

    await Promise.all(Array.from({ length: workerCount }, () => runWorker()));

    for (const knowledge of perPdf) {
      allKnowledge.push(...knowledge);
    }

    // Progress log only: nothing is persisted here, so a crash loses the
    // units extracted so far.
    console.log(`💾 累计提取: ${allKnowledge.length} 条知识`);
  }

  if (allKnowledge.length === 0) {
    console.log("❌ 未提取到任何知识");
    return;
  }

  // ========== Phase 3: semantic de-duplication ==========
  const deduplicator = new SemanticDeduplicator();
  const uniqueKnowledge = await deduplicator.deduplicate(allKnowledge);

  // ========== Phase 4: build the notebook structure ==========
  const builder = new NotebookBuilder();
  const notebook = await builder.buildNotebook(notebookName, description, uniqueKnowledge);

  // ========== Phase 5: write the Markdown files ==========
  const generator = new MarkdownGenerator();
  await generator.generate(notebook, uniqueKnowledge);

  // ========== Final statistics ==========
  const duration = ((Date.now() - startTime) / 1000 / 60).toFixed(2);
  console.log(`\n╔══════════════════════════════════════════════╗`);
  console.log(`║  ✅ 全部完成!耗时: ${duration} 分钟`);
  console.log(`║  📊 处理PDF: ${pdfMetas.length} 个`);
  console.log(`║  📝 原始知识: ${allKnowledge.length} 条`);
  console.log(`║  🎯 去重后: ${uniqueKnowledge.length} 条`);
  console.log(`║  📚 笔记本章节: ${notebook.rootSections.length} 个顶层章节`);
  console.log(`╚══════════════════════════════════════════════╝`);
}

// Any uncaught failure is reported and turned into a non-zero exit code.
main().catch((error) => {
  console.error("❌ 程序异常:", error);
  process.exit(1);
});
typescript

十一、运行方式#

# 1. Prepare the PDF files
mkdir -p data/pdfs
# Put your PDF files into this directory (sub-directories are scanned too)

# 2. Configure the environment variable
# NOTE: ">" overwrites any existing .env file
echo "OPENAI_API_KEY=sk-your-key" > .env

# 3. Run: arguments are <pdf directory> <notebook name> <description>
npx ts-node src/index.ts ./data/pdfs "机器学习知识库" "从论文和教材中提取的ML核心知识"

# Example output directory layout
# ./output/notebook/机器学习知识库/
# ├── README.md
# ├── 监督学习/
# │   ├── _index.md
# │   ├── content.md
# │   └── 线性模型/
# │       ├── _index.md
# │       └── content.md
# ├── 无监督学习/
# │   └── ...
# └── ...
bash

十二、关键优化点总结#

| 问题 | 解决方案 |
| --- | --- |
| 万级PDF内存爆炸 | 批次加载(50个/批),流式处理,不一次性加载全部 |
| API 速率限制 | 并发控制(10并发),指数退避重试 |
| 去重精度 | 语义嵌入 + 内容重叠度双重验证,跨类别标题去重 |
| 不漏知识点 | 低重要性阈值过滤(≥4分),多页分组提取 |
| 分类质量 | 先粗分再细分,采样+模型双重分类 |
| 文件过大 | 每md文件最多10条知识,自动拆分 |
| 进程崩溃 | 阶段性日志输出,可基于日志恢复 |

如需进一步优化(如使用本地模型降低成本、接入 Chroma 持久化、并行化去重等),可以继续扩展!

设计一个完整的大规模 PDF 智能知识库构建系统(kimi)
作者 glownight
发布于 2026年4月29日