docs(rag): add MemoryBear RAG implementation docs v1.0
Some checks failed
Sync to Gitee / sync (push) Has been cancelled

Add the finalized RAG documentation set produced across Sprint-1/2/3
(WS-12 through WS-26) under docs/rag/. Includes:

- README.md / INDEX.md: landing + total index (responsibility matrix,
  review verdicts, dual-link to source issues)
- overview/: full-pipeline architecture (4 .mmd diagrams),
  11-stage boundary contracts, doc map, source-code inventory
- pipeline/: 5 deep-dives (Loader/Parser/Chunking, Embedding,
  VDB & retrieval, GraphRAG, Rerank/Prompt/LLM)
- graphrag/, end-to-end/: v1.0 formal versions with full source
  retained as reference
- evolution/: 11 architecture-refactor proposals,
  6-direction roadmap, capability map
- review/: S3-T1 / S3-T2 final reviews, S2-T7 final summary
- _indexes/: glossary (81 terms), source->doc reverse index, chart index
- _release/: v1.0-RC1 release manifest, versioning convention,
  ops & freshness plan
- _meta/README.md: placeholder noting the gap in WS-12 governance assets

Aggregate review score 92.6/100 (8/8 PASS, 31/31 source-code spot
checks hit). The legacy docs/ ignore in .gitignore is narrowed to
docs/* with an explicit allowlist for docs/rag/.

Refs: WS-26
Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
Multica PM Agent
2026-05-09 10:51:48 +08:00
parent feae2f2e1e
commit 343a5eebe3
33 changed files with 8410 additions and 1 deletions

View File

@@ -0,0 +1,132 @@
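%% MemoryBear RAG full-pipeline architecture diagram (Mermaid flowchart)
%% Convention: light blue = data source layer; light green = parsing and chunking; light yellow = embedding and storage; light purple = retrieval; light orange = generation; light gray = supporting components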
flowchart TB
%% Data source layer (Loader): the four ingestion origins drawn in this diagram.
%% Each label reads: origin name, the source file it refers to, and the output it hands on.
subgraph DATA_SOURCES["数据来源层 (Loader)"]
%% Web crawler; label gives its output as CrawledDocument.
CRAWLER["Web Crawler\ncrawler/web_crawler.py\n-> 输出: CrawledDocument"]
%% Feishu API client; label gives its output as local files (.docx/.pdf).
FEISHU["飞书 API\nintegrations/feishu/client.py\n-> 输出: 本地文件 (.docx/.pdf)"]
%% Yuque API client; label gives its output as local files (.md/.html/.xlsx).
YUQUE["语雀 API\nintegrations/yuque/client.py\n-> 输出: 本地文件 (.md/.html/.xlsx)"]
%% User upload; label points at document_controller.py line 275 and gives the output as a file path.
UPLOAD["用户上传\ncontrollers/document_controller.py:275\n-> 输出: 文件路径"]
end
%% Document parsing and chunking: the unified chunk() entry point, the per-format
%% parsers it dispatches to, the vision helpers, and the shared NLP merge utilities.
subgraph PARSER["文档解析与分块 (Parser + Chunking)"]
%% Unified chunking entry point; label says it dispatches by filename extension.
NAIVE["app/naive.py:chunk()\n统一分块入口\nDispatch by filename extension"]
%% One node per format-specific parser module.
PDFP["deepdoc/parser/pdf_parser.py\nOCR + Layout + Table"]
DOCXP["deepdoc/parser/docx_parser.py"]
HTMLP["deepdoc/parser/html_parser.py"]
MDPP["deepdoc/parser/markdown_parser.py"]
EXCELP["deepdoc/parser/excel_parser.py"]
TXTPIP["deepdoc/parser/txt_parser.py"]
%% Vision modules listed in the label: OCR, layout recognizer, table structure recognizer.
VISION["deepdoc/vision/\nocr.py + layout_recognizer.py\n+ table_structure_recognizer.py"]
%% Shared NLP helpers named in the label: tokenize, naive_merge, hierarchical_merge.
NLP["nlp/__init__.py\ntokenize / naive_merge / hierarchical_merge"]
end
%% Document-type adapters (task types): one node per app/*.py chunking strategy.
subgraph CHUNK_TYPES["文档类型适配 (Task Types)"]
%% Label: hierarchical chunking for long documents.
BOOK["app/book.py\n长文档分级分块"]
%% Label: preserves paper structure.
PAPER["app/paper.py\n论文结构保持"]
%% Label: manuals chunked by section.
MANUAL["app/manual.py\n手册按节分块"]
%% Label: regulations chunked along a hierarchy tree.
LAWS["app/laws.py\n法规层级树分块"]
%% Label: each question-answer pair becomes its own chunk.
QA["app/qa.py\n问答对独立分块"]
%% Label: the whole file is a single chunk.
ONE["app/one.py\n整文件单块"]
%% Label: OCR plus VLM description.
PIC["app/picture.py\nOCR + VLM描述"]
%% Label: speech to text.
AUD["app/audio.py\n语音转文本"]
end
%% Embedding stage: the base encode interface and the provider factory behind it.
subgraph EMBED["向量化 (Embedding)"]
%% Label gives the interface as Base.encode(texts: list) returning (np.array, token_count).
EMB_BASE["llm/embedding_model.py\nBase.encode(texts: list)\n→ (np.array, token_count)"]
%% Provider factory; the label enumerates the embedding providers.
EMB_PROV["Provider 工厂\nOpenAI / LocalAI / Azure / Tongyi /\nHuggingFace / Xinference / VolcEngine /\nGPUStack / NVIDIA / BaiChuan"]
end
%% Vector database stage: Elasticsearch vector store, its connection helper, and the field schema.
subgraph VDB["向量数据库 (VDB)"]
%% Label describes a dense + sparse hybrid index using cosineSimilarity and BM25.
ES_VECT["vdb/elasticsearch/elasticsearch_vector.py\nDense + Sparse 混合索引\ncosineSimilarity + BM25"]
%% Label: Elasticsearch connection management.
ES_CONN["utils/es_conn.py\nES 连接管理"]
%% Label lists the stored fields: page_content, metadata, vector, text, doc_id, knowledge_id, sort_id.
ES_SCHEMA["vdb/field.py\npage_content / metadata / vector / text\n+ doc_id / knowledge_id / sort_id"]
end
%% Knowledge graph (GraphRAG): light and general extraction paths, clustering,
%% entity resolution, and the graph search entry point.
subgraph GRAPHRAG["知识图谱 (GraphRAG)"]
%% Light path; label: entity + relation extraction producing an nx.Graph.
G_LIGHT["graphrag/light/\ngraph_extractor.py\n实体+关系抽取\n→ nx.Graph"]
%% General path; label chains graph_extractor.py to community_reports_extractor.py plus mind_map_extractor.py.
G_GEN["graphrag/general/\ngraph_extractor.py\n→ community_reports_extractor.py\n+ mind_map_extractor.py"]
%% Label: hierarchical clustering.
G_LEIDEN["general/leiden.py\n层次聚类"]
%% Label: entity disambiguation via LLM matching.
G_RESOLVE["entity_resolution.py\n实体消歧 LLM 匹配"]
%% Label: KGSearch.retrieval() with steps query analysis, entity retrieval, N-hop, community reports.
G_SEARCH["graphrag/search.py\nKGSearch.retrieval()\nQuery分析→实体检索→N-hop→社区报告"]
end
%% Retrieval stage: hybrid search, query understanding, and multi-knowledge-base retrieval.
subgraph RETRIEVAL["检索 (Retrieval)"]
%% Label: Dealer.search() with hybrid weights BM25 0.05 + Vector 0.95.
DEALER["nlp/search.py\nDealer.search()\nHybrid: BM25 0.05 + Vector 0.95"]
%% Label: query understanding and keyword expansion.
QRYR["nlp/query.py\nQuery理解 / 关键词扩展"]
%% Label: knowledge_retrieval() at nlp/search.py line 36, merging across multiple knowledge bases.
KNOWLEDGE["nlp/search.py:36\nknowledge_retrieval()\n→ 多知识库合并"]
end
%% Reranking stage: the rerank model wrapper and the providers it can use.
subgraph RERANK["重排序 (Reranking)"]
%% Label: RedBearRerank exposing compress_documents() and rerank().
RERANK_M["models/rerank.py\nRedBearRerank\ncompress_documents() / rerank()"]
%% Label enumerates the rerank providers.
RERANK_P["Provider: JinaRerank /\nDashScopeRerank /\nXINFERENCE / GPUSTACK"]
end
%% Prompt assembly stage: prompt generators and the template loader.
subgraph PROMPT["Prompt 组装"]
%% Label lists the prompt kinds: citation_prompt, keyword_extraction, full_question,
%% content_tagging, toc_relevance, structured_output.
PGEN["prompts/generator.py\ncitation_prompt / keyword_extraction /\nfull_question / content_tagging /\ntoc_relevance / structured_output"]
%% Label: loads .md template files.
PTEMPLATE["prompts/template.py\n加载 .md 模板文件"]
end
%% LLM generation stage: the base chat interface and the provider factory behind it.
subgraph LLM["LLM 生成"]
%% Label: Base.chat() and chat_streamly(), returning (str, tokens).
CHAT["llm/chat_model.py\nBase.chat() / chat_streamly()\n→ (str, tokens)"]
%% Provider factory; the label enumerates the chat providers.
CHAT_PROV["Provider 工厂\nOpenAI / Azure / LocalAI /\nXinference / Tongyi /\nHuggingFace / GPUStack / VolcEngine"]
end
%% Orchestration layer: background tasks and the workflow knowledge-retrieval node.
subgraph ORCH["编排层 (Orchestration)"]
%% Label lists the tasks: parse_document(), build_graphrag_for_kb(), build_graphrag_for_document().
CELERY["tasks.py\nparse_document() /\nbuild_graphrag_for_kb() /\nbuild_graphrag_for_document()"]
%% Label: KnowledgeRetrievalNode.execute() with steps retrieve, deduplicate, rerank, return chunks.
WORKFLOW["workflow/nodes/knowledge/node.py\nKnowledgeRetrievalNode.execute()\n→ 检索→去重→重排→返回 chunks"]
end
%% Post-processing stage: citation insertion and the cache layer.
subgraph POST["后处理"]
%% Label: inserts citation markers via Dealer.insert_citations(), scored by pagerank*sim.
CITE["插入引用标注\nDealer.insert_citations()\npagerank*sim 评分"]
%% Label: cache layer in utils/redis_conn.py caching LLM results.
CACHE["缓存层\nutils/redis_conn.py\nLLM 结果缓存"]
end
%% === Data flow ===
%% Solid arrows are the main flow; dotted arrows mark the links labelled parallel (async) or trigger.
%% Ingestion: every data source feeds the unified chunk() entry, which fans out by file type.
DATA_SOURCES --> NAIVE
NAIVE --> |PDF| PDFP
NAIVE --> |DOCX| DOCXP
NAIVE --> |HTML| HTMLP
NAIVE --> |MD| MDPP
NAIVE --> |XLSX| EXCELP
NAIVE --> |TXT| TXTPIP
%% Parsers converge on the shared NLP helpers; only the PDF parser goes through the vision modules first.
PDFP --> VISION
VISION --> NLP
DOCXP --> NLP
HTMLP --> NLP
MDPP --> NLP
EXCELP --> NLP
TXTPIP --> NLP
%% Alternative route: dispatch by document type (edge label) to the task-type adapters, then to NLP.
NAIVE --> |按文档类型| CHUNK_TYPES
CHUNK_TYPES --> NLP
%% Embedding and indexing: chunks go to the embedding interface, its providers, then the ES vector store.
NLP --> EMB_BASE
EMB_BASE --> EMB_PROV
EMB_PROV --> ES_VECT
%% Schema and connection helper both feed the ES vector store.
ES_SCHEMA --> ES_VECT
ES_CONN --> ES_VECT
%% GraphRAG: fed from NLP via the edge labelled parallel (async); both extraction paths end at graph search.
NLP -.-> |"并行 (async)"| GRAPHRAG
G_LIGHT --> G_SEARCH
G_GEN --> G_LEIDEN
G_GEN --> G_RESOLVE
G_LEIDEN --> G_SEARCH
G_RESOLVE --> G_SEARCH
%% Orchestration: the task module drives chunking and triggers (edge label) GraphRAG.
CELERY --> NAIVE
CELERY -.-> |"触发"| GRAPHRAG
%% Query path: workflow node, query understanding, hybrid search, knowledge retrieval, rerank.
%% NOTE(review): among the edges visible here, none links DEALER or KNOWLEDGE to ES_VECT,
%% so the query path never touches the vector store in this view. Confirm whether that edge is intended or defined elsewhere.
WORKFLOW --> QRYR
QRYR --> DEALER
DEALER --> KNOWLEDGE
KNOWLEDGE --> RERANK_M
%% Graph search results join knowledge retrieval on the edge labelled GRAPH mode.
G_SEARCH --> |"GRAPH模式"| KNOWLEDGE
RERANK_M --> RERANK_P
%% Generation: rerank output flows to the prompt generators, the template loader, then the chat model.
RERANK_P --> PGEN
PGEN --> PTEMPLATE
PTEMPLATE --> CHAT
CHAT --> CHAT_PROV
%% Post-processing: chat output goes to citation insertion, then the cache layer.
CHAT --> CITE
CITE --> CACHE