Some checks failed
Sync to Gitee / sync (push) Has been cancelled
Submit the formed RAG documentation set produced across Sprint-1/2/3 (WS-12 through WS-26) under docs/rag/. Includes: - README.md / INDEX.md: landing + total index (responsibility matrix, review verdicts, dual-link to source issues) - overview/: full-pipeline architecture (4 .mmd diagrams), 11-stage boundary contracts, doc map, source-code inventory - pipeline/: 5 deep-dives (Loader/Parser/Chunking, Embedding, VDB & retrieval, GraphRAG, Rerank/Prompt/LLM) - graphrag/, end-to-end/: v1.0 formal versions with full source retained as reference - evolution/: 11 architecture-refactor proposals, 6-direction roadmap, capability map - review/: S3-T1 / S3-T2 final reviews, S2-T7 final summary - _indexes/: glossary (81 terms), source->doc reverse index, chart index - _release/: v1.0-RC1 release manifest, versioning convention, ops & freshness plan - _meta/README.md: placeholder noting WS-12 governance assets gap Aggregate review score 92.6/100 (8/8 PASS, 31/31 source-code spot checks hit). The legacy docs/ ignore in .gitignore is narrowed to docs/* with an explicit allowlist for docs/rag/. Refs: WS-26 Co-authored-by: multica-agent <github@multica.ai>
133 lines
5.3 KiB
Plaintext
133 lines
5.3 KiB
Plaintext
%% MemoryBear RAG 全链路架构图(Mermaid Flowchart)
|
||
%% 约定:浅蓝色 = 数据来源层;浅绿色 = 解析与分块;浅黄色 = 向量化与存储;浅紫色 = 检索;浅橙色 = 生成;浅灰色 = 支撑组件
|
||
|
||
flowchart TB
|
||
%% Data-source (Loader) layer: the four inputs drawn at the head of the pipeline.
%% Each label reads: display name, the source path attributed to it, and the output it is shown producing.
subgraph DATA_SOURCES["数据来源层 (Loader)"]
|
||
%% Web crawler; labelled as emitting CrawledDocument.
CRAWLER["Web Crawler\ncrawler/web_crawler.py\n-> 输出: CrawledDocument"]
|
||
%% Feishu API client; labelled as producing local .docx/.pdf files.
FEISHU["飞书 API\nintegrations/feishu/client.py\n-> 输出: 本地文件 (.docx/.pdf)"]
|
||
%% Yuque API client; labelled as producing local .md/.html/.xlsx files.
YUQUE["语雀 API\nintegrations/yuque/client.py\n-> 输出: 本地文件 (.md/.html/.xlsx)"]
|
||
%% User upload; labelled as producing a file path.
%% NOTE(review): the label pins a line number (:275) that can go stale - verify against the controller.
UPLOAD["用户上传\ncontrollers/document_controller.py:275\n-> 输出: 文件路径"]
|
||
end
|
||
|
||
%% Document parsing and chunking stage: one dispatcher node, six format-specific parser nodes,
%% plus the vision and NLP helper nodes that the parser edges feed into.
subgraph PARSER["文档解析与分块 (Parser + Chunking)"]
|
||
%% Dispatcher; its label calls it the unified chunking entry point, dispatching by filename extension.
NAIVE["app/naive.py:chunk()\n统一分块入口\nDispatch by filename extension"]
|
||
%% PDF parser; the label lists OCR, layout and table handling.
PDFP["deepdoc/parser/pdf_parser.py\nOCR + Layout + Table"]
|
||
%% The next five nodes carry only a parser file path in their labels.
DOCXP["deepdoc/parser/docx_parser.py"]
|
||
HTMLP["deepdoc/parser/html_parser.py"]
|
||
MDPP["deepdoc/parser/markdown_parser.py"]
|
||
EXCELP["deepdoc/parser/excel_parser.py"]
|
||
TXTPIP["deepdoc/parser/txt_parser.py"]
|
||
%% Vision helpers; the label names the OCR, layout-recognizer and table-structure-recognizer modules.
VISION["deepdoc/vision/\nocr.py + layout_recognizer.py\n+ table_structure_recognizer.py"]
|
||
%% NLP helpers; the label names tokenize, naive_merge and hierarchical_merge.
NLP["nlp/__init__.py\ntokenize / naive_merge / hierarchical_merge"]
|
||
end
|
||
|
||
%% Document-type adapters (task types): eight nodes, each labelled with a module path
%% and a short description of its chunking strategy.
subgraph CHUNK_TYPES["文档类型适配 (Task Types)"]
|
||
%% Books: hierarchical chunking for long documents.
BOOK["app/book.py\n长文档分级分块"]
|
||
%% Papers: paper structure is preserved.
PAPER["app/paper.py\n论文结构保持"]
|
||
%% Manuals: chunked by section.
MANUAL["app/manual.py\n手册按节分块"]
|
||
%% Laws and regulations: chunked along a hierarchy tree.
LAWS["app/laws.py\n法规层级树分块"]
|
||
%% Q&A: each question/answer pair becomes its own chunk.
QA["app/qa.py\n问答对独立分块"]
|
||
%% One: the whole file is a single chunk.
ONE["app/one.py\n整文件单块"]
|
||
%% Pictures: OCR plus a VLM description.
PIC["app/picture.py\nOCR + VLM描述"]
|
||
%% Audio: speech to text.
AUD["app/audio.py\n语音转文本"]
|
||
end
|
||
|
||
%% Embedding (vectorization) stage: a base-interface node and a provider-factory node.
subgraph EMBED["向量化 (Embedding)"]
|
||
%% Base interface; the label shows Base.encode taking a list of texts and yielding (np.array, token_count).
EMB_BASE["llm/embedding_model.py\nBase.encode(texts: list)\n→ (np.array, token_count)"]
|
||
%% Provider factory; the label enumerates the embedding providers shown in the diagram.
EMB_PROV["Provider 工厂\nOpenAI / LocalAI / Azure / Tongyi /\nHuggingFace / Xinference / VolcEngine /\nGPUStack / NVIDIA / BaiChuan"]
|
||
end
|
||
|
||
%% Vector database stage: the Elasticsearch vector store plus its connection and schema nodes.
subgraph VDB["向量数据库 (VDB)"]
|
||
%% Vector store; the label describes a dense + sparse hybrid index using cosineSimilarity and BM25.
ES_VECT["vdb/elasticsearch/elasticsearch_vector.py\nDense + Sparse 混合索引\ncosineSimilarity + BM25"]
|
||
%% Elasticsearch connection management.
ES_CONN["utils/es_conn.py\nES 连接管理"]
|
||
%% Field schema; the label lists the field names shown for stored chunks.
ES_SCHEMA["vdb/field.py\npage_content / metadata / vector / text\n+ doc_id / knowledge_id / sort_id"]
|
||
end
|
||
|
||
%% Knowledge-graph (GraphRAG) stage: two extractor variants, clustering, entity resolution and graph search.
subgraph GRAPHRAG["知识图谱 (GraphRAG)"]
|
||
%% Light extractor; labelled as entity + relation extraction producing an nx.Graph.
G_LIGHT["graphrag/light/\ngraph_extractor.py\n实体+关系抽取\n→ nx.Graph"]
|
||
%% General extractor; the label chains it to the community-reports and mind-map extractors.
G_GEN["graphrag/general/\ngraph_extractor.py\n→ community_reports_extractor.py\n+ mind_map_extractor.py"]
|
||
%% Leiden step, labelled as hierarchical clustering.
G_LEIDEN["general/leiden.py\n层次聚类"]
|
||
%% Entity resolution, labelled as entity disambiguation via LLM matching.
G_RESOLVE["entity_resolution.py\n实体消歧 LLM 匹配"]
|
||
%% Graph search; the label gives the flow: query analysis, entity retrieval, N-hop, community reports.
G_SEARCH["graphrag/search.py\nKGSearch.retrieval()\nQuery分析→实体检索→N-hop→社区报告"]
|
||
end
|
||
|
||
%% Retrieval stage: hybrid search, query understanding, and the multi-knowledge-base retrieval entry.
subgraph RETRIEVAL["检索 (Retrieval)"]
|
||
%% Hybrid search; the label quotes the weights BM25 0.05 and Vector 0.95.
%% NOTE(review): these weights are copied into the label - confirm they still match the code.
DEALER["nlp/search.py\nDealer.search()\nHybrid: BM25 0.05 + Vector 0.95"]
|
||
%% Query understanding and keyword expansion.
QRYR["nlp/query.py\nQuery理解 / 关键词扩展"]
|
||
%% Knowledge retrieval, labelled as merging results across multiple knowledge bases.
%% NOTE(review): the label pins a line number (:36) that can go stale - verify against the module.
KNOWLEDGE["nlp/search.py:36\nknowledge_retrieval()\n→ 多知识库合并"]
|
||
end
|
||
|
||
%% Reranking stage: the rerank model wrapper and the providers it is drawn with.
subgraph RERANK["重排序 (Reranking)"]
|
||
%% Rerank wrapper; the label names RedBearRerank with compress_documents() and rerank().
RERANK_M["models/rerank.py\nRedBearRerank\ncompress_documents() / rerank()"]
|
||
%% Provider list as written in the label.
RERANK_P["Provider: JinaRerank /\nDashScopeRerank /\nXINFERENCE / GPUSTACK"]
|
||
end
|
||
|
||
%% Prompt assembly stage: the prompt generator and the template loader.
subgraph PROMPT["Prompt 组装"]
|
||
%% Generator; the label lists the prompt kinds shown for prompts/generator.py.
PGEN["prompts/generator.py\ncitation_prompt / keyword_extraction /\nfull_question / content_tagging /\ntoc_relevance / structured_output"]
|
||
%% Template loader, labelled as loading .md template files.
PTEMPLATE["prompts/template.py\n加载 .md 模板文件"]
|
||
end
|
||
|
||
%% LLM generation stage: the chat base interface and its provider factory.
subgraph LLM["LLM 生成"]
|
||
%% Chat interface; the label shows Base.chat() and chat_streamly() yielding (str, tokens).
CHAT["llm/chat_model.py\nBase.chat() / chat_streamly()\n→ (str, tokens)"]
|
||
%% Provider factory; the label enumerates the chat providers shown in the diagram.
CHAT_PROV["Provider 工厂\nOpenAI / Azure / LocalAI /\nXinference / Tongyi /\nHuggingFace / GPUStack / VolcEngine"]
|
||
end
|
||
|
||
%% Orchestration layer: the task entry points and the workflow retrieval node.
subgraph ORCH["编排层 (Orchestration)"]
|
||
%% Task module; the label names parse_document(), build_graphrag_for_kb() and build_graphrag_for_document().
%% The node id is CELERY while the label only says tasks.py - presumably Celery tasks; verify.
CELERY["tasks.py\nparse_document() /\nbuild_graphrag_for_kb() /\nbuild_graphrag_for_document()"]
|
||
%% Workflow node; the label gives its flow: retrieve, deduplicate, rerank, return chunks.
WORKFLOW["workflow/nodes/knowledge/node.py\nKnowledgeRetrievalNode.execute()\n→ 检索→去重→重排→返回 chunks"]
|
||
end
|
||
|
||
%% Post-processing stage: citation insertion and the cache layer.
subgraph POST["后处理"]
|
||
%% Citation insertion; the label names Dealer.insert_citations() and a pagerank*sim score.
CITE["插入引用标注\nDealer.insert_citations()\npagerank*sim 评分"]
|
||
%% Cache layer; labelled as caching LLM results via utils/redis_conn.py.
CACHE["缓存层\nutils/redis_conn.py\nLLM 结果缓存"]
|
||
end
|
||
|
||
%% === Data flow ===
|
||
%% The whole data-source subgraph feeds the dispatcher node (subgraph-level edge).
DATA_SOURCES --> NAIVE
|
||
%% The dispatcher fans out to one parser per format; each edge label is the file type.
NAIVE --> |PDF| PDFP
|
||
NAIVE --> |DOCX| DOCXP
|
||
NAIVE --> |HTML| HTMLP
|
||
NAIVE --> |MD| MDPP
|
||
NAIVE --> |XLSX| EXCELP
|
||
NAIVE --> |TXT| TXTPIP
|
||
|
||
%% Only the PDF parser is routed through the vision node before reaching NLP.
PDFP --> VISION
|
||
VISION --> NLP
|
||
%% Every other parser connects straight to the NLP node.
DOCXP --> NLP
|
||
HTMLP --> NLP
|
||
MDPP --> NLP
|
||
EXCELP --> NLP
|
||
TXTPIP --> NLP
|
||
|
||
%% Second route out of the dispatcher: into the document-type adapter subgraph (edge label: by document type).
NAIVE --> |按文档类型| CHUNK_TYPES
|
||
%% The adapter subgraph as a whole rejoins the NLP node.
CHUNK_TYPES --> NLP
|
||
|
||
%% Indexing path: NLP output goes to the embedding interface, then the provider factory, then the vector store.
NLP --> EMB_BASE
|
||
EMB_BASE --> EMB_PROV
|
||
EMB_PROV --> ES_VECT
|
||
%% The schema and connection nodes are drawn as inputs to the vector store.
ES_SCHEMA --> ES_VECT
|
||
ES_CONN --> ES_VECT
|
||
|
||
%% Dotted edge from NLP into the GraphRAG subgraph; the label marks it as parallel (async).
NLP -.-> |"并行 (async)"| GRAPHRAG
|
||
%% The light extractor connects directly to graph search.
G_LIGHT --> G_SEARCH
|
||
%% The general extractor reaches graph search through both clustering and entity resolution.
G_GEN --> G_LEIDEN
|
||
G_GEN --> G_RESOLVE
|
||
G_LEIDEN --> G_SEARCH
|
||
G_RESOLVE --> G_SEARCH
|
||
|
||
%% The task node drives the dispatcher directly (solid edge).
CELERY --> NAIVE
|
||
%% Dotted edge into the GraphRAG subgraph; the label reads "trigger".
CELERY -.-> |"触发"| GRAPHRAG
|
||
|
||
%% Query path: workflow node, query understanding, hybrid search, knowledge retrieval.
%% NOTE(review): in the visible part of the file no edge links the VDB nodes (ES_VECT etc.) to
%% DEALER or KNOWLEDGE, so the indexing and query paths are not connected - confirm whether that is intended.
WORKFLOW --> QRYR
|
||
QRYR --> DEALER
|
||
DEALER --> KNOWLEDGE
|
||
KNOWLEDGE --> RERANK_M
|
||
%% Graph search results join at knowledge retrieval; the edge label reads "GRAPH mode".
G_SEARCH --> |"GRAPH模式"| KNOWLEDGE
|
||
%% Rerank model to rerank providers, then on to prompt generation.
RERANK_M --> RERANK_P
|
||
RERANK_P --> PGEN
|
||
%% Prompt generator to template loader, then to the chat interface.
PGEN --> PTEMPLATE
|
||
PTEMPLATE --> CHAT
|
||
%% The chat node has two outgoing edges: its provider factory and citation insertion.
CHAT --> CHAT_PROV
|
||
CHAT --> CITE
|
||
%% Citation insertion is drawn feeding the cache layer.
CITE --> CACHE
|