%% MemoryBear RAG end-to-end pipeline architecture (Mermaid flowchart)
%% Color convention: light blue = data source layer; light green = parsing & chunking;
%% light yellow = embedding & storage; light purple = retrieval; light orange = generation;
%% light gray = supporting components.
%% NOTE(review): no classDef/style statements are visible in this section, so the colors
%% above are not applied here -- confirm they are defined elsewhere in the file.
%% Each statement sits on its own line on purpose: a "%%" comment extends to the end of
%% its line, so sharing a line with a comment would disable the statement.
flowchart TB
    %% Loaders that produce the documents/files to be ingested.
    subgraph DATA_SOURCES["数据来源层 (Loader)"]
        CRAWLER["Web Crawler\ncrawler/web_crawler.py\n-> 输出: CrawledDocument"]
        FEISHU["飞书 API\nintegrations/feishu/client.py\n-> 输出: 本地文件 (.docx/.pdf)"]
        YUQUE["语雀 API\nintegrations/yuque/client.py\n-> 输出: 本地文件 (.md/.html/.xlsx)"]
        UPLOAD["用户上传\ncontrollers/document_controller.py:275\n-> 输出: 文件路径"]
    end

    %% Unified chunk entry point plus the per-format parsers it dispatches to.
    subgraph PARSER["文档解析与分块 (Parser + Chunking)"]
        NAIVE["app/naive.py:chunk()\n统一分块入口\nDispatch by filename extension"]
        PDFP["deepdoc/parser/pdf_parser.py\nOCR + Layout + Table"]
        DOCXP["deepdoc/parser/docx_parser.py"]
        HTMLP["deepdoc/parser/html_parser.py"]
        MDPP["deepdoc/parser/markdown_parser.py"]
        EXCELP["deepdoc/parser/excel_parser.py"]
        TXTPIP["deepdoc/parser/txt_parser.py"]
        VISION["deepdoc/vision/\nocr.py + layout_recognizer.py\n+ table_structure_recognizer.py"]
        NLP["nlp/__init__.py\ntokenize / naive_merge / hierarchical_merge"]
    end

    %% Document-type-specific chunking strategies.
    subgraph CHUNK_TYPES["文档类型适配 (Task Types)"]
        BOOK["app/book.py\n长文档分级分块"]
        PAPER["app/paper.py\n论文结构保持"]
        MANUAL["app/manual.py\n手册按节分块"]
        LAWS["app/laws.py\n法规层级树分块"]
        QA["app/qa.py\n问答对独立分块"]
        ONE["app/one.py\n整文件单块"]
        PIC["app/picture.py\nOCR + VLM描述"]
        AUD["app/audio.py\n语音转文本"]
    end

    %% Embedding model base class and its provider implementations.
    subgraph EMBED["向量化 (Embedding)"]
        EMB_BASE["llm/embedding_model.py\nBase.encode(texts: list)\n→ (np.array, token_count)"]
        EMB_PROV["Provider 工厂\nOpenAI / LocalAI / Azure / Tongyi /\nHuggingFace / Xinference / VolcEngine /\nGPUStack / NVIDIA / BaiChuan"]
    end

    %% Elasticsearch-backed vector store, its connection helper and field schema.
    subgraph VDB["向量数据库 (VDB)"]
        ES_VECT["vdb/elasticsearch/elasticsearch_vector.py\nDense + Sparse 混合索引\ncosineSimilarity + BM25"]
        ES_CONN["utils/es_conn.py\nES 连接管理"]
        ES_SCHEMA["vdb/field.py\npage_content / metadata / vector / text\n+ doc_id / knowledge_id / sort_id"]
    end

    %% Knowledge-graph extraction, clustering, entity resolution and graph search.
    subgraph GRAPHRAG["知识图谱 (GraphRAG)"]
        G_LIGHT["graphrag/light/\ngraph_extractor.py\n实体+关系抽取\n→ nx.Graph"]
        G_GEN["graphrag/general/\ngraph_extractor.py\n→ community_reports_extractor.py\n+ mind_map_extractor.py"]
        G_LEIDEN["general/leiden.py\n层次聚类"]
        G_RESOLVE["entity_resolution.py\n实体消歧 LLM 匹配"]
        G_SEARCH["graphrag/search.py\nKGSearch.retrieval()\nQuery分析→实体检索→N-hop→社区报告"]
    end

    %% Query understanding and hybrid (BM25 + vector) retrieval.
    subgraph RETRIEVAL["检索 (Retrieval)"]
        DEALER["nlp/search.py\nDealer.search()\nHybrid: BM25 0.05 + Vector 0.95"]
        QRYR["nlp/query.py\nQuery理解 / 关键词扩展"]
        KNOWLEDGE["nlp/search.py:36\nknowledge_retrieval()\n→ 多知识库合并"]
    end

    %% Reranking model wrapper and its providers.
    subgraph RERANK["重排序 (Reranking)"]
        RERANK_M["models/rerank.py\nRedBearRerank\ncompress_documents() / rerank()"]
        RERANK_P["Provider: JinaRerank /\nDashScopeRerank /\nXINFERENCE / GPUSTACK"]
    end

    %% Prompt generators and template loading.
    subgraph PROMPT["Prompt 组装"]
        PGEN["prompts/generator.py\ncitation_prompt / keyword_extraction /\nfull_question / content_tagging /\ntoc_relevance / structured_output"]
        PTEMPLATE["prompts/template.py\n加载 .md 模板文件"]
    end

    %% Chat model base class and its provider implementations.
    subgraph LLM["LLM 生成"]
        CHAT["llm/chat_model.py\nBase.chat() / chat_streamly()\n→ (str, tokens)"]
        CHAT_PROV["Provider 工厂\nOpenAI / Azure / LocalAI /\nXinference / Tongyi /\nHuggingFace / GPUStack / VolcEngine"]
    end

    %% Task and workflow entry points that drive the pipeline.
    subgraph ORCH["编排层 (Orchestration)"]
        CELERY["tasks.py\nparse_document() /\nbuild_graphrag_for_kb() /\nbuild_graphrag_for_document()"]
        WORKFLOW["workflow/nodes/knowledge/node.py\nKnowledgeRetrievalNode.execute()\n→ 检索→去重→重排→返回 chunks"]
    end

    %% Post-processing: citation insertion and result caching.
    subgraph POST["后处理"]
        CITE["插入引用标注\nDealer.insert_citations()\npagerank*sim 评分"]
        CACHE["缓存层\nutils/redis_conn.py\nLLM 结果缓存"]
    end

    %% === Data flow ===

    %% Ingestion: sources -> chunk entry point -> per-extension parser -> NLP merge step.
    DATA_SOURCES --> NAIVE
    NAIVE --> |PDF| PDFP
    NAIVE --> |DOCX| DOCXP
    NAIVE --> |HTML| HTMLP
    NAIVE --> |MD| MDPP
    NAIVE --> |XLSX| EXCELP
    NAIVE --> |TXT| TXTPIP
    PDFP --> VISION
    VISION --> NLP
    DOCXP --> NLP
    HTMLP --> NLP
    MDPP --> NLP
    EXCELP --> NLP
    TXTPIP --> NLP
    NAIVE --> |按文档类型| CHUNK_TYPES
    CHUNK_TYPES --> NLP

    %% Embedding and storage.
    NLP --> EMB_BASE
    EMB_BASE --> EMB_PROV
    EMB_PROV --> ES_VECT
    ES_SCHEMA --> ES_VECT
    ES_CONN --> ES_VECT

    %% Knowledge-graph branch (dotted edges = asynchronous / triggered).
    NLP -.-> |"并行 (async)"| GRAPHRAG
    G_LIGHT --> G_SEARCH
    G_GEN --> G_LEIDEN
    G_GEN --> G_RESOLVE
    G_LEIDEN --> G_SEARCH
    G_RESOLVE --> G_SEARCH

    %% Orchestration entry points.
    CELERY --> NAIVE
    CELERY -.-> |"触发"| GRAPHRAG

    %% Query path: retrieval -> rerank -> prompt -> LLM -> post-processing.
    WORKFLOW --> QRYR
    QRYR --> DEALER
    DEALER --> KNOWLEDGE
    KNOWLEDGE --> RERANK_M
    G_SEARCH --> |"GRAPH模式"| KNOWLEDGE
    RERANK_M --> RERANK_P
    RERANK_P --> PGEN
    PGEN --> PTEMPLATE
    PTEMPLATE --> CHAT
    CHAT --> CHAT_PROV
    CHAT --> CITE
    CITE --> CACHE