Merge remote-tracking branch 'upstream/develop' into feature/app-share-wxy

# Conflicts: # api/app/services/app_dsl_service.py
2026-03-16 17:54:01 +08:00
parent 3498e2e884 b62c40dba3
commit e21f713de0
77 changed files with 3913 additions and 567 deletions
--- a/api/app/services/app_dsl_service.py
+++ b/api/app/services/app_dsl_service.py
@@ -19,6 +19,7 @@ from app.models.tool_model import ToolConfig as ToolConfigModel
 from app.models.workflow_model import WorkflowConfig
 from app.services.workflow_service import WorkflowService
 from app.core.workflow.adapters.memory_bear.memory_bear_adapter import MemoryBearAdapter
+from app.models.memory_config_model import MemoryConfig as MemoryConfigModel


 class AppDslService:
@@ -423,9 +424,19 @@ class AppDslService:
        config_id = memory.get("memory_config_id") or memory.get("memory_content")
        if not config_id:
            return memory
-        from app.models.memory_config_model import MemoryConfig as MemoryConfigModel
+        try:
+            config_uuid = uuid.UUID(str(config_id))
+        except (ValueError, AttributeError):
+            exists = self.db.query(MemoryConfigModel).filter(
+                MemoryConfigModel.config_id_old == int(config_id),
+                MemoryConfigModel.workspace_id == workspace_id
+            ).first()
+            if not exists:
+                warnings.append(f"记忆配置 '{config_id}' 未匹配，已置空，请导入后手动配置")
+                return {**memory, "memory_config_id": None, "enabled": False}
+            return memory
        exists = self.db.query(MemoryConfigModel).filter(
-            MemoryConfigModel.config_id == config_id,
+            MemoryConfigModel.config_id == config_uuid,
            MemoryConfigModel.workspace_id == workspace_id
        ).first()
        if not exists:
--- a/api/app/services/draft_run_service.py
+++ b/api/app/services/draft_run_service.py
@@ -98,7 +98,7 @@ def create_long_term_memory_tool(
        **重要：如果用户的问题可以直接回答，不要调用此工具。只在确实需要历史信息时才使用。**

        Args:
-            question: 需要检索的问题（保持原问题的核心语义，使用清晰的关键词）
+            question: 需要检索的问题（保持原问题的核心语义，使用清晰的关键词，第三人称描述的偏好、行为通常指用户本人，比如（我，本人，在下，自己，咱，鄙人，吴，余）通指用户）

        Returns:
            检索到的历史记忆内容
--- a/api/app/services/memory_dashboard_service.py
+++ b/api/app/services/memory_dashboard_service.py
@@ -535,7 +535,8 @@ def get_users_total_chunk_batch(

 def get_rag_content(
    end_user_id: str,
-    limit: int,
+    page: int,
+    pagesize: int,
    db: Session,
    current_user: User
 ) -> dict:
@@ -543,9 +544,9 @@ def get_rag_content(
    先在documents表中查询file_name=='end_user_id'+'.txt'的id和kb_id,
    然后调用/chunks/{kb_id}/{document_id}/chunks接口的相关代码获取所有内容，
    接着对获取的内容进行提取，只要page_content的内容，
-    最后返回数据
+    最后返回分页数据
    """
-    business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, limit={limit}, 操作者: {current_user.username}")
+    business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, page={page}, pagesize={pagesize}, 操作者: {current_user.username}")
    
    try:
        from app.models.document_model import Document
@@ -562,63 +563,76 @@ def get_rag_content(
        if not documents:
            business_logger.warning(f"未找到文件: {file_name}")
            return {
-                "total": 0,
-                "contents": []
+                "page": {
+                    "page": page,
+                    "pagesize": pagesize,
+                    "total": 0,
+                    "hasnext": False,
+                },
+                "items": []
            }
        
        business_logger.info(f"找到 {len(documents)} 个文档记录")
        
-        # 3. 获取所有chunks的page_content
-        all_contents = []
-        total_chunks = 0
+        # 3. 按全局偏移量计算当前页数据
+        # 全局偏移范围：[offset_start, offset_end)
+        offset_start = (page - 1) * pagesize
+        offset_end = offset_start + pagesize
+        
+        global_total = 0    # 所有文档的 chunk 总数
+        page_contents = []  # 当前页的内容
        
        for document in documents:
            try:
-                # 获取知识库信息
                kb = knowledge_repository.get_knowledge_by_id(db, document.kb_id)
                if not kb:
                    business_logger.warning(f"知识库不存在: kb_id={document.kb_id}")
                    continue
                
-                # 初始化向量服务
                vector_service = ElasticSearchVectorFactory().init_vector(knowledge=kb)
                
-                # 获取该文档的所有chunks（分页获取）
-                page = 1
-                pagesize = 100  # 每页100条
+                # 先用 pagesize=1 获取该文档的 chunk 总数
+                doc_total, _ = vector_service.search_by_segment(
+                    document_id=str(document.id),
+                    query=None,
+                    pagesize=1,
+                    page=1,
+                    asc=True
+                )
                
-                while True:
-                    total, items = vector_service.search_by_segment(
+                doc_offset_start = global_total            # 该文档在全局中的起始偏移
+                doc_offset_end = global_total + doc_total  # 该文档在全局中的结束偏移
+                global_total += doc_total
+                
+                # 当前页与该文档无交集，跳过
+                if doc_offset_end <= offset_start or doc_offset_start >= offset_end:
+                    continue
+                
+                # 计算需要从该文档取的局部范围
+                local_start = max(offset_start - doc_offset_start, 0)
+                local_end = min(offset_end - doc_offset_start, doc_total)
+                need_count = local_end - local_start
+                
+                # 换算成 ES 分页参数（ES page 从1开始）
+                es_page = (local_start // pagesize) + 1
+                es_offset_in_page = local_start % pagesize
+                
+                fetched = []
+                while len(fetched) < es_offset_in_page + need_count:
+                    _, items = vector_service.search_by_segment(
                        document_id=str(document.id),
                        query=None,
                        pagesize=pagesize,
-                        page=page,
+                        page=es_page,
                        asc=True
                    )
-                    
                    if not items:
                        break
-                    
-                    # 提取page_content
-                    for item in items:
-                        all_contents.append(item.page_content)
-                        total_chunks += 1
-                        
-                        # # 如果达到limit限制，直接返回
-                        # if limit > 0 and total_chunks >= limit:
-                        #     business_logger.info(f"已达到limit限制: {limit}")
-                        #     return {
-                        #         "total": total_chunks,
-                        #         "contents": all_contents[:limit]
-                        #     }
-                    
-                    # 检查是否还有下一页
-                    if page * pagesize >= total:
-                        break
-                    
-                    page += 1
+                    fetched.extend(items)
+                    es_page += 1
                
-                business_logger.info(f"文档 {document.id} 获取了 {len(items)} 个chunks")
+                slice_items = fetched[es_offset_in_page: es_offset_in_page + need_count]
+                page_contents.extend([item.page_content for item in slice_items])
                
            except Exception as e:
                business_logger.error(f"获取文档 {document.id} 的chunks失败: {str(e)}")
@@ -626,11 +640,16 @@ def get_rag_content(
        
        # 4. 返回结果
        result = {
-            "total": total_chunks,
-            "contents": all_contents[:limit] if limit > 0 else all_contents
+            "page": {
+                "page": page,
+                "pagesize": pagesize,
+                "total": global_total,
+                "hasnext": offset_end < global_total,
+            },
+            "items": page_contents
        }
        
-        business_logger.info(f"成功获取RAG内容: total={total_chunks}, 返回={len(result['contents'])} 条")
+        business_logger.info(f"成功获取RAG内容: total={global_total}, page={page}, 返回={len(page_contents)} 条")
        return result
        
    except Exception as e:
@@ -730,8 +749,8 @@ async def generate_rag_profile(
    if not end_user:
        raise ValueError(f"end_user {end_user_id} 不存在")

-    rag_content = get_rag_content(end_user_id, limit, db, current_user)
-    chunks = rag_content.get("contents", [])
+    rag_content = get_rag_content(end_user_id, page=1, pagesize=limit, db=db, current_user=current_user)
+    chunks = rag_content.get("items", [])

    if not chunks:
        business_logger.warning(f"未找到chunk内容，无法生产RAG画像: end_user_id={end_user_id}")
--- a/api/app/services/user_memory_service.py
+++ b/api/app/services/user_memory_service.py
@@ -1727,6 +1727,150 @@ async def analytics_graph_data(

 # 辅助函数

+async def analytics_community_graph_data(
+    db: Session,
+    end_user_id: str,
+) -> Dict[str, Any]:
+    """
+    Build the community graph for one end user: Community / ExtractedEntity nodes plus their edges, de-duplicated by id.
+    A ValueError raised anywhere in the body (e.g. a non-UUID end_user_id) yields an empty payload; other errors are logged and re-raised.
+    Returns:
+        Dict with "nodes", "edges", "statistics" (+ "message" on empty results). NOTE(review): success "statistics" has no "edge_types" key.
+    """
+    try:
+        user_uuid = uuid.UUID(end_user_id)
+        repo = EndUserRepository(db)
+        end_user = repo.get_by_id(user_uuid)
+        if not end_user:
+            return {
+                "nodes": [], "edges": [],
+                "statistics": {"total_nodes": 0, "total_edges": 0, "node_types": {}, "edge_types": {}},
+                "message": "用户不存在"
+            }
+
+        # Run the GET_COMMUNITY_GRAPH_DATA query; rows are read below via the c_*, e_*, e2_*, b_id and r_* keys
+        from app.repositories.neo4j.cypher_queries import GET_COMMUNITY_GRAPH_DATA
+        rows = await _neo4j_connector.execute_query(GET_COMMUNITY_GRAPH_DATA, end_user_id=end_user_id)
+
+        nodes_map: Dict[str, dict] = {}
+        edges_map: Dict[str, dict] = {}
+        # Community id -> list of member entity ids (insertion order, no duplicates)
+        community_members: Dict[str, list] = {}
+
+        for row in rows:
+            # Community node: keep only the whitelisted properties, first occurrence wins
+            c_id = row["c_id"]
+            if c_id and c_id not in nodes_map:
+                raw = row["c_props"] or {}
+                props = {k: _clean_neo4j_value(raw.get(k)) for k in (
+                    "community_id", "end_user_id", "member_count", "updated_at",
+                    "name", "summary", "core_entities",
+                ) if k in raw}
+                nodes_map[c_id] = {
+                    "id": c_id,
+                    "label": "Community",
+                    "properties": props,
+                }
+
+            # ExtractedEntity node (e): the entity paired with community c in this row
+            e_id = row["e_id"]
+            if e_id and e_id not in nodes_map:
+                raw = row["e_props"] or {}
+                props = {k: _clean_neo4j_value(raw.get(k)) for k in (
+                    "name", "end_user_id", "description", "created_at", "entity_type",
+                ) if k in raw}
+                # Inject the community name from this row's c_props ("" when the name is missing or falsy)
+                c_raw = row["c_props"] or {}
+                props["community_name"] = _clean_neo4j_value(c_raw.get("name")) or ""
+                nodes_map[e_id] = {
+                    "id": e_id,
+                    "label": "ExtractedEntity",
+                    "properties": props,
+                }
+
+            # ExtractedEntity node (e2, optional): the other endpoint of a relationship, read with .get()
+            e2_id = row.get("e2_id")
+            if e2_id and e2_id not in nodes_map:
+                raw = row["e2_props"] or {}
+                props = {k: _clean_neo4j_value(raw.get(k)) for k in (
+                    "name", "end_user_id", "description", "created_at", "entity_type",
+                ) if k in raw}
+                # Left empty here; filled in the post-processing loop only if e2 also appears in community_members
+                props["community_name"] = ""
+                nodes_map[e2_id] = {
+                    "id": e2_id,
+                    "label": "ExtractedEntity",
+                    "properties": props,
+                }
+
+            # BELONGS_TO_COMMUNITY edge: entity e -> community c, keyed by b_id
+            b_id = row["b_id"]
+            if b_id and b_id not in edges_map:
+                edges_map[b_id] = {
+                    "id": b_id,
+                    "source": e_id,
+                    "target": c_id,
+                }
+            # Record e as a member of c (list membership test keeps ids unique)
+            if c_id and e_id:
+                community_members.setdefault(c_id, [])
+                if e_id not in community_members[c_id]:
+                    community_members[c_id].append(e_id)
+
+            # Optional EXTRACTED_RELATIONSHIP edge; r_from_e picks the direction. NOTE(review): r_props is built but never used.
+            r_id = row.get("r_id")
+            if r_id and r_id not in edges_map and e2_id:
+                r_props = {k: _clean_neo4j_value(v) for k, v in (row["r_props"] or {}).items()}
+                source = e_id if row.get("r_from_e") else e2_id
+                target = e2_id if row.get("r_from_e") else e_id
+                edges_map[r_id] = {
+                    "id": r_id,
+                    "source": source,
+                    "target": target,
+                }
+
+        nodes = list(nodes_map.values())
+        edges = list(edges_map.values())
+
+        # Attach member_entity_ids to each Community node and backfill empty community_name on its member entities
+        for c_id, member_ids in community_members.items():
+            c_node = nodes_map.get(c_id)
+            if c_node:
+                c_node["properties"]["member_entity_ids"] = member_ids
+                c_name = c_node["properties"].get("name") or ""
+                # Only entities whose community_name is still empty are updated (e.g. nodes first seen as e2)
+                for eid in member_ids:
+                    e_node = nodes_map.get(eid)
+                    if e_node and e_node["label"] == "ExtractedEntity":
+                        if not e_node["properties"].get("community_name"):
+                            e_node["properties"]["community_name"] = c_name
+
+        node_type_counts: Dict[str, int] = {}
+        for n in nodes:
+            node_type_counts[n["label"]] = node_type_counts.get(n["label"], 0) + 1
+
+        return {
+            "nodes": nodes,
+            "edges": edges,
+            "statistics": {
+                "total_nodes": len(nodes),
+                "total_edges": len(edges),
+                "node_types": node_type_counts,
+            }
+        }
+
+    except ValueError:
+        logger.error(f"无效的 end_user_id 格式: {end_user_id}")
+        return {
+            "nodes": [], "edges": [],
+            "statistics": {"total_nodes": 0, "total_edges": 0, "node_types": {}, "edge_types": {}},
+            "message": "无效的用户ID格式"
+        }
+    except Exception as e:
+        logger.error(f"获取社区图谱数据失败: {str(e)}", exc_info=True)
+        raise
+
+
 async  def _extract_node_properties(label: str, properties: Dict[str, Any],node_id: str) -> Dict[str, Any]:
    """
    根据节点类型提取需要的属性字段