Merge branch 'feature/node-aggregation' of github.com:SuanmoSuanyangTechnology/MemoryBear into feature/node-aggregation

This commit is contained in:
lanceyq
2026-03-16 13:11:12 +08:00
7 changed files with 342 additions and 49 deletions

View File

@@ -113,6 +113,7 @@ celery_app.conf.update(
'app.tasks.run_forgetting_cycle_task': {'queue': 'periodic_tasks'},
'app.tasks.write_all_workspaces_memory_task': {'queue': 'periodic_tasks'},
'app.tasks.update_implicit_emotions_storage': {'queue': 'periodic_tasks'},
'app.tasks.init_community_clustering_for_users': {'queue': 'periodic_tasks'},
},
)

View File

@@ -177,7 +177,19 @@ async def get_workspace_end_users(
await aio_redis_set(cache_key, json.dumps(result), expire=30)
except Exception as e:
api_logger.warning(f"Redis 缓存写入失败: {str(e)}")
# 触发社区聚类补全任务(异步,不阻塞接口响应)
# 对有 ExtractedEntity 但无 Community 节点的存量用户自动补跑全量聚类
try:
from app.tasks import init_community_clustering_for_users
init_community_clustering_for_users.apply_async(
kwargs={"end_user_ids": end_user_ids},
queue="periodic_tasks",
)
api_logger.info(f"已触发社区聚类补全任务,候选用户数: {len(end_user_ids)}")
except Exception as e:
api_logger.warning(f"触发社区聚类补全任务失败(不影响主流程): {str(e)}")
api_logger.info(f"成功获取 {len(end_users)} 个宿主记录")
return success(data=result, msg="宿主列表获取成功")

View File

@@ -17,6 +17,7 @@ from app.services.user_memory_service import (
UserMemoryService,
analytics_memory_types,
analytics_graph_data,
analytics_community_graph_data,
)
from app.services.memory_entity_relationship_service import MemoryEntityService,MemoryEmotion,MemoryInteraction
from app.schemas.response_schema import ApiResponse
@@ -295,6 +296,42 @@ async def get_graph_data_api(
return fail(BizCode.INTERNAL_ERROR, "图数据查询失败", str(e))
@router.get("/analytics/community_graph", response_model=ApiResponse)
async def get_community_graph_data_api(
end_user_id: str,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db),
) -> dict:
workspace_id = current_user.current_workspace_id
if workspace_id is None:
api_logger.warning(f"用户 {current_user.username} 尝试查询社区图谱但未选择工作空间")
return fail(BizCode.INVALID_PARAMETER, "请先切换到一个工作空间", "current_workspace_id is None")
api_logger.info(
f"社区图谱查询请求: end_user_id={end_user_id}, user={current_user.username}, "
f"workspace={workspace_id}"
)
try:
result = await analytics_community_graph_data(db=db, end_user_id=end_user_id)
if "message" in result and result["statistics"]["total_nodes"] == 0:
api_logger.warning(f"社区图谱查询返回空结果: {result.get('message')}")
return success(data=result, msg=result.get("message", "查询成功"))
api_logger.info(
f"成功获取社区图谱: end_user_id={end_user_id}, "
f"nodes={result['statistics']['total_nodes']}, "
f"edges={result['statistics']['total_edges']}"
)
return success(data=result, msg="查询成功")
except Exception as e:
api_logger.error(f"社区图谱查询失败: end_user_id={end_user_id}, error={str(e)}")
return fail(BizCode.INTERNAL_ERROR, "社区图谱查询失败", str(e))
@router.get("/read_end_user/profile", response_model=ApiResponse)
async def get_end_user_profile(
end_user_id: str,

View File

@@ -196,6 +196,7 @@ class LabelPropagationEngine:
await self._evaluate_merge(all_community_ids, end_user_id)
logger.info(
f"[Clustering] 全量聚类完成,合并前 {pre_merge_count} 个社区,"
f"[Clustering] 全量聚类完成,合并前 {pre_merge_count} 个社区,"
f"{len(labels)} 个实体"
)
@@ -265,6 +266,7 @@ class LabelPropagationEngine:
logger.debug(
f"[Clustering] 新实体 {entity_id}{len(neighbors)} 个无社区邻居 → 新社区 {new_cid}"
)
await self._generate_community_metadata(new_cid, end_user_id)
else:
# 加入得票最多的社区
await self.repo.assign_entity_to_community(entity_id, target_cid, end_user_id)
@@ -276,6 +278,7 @@ class LabelPropagationEngine:
await self._evaluate_merge(
list(community_ids_in_neighbors), end_user_id
)
await self._generate_community_metadata(target_cid, end_user_id)
async def _evaluate_merge(
self, community_ids: List[str], end_user_id: str
@@ -285,30 +288,50 @@ class LabelPropagationEngine:
策略:计算各社区成员 embedding 的平均向量,若两两余弦相似度 > 0.75 则合并。
合并时保留成员数最多的社区,其余成员迁移过来。
全量场景(社区数 > 20使用批量查询避免 N 次数据库往返。
"""
MERGE_THRESHOLD = 0.75
MERGE_THRESHOLD = 0.85
BATCH_THRESHOLD = 20 # 超过此数量走批量查询
community_embeddings: Dict[str, Optional[List[float]]] = {}
community_sizes: Dict[str, int] = {}
for cid in community_ids:
members = await self.repo.get_community_members(cid, end_user_id)
community_sizes[cid] = len(members)
# 计算社区成员 embedding 的平均向量
valid_embeddings = [
m["name_embedding"]
for m in members
if m.get("name_embedding")
]
if valid_embeddings:
dim = len(valid_embeddings[0])
avg = [
sum(e[i] for e in valid_embeddings) / len(valid_embeddings)
for i in range(dim)
if len(community_ids) > BATCH_THRESHOLD:
# 批量查询:一次拉取所有社区成员
all_members = await self.repo.get_all_community_members_batch(
community_ids, end_user_id
)
for cid in community_ids:
members = all_members.get(cid, [])
community_sizes[cid] = len(members)
valid_embeddings = [
m["name_embedding"] for m in members if m.get("name_embedding")
]
community_embeddings[cid] = avg
else:
community_embeddings[cid] = None
if valid_embeddings:
dim = len(valid_embeddings[0])
community_embeddings[cid] = [
sum(e[i] for e in valid_embeddings) / len(valid_embeddings)
for i in range(dim)
]
else:
community_embeddings[cid] = None
else:
# 增量场景:逐个查询
for cid in community_ids:
members = await self.repo.get_community_members(cid, end_user_id)
community_sizes[cid] = len(members)
valid_embeddings = [
m["name_embedding"] for m in members if m.get("name_embedding")
]
if valid_embeddings:
dim = len(valid_embeddings[0])
community_embeddings[cid] = [
sum(e[i] for e in valid_embeddings) / len(valid_embeddings)
for i in range(dim)
]
else:
community_embeddings[cid] = None
# 找出应合并的社区对
to_merge: List[tuple] = []
@@ -322,18 +345,67 @@ class LabelPropagationEngine:
if sim > MERGE_THRESHOLD:
to_merge.append((cids[i], cids[j]))
logger.info(f"[Clustering] 发现 {len(to_merge)} 对可合并社区")
# 执行合并:逐对处理,每次合并后重新计算合并社区的平均向量
# 避免 union-find 链式传递导致语义不相关的社区被间接合并
# A≈B、B≈C 不代表 A≈C不能因传递性把 A/B/C 全部合并)
merged_into: Dict[str, str] = {} # dissolve → keep 的最终映射
def get_root(x: str) -> str:
"""路径压缩,找到 x 当前所属的根社区。"""
while x in merged_into:
merged_into[x] = merged_into.get(merged_into[x], merged_into[x])
x = merged_into[x]
return x
for c1, c2 in to_merge:
keep = c1 if community_sizes.get(c1, 0) >= community_sizes.get(c2, 0) else c2
dissolve = c2 if keep == c1 else c1
root1, root2 = get_root(c1), get_root(c2)
if root1 == root2:
continue
# 用合并后的最新平均向量重新验证相似度
# 防止链式传递A≈B 合并后 B 的向量已更新C 必须和新 B 相似才能合并
current_sim = _cosine_similarity(
community_embeddings.get(root1),
community_embeddings.get(root2),
)
if current_sim <= MERGE_THRESHOLD:
# 合并后向量已漂移,不再满足阈值,跳过
logger.debug(
f"[Clustering] 跳过合并 {root1}{root2}"
f"当前相似度 {current_sim:.3f}{MERGE_THRESHOLD}"
)
continue
keep = root1 if community_sizes.get(root1, 0) >= community_sizes.get(root2, 0) else root2
dissolve = root2 if keep == root1 else root1
merged_into[dissolve] = keep
members = await self.repo.get_community_members(dissolve, end_user_id)
for m in members:
await self.repo.assign_entity_to_community(
m["id"], keep, end_user_id
)
await self.repo.assign_entity_to_community(m["id"], keep, end_user_id)
# 合并后重新计算 keep 的平均向量(加权平均)
keep_emb = community_embeddings.get(keep)
dissolve_emb = community_embeddings.get(dissolve)
keep_size = community_sizes.get(keep, 0)
dissolve_size = community_sizes.get(dissolve, 0)
total_size = keep_size + dissolve_size
if keep_emb and dissolve_emb and total_size > 0:
dim = len(keep_emb)
community_embeddings[keep] = [
(keep_emb[i] * keep_size + dissolve_emb[i] * dissolve_size) / total_size
for i in range(dim)
]
community_embeddings[dissolve] = None
community_sizes[keep] = total_size
community_sizes[dissolve] = 0
await self.repo.refresh_member_count(keep, end_user_id)
logger.info(
f"[Clustering] 社区合并: {dissolve}{keep}"
f"迁移 {len(members)} 个成员"
f"相似度={current_sim:.3f}迁移 {len(members)} 个成员"
)
async def _flush_labels(

View File

@@ -21,6 +21,7 @@ from app.repositories.neo4j.cypher_queries import (
CHECK_USER_HAS_COMMUNITIES,
UPDATE_COMMUNITY_MEMBER_COUNT,
UPDATE_COMMUNITY_METADATA,
UPDATE_COMMUNITY_METADATA,
)
logger = logging.getLogger(__name__)
@@ -141,6 +142,25 @@ class CommunityRepository:
logger.error(f"get_community_members failed: {e}")
return []
async def get_all_community_members_batch(
self, community_ids: List[str], end_user_id: str
) -> Dict[str, List[Dict]]:
"""批量查询多个社区的成员,返回 {community_id: [members]} 字典。"""
try:
rows = await self.connector.execute_query(
GET_ALL_COMMUNITY_MEMBERS_BATCH,
community_ids=community_ids,
end_user_id=end_user_id,
)
result: Dict[str, List[Dict]] = {}
for row in rows:
cid = row["community_id"]
result.setdefault(cid, []).append(row)
return result
except Exception as e:
logger.error(f"get_all_community_members_batch failed: {e}")
return {}
async def has_communities(self, end_user_id: str) -> bool:
"""检查该用户是否已有 Community 节点(用于判断全量 vs 增量)。"""
try:

View File

@@ -1065,26 +1065,6 @@ Graph_Node_query = """
# Community 节点 & BELONGS_TO_COMMUNITY 边
# ============================================================
COMMUNITY_NODE_SAVE = """
MERGE (c:Community {community_id: $community_id})
SET c.end_user_id = $end_user_id,
c.formed_at = $formed_at,
c.updated_at = datetime(),
c.status = $status,
c.member_count = $member_count
RETURN c.community_id AS community_id
"""
COMMUNITY_ADD_MEMBER = """
MATCH (e:ExtractedEntity {id: $entity_id, end_user_id: $end_user_id})
MATCH (c:Community {community_id: $community_id, end_user_id: $end_user_id})
MERGE (e)-[:BELONGS_TO_COMMUNITY]->(c)
SET c.updated_at = datetime(),
c.member_count = $member_count
"""
# ─── Community 聚类相关 Cypher 模板 ───────────────────────────────────────────
COMMUNITY_NODE_UPSERT = """
@@ -1111,12 +1091,23 @@ DELETE r
GET_ENTITY_NEIGHBORS = """
MATCH (e:ExtractedEntity {id: $entity_id, end_user_id: $end_user_id})
OPTIONAL MATCH (e)-[:EXTRACTED_RELATIONSHIP]-(nb:ExtractedEntity {end_user_id: $end_user_id})
// 来源一直接关系邻居EXTRACTED_RELATIONSHIP 边)
OPTIONAL MATCH (e)-[:EXTRACTED_RELATIONSHIP]-(nb1:ExtractedEntity {end_user_id: $end_user_id})
// 来源二:同 Statement 共现邻居REFERENCES_ENTITY 边)
OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(nb2:ExtractedEntity {end_user_id: $end_user_id})
WHERE nb2.id <> e.id
WITH collect(DISTINCT nb1) + collect(DISTINCT nb2) AS all_neighbors
UNWIND all_neighbors AS nb
WITH nb WHERE nb IS NOT NULL
OPTIONAL MATCH (nb)-[:BELONGS_TO_COMMUNITY]->(c:Community)
RETURN DISTINCT
nb.id AS id,
nb.name AS name,
nb.name_embedding AS name_embedding,
nb.id AS id,
nb.name AS name,
nb.name_embedding AS name_embedding,
nb.activation_value AS activation_value,
CASE WHEN c IS NOT NULL THEN c.community_id ELSE null END AS community_id
"""

View File

@@ -1727,6 +1727,166 @@ async def analytics_graph_data(
# 辅助函数
async def analytics_community_graph_data(
db: Session,
end_user_id: str,
) -> Dict[str, Any]:
"""
获取社区图谱数据,包含 Community 节点、ExtractedEntity 节点及其关系。
Returns:
包含 nodes、edges、statistics 的字典,格式与 analytics_graph_data 一致
"""
try:
user_uuid = uuid.UUID(end_user_id)
repo = EndUserRepository(db)
end_user = repo.get_by_id(user_uuid)
if not end_user:
return {
"nodes": [], "edges": [],
"statistics": {"total_nodes": 0, "total_edges": 0, "node_types": {}, "edge_types": {}},
"message": "用户不存在"
}
# 查询社区节点、实体节点、BELONGS_TO_COMMUNITY 边、实体间关系
cypher = """
MATCH (c:Community {end_user_id: $end_user_id})
MATCH (e:ExtractedEntity {end_user_id: $end_user_id})-[b:BELONGS_TO_COMMUNITY]->(c)
OPTIONAL MATCH (e)-[r:EXTRACTED_RELATIONSHIP]-(e2:ExtractedEntity {end_user_id: $end_user_id})
RETURN
elementId(c) AS c_id,
properties(c) AS c_props,
elementId(e) AS e_id,
properties(e) AS e_props,
elementId(b) AS b_id,
elementId(e2) AS e2_id,
properties(e2) AS e2_props,
elementId(r) AS r_id,
type(r) AS r_type,
properties(r) AS r_props,
startNode(r) = e AS r_from_e
"""
rows = await _neo4j_connector.execute_query(cypher, end_user_id=end_user_id)
nodes_map: Dict[str, dict] = {}
edges_map: Dict[str, dict] = {}
# 记录每个 Community 对应的实体 id 列表
community_members: Dict[str, list] = {}
for row in rows:
# Community 节点
c_id = row["c_id"]
if c_id and c_id not in nodes_map:
raw = row["c_props"] or {}
props = {k: _clean_neo4j_value(raw.get(k)) for k in (
"community_id", "end_user_id", "member_count", "updated_at",
"name", "summary", "core_entities",
) if k in raw}
nodes_map[c_id] = {
"id": c_id,
"label": "Community",
"properties": props,
}
# ExtractedEntity 节点 (e)
e_id = row["e_id"]
if e_id and e_id not in nodes_map:
raw = row["e_props"] or {}
props = {k: _clean_neo4j_value(raw.get(k)) for k in (
"name", "end_user_id", "description", "created_at", "entity_type",
) if k in raw}
# 注入所属社区名称c 是 e 直接归属的社区)
c_raw = row["c_props"] or {}
props["community_name"] = _clean_neo4j_value(c_raw.get("name")) or ""
nodes_map[e_id] = {
"id": e_id,
"label": "ExtractedEntity",
"properties": props,
}
# ExtractedEntity 节点 (e2可选)
e2_id = row.get("e2_id")
if e2_id and e2_id not in nodes_map:
raw = row["e2_props"] or {}
props = {k: _clean_neo4j_value(raw.get(k)) for k in (
"name", "end_user_id", "description", "created_at", "entity_type",
) if k in raw}
# e2 的社区归属在后处理阶段通过 community_members 补充
props["community_name"] = ""
nodes_map[e2_id] = {
"id": e2_id,
"label": "ExtractedEntity",
"properties": props,
}
# BELONGS_TO_COMMUNITY 边
b_id = row["b_id"]
if b_id and b_id not in edges_map:
edges_map[b_id] = {
"id": b_id,
"source": e_id,
"target": c_id,
}
# 收集社区成员 id
if c_id and e_id:
community_members.setdefault(c_id, [])
if e_id not in community_members[c_id]:
community_members[c_id].append(e_id)
# EXTRACTED_RELATIONSHIP 边(可选)
r_id = row.get("r_id")
if r_id and r_id not in edges_map and e2_id:
r_props = {k: _clean_neo4j_value(v) for k, v in (row["r_props"] or {}).items()}
source = e_id if row.get("r_from_e") else e2_id
target = e2_id if row.get("r_from_e") else e_id
edges_map[r_id] = {
"id": r_id,
"source": source,
"target": target,
}
nodes = list(nodes_map.values())
edges = list(edges_map.values())
# 为每个 Community 节点注入 member_entity_ids同时补全 e2 节点的 community_name
for c_id, member_ids in community_members.items():
c_node = nodes_map.get(c_id)
if c_node:
c_node["properties"]["member_entity_ids"] = member_ids
c_name = c_node["properties"].get("name") or ""
# 补全属于该社区但 community_name 为空的实体(即 e2 节点)
for eid in member_ids:
e_node = nodes_map.get(eid)
if e_node and e_node["label"] == "ExtractedEntity":
if not e_node["properties"].get("community_name"):
e_node["properties"]["community_name"] = c_name
node_type_counts: Dict[str, int] = {}
for n in nodes:
node_type_counts[n["label"]] = node_type_counts.get(n["label"], 0) + 1
return {
"nodes": nodes,
"edges": edges,
"statistics": {
"total_nodes": len(nodes),
"total_edges": len(edges),
"node_types": node_type_counts,
}
}
except ValueError:
logger.error(f"无效的 end_user_id 格式: {end_user_id}")
return {
"nodes": [], "edges": [],
"statistics": {"total_nodes": 0, "total_edges": 0, "node_types": {}, "edge_types": {}},
"message": "无效的用户ID格式"
}
except Exception as e:
logger.error(f"获取社区图谱数据失败: {str(e)}", exc_info=True)
raise
async def _extract_node_properties(label: str, properties: Dict[str, Any],node_id: str) -> Dict[str, Any]:
"""
根据节点类型提取需要的属性字段