Merge remote-tracking branch 'upstream/develop' into feature/app-share-wxy
# Conflicts: # api/app/services/app_dsl_service.py
This commit is contained in:
@@ -19,6 +19,7 @@ from app.models.tool_model import ToolConfig as ToolConfigModel
|
||||
from app.models.workflow_model import WorkflowConfig
|
||||
from app.services.workflow_service import WorkflowService
|
||||
from app.core.workflow.adapters.memory_bear.memory_bear_adapter import MemoryBearAdapter
|
||||
from app.models.memory_config_model import MemoryConfig as MemoryConfigModel
|
||||
|
||||
|
||||
class AppDslService:
|
||||
@@ -423,9 +424,19 @@ class AppDslService:
|
||||
config_id = memory.get("memory_config_id") or memory.get("memory_content")
|
||||
if not config_id:
|
||||
return memory
|
||||
from app.models.memory_config_model import MemoryConfig as MemoryConfigModel
|
||||
try:
|
||||
config_uuid = uuid.UUID(str(config_id))
|
||||
except (ValueError, AttributeError):
|
||||
exists = self.db.query(MemoryConfigModel).filter(
|
||||
MemoryConfigModel.config_id_old == int(config_id),
|
||||
MemoryConfigModel.workspace_id == workspace_id
|
||||
).first()
|
||||
if not exists:
|
||||
warnings.append(f"记忆配置 '{config_id}' 未匹配,已置空,请导入后手动配置")
|
||||
return {**memory, "memory_config_id": None, "enabled": False}
|
||||
return memory
|
||||
exists = self.db.query(MemoryConfigModel).filter(
|
||||
MemoryConfigModel.config_id == config_id,
|
||||
MemoryConfigModel.config_id == config_uuid,
|
||||
MemoryConfigModel.workspace_id == workspace_id
|
||||
).first()
|
||||
if not exists:
|
||||
|
||||
@@ -98,7 +98,7 @@ def create_long_term_memory_tool(
|
||||
**重要:如果用户的问题可以直接回答,不要调用此工具。只在确实需要历史信息时才使用。**
|
||||
|
||||
Args:
|
||||
question: 需要检索的问题(保持原问题的核心语义,使用清晰的关键词)
|
||||
question: 需要检索的问题(保持原问题的核心语义,使用清晰的关键词,第三人称描述的偏好、行为通常指用户本人,比如(我,本人,在下,自己,咱,鄙人,吴,余)通指用户)
|
||||
|
||||
Returns:
|
||||
检索到的历史记忆内容
|
||||
|
||||
@@ -535,7 +535,8 @@ def get_users_total_chunk_batch(
|
||||
|
||||
def get_rag_content(
|
||||
end_user_id: str,
|
||||
limit: int,
|
||||
page: int,
|
||||
pagesize: int,
|
||||
db: Session,
|
||||
current_user: User
|
||||
) -> dict:
|
||||
@@ -543,9 +544,9 @@ def get_rag_content(
|
||||
先在documents表中查询file_name=='end_user_id'+'.txt'的id和kb_id,
|
||||
然后调用/chunks/{kb_id}/{document_id}/chunks接口的相关代码获取所有内容,
|
||||
接着对获取的内容进行提取,只要page_content的内容,
|
||||
最后返回数据
|
||||
最后返回分页数据
|
||||
"""
|
||||
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, limit={limit}, 操作者: {current_user.username}")
|
||||
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, page={page}, pagesize={pagesize}, 操作者: {current_user.username}")
|
||||
|
||||
try:
|
||||
from app.models.document_model import Document
|
||||
@@ -562,63 +563,76 @@ def get_rag_content(
|
||||
if not documents:
|
||||
business_logger.warning(f"未找到文件: {file_name}")
|
||||
return {
|
||||
"total": 0,
|
||||
"contents": []
|
||||
"page": {
|
||||
"page": page,
|
||||
"pagesize": pagesize,
|
||||
"total": 0,
|
||||
"hasnext": False,
|
||||
},
|
||||
"items": []
|
||||
}
|
||||
|
||||
business_logger.info(f"找到 {len(documents)} 个文档记录")
|
||||
|
||||
# 3. 获取所有chunks的page_content
|
||||
all_contents = []
|
||||
total_chunks = 0
|
||||
# 3. 按全局偏移量计算当前页数据
|
||||
# 全局偏移范围:[offset_start, offset_end)
|
||||
offset_start = (page - 1) * pagesize
|
||||
offset_end = offset_start + pagesize
|
||||
|
||||
global_total = 0 # 所有文档的 chunk 总数
|
||||
page_contents = [] # 当前页的内容
|
||||
|
||||
for document in documents:
|
||||
try:
|
||||
# 获取知识库信息
|
||||
kb = knowledge_repository.get_knowledge_by_id(db, document.kb_id)
|
||||
if not kb:
|
||||
business_logger.warning(f"知识库不存在: kb_id={document.kb_id}")
|
||||
continue
|
||||
|
||||
# 初始化向量服务
|
||||
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=kb)
|
||||
|
||||
# 获取该文档的所有chunks(分页获取)
|
||||
page = 1
|
||||
pagesize = 100 # 每页100条
|
||||
# 先用 pagesize=1 获取该文档的 chunk 总数
|
||||
doc_total, _ = vector_service.search_by_segment(
|
||||
document_id=str(document.id),
|
||||
query=None,
|
||||
pagesize=1,
|
||||
page=1,
|
||||
asc=True
|
||||
)
|
||||
|
||||
while True:
|
||||
total, items = vector_service.search_by_segment(
|
||||
doc_offset_start = global_total # 该文档在全局中的起始偏移
|
||||
doc_offset_end = global_total + doc_total # 该文档在全局中的结束偏移
|
||||
global_total += doc_total
|
||||
|
||||
# 当前页与该文档无交集,跳过
|
||||
if doc_offset_end <= offset_start or doc_offset_start >= offset_end:
|
||||
continue
|
||||
|
||||
# 计算需要从该文档取的局部范围
|
||||
local_start = max(offset_start - doc_offset_start, 0)
|
||||
local_end = min(offset_end - doc_offset_start, doc_total)
|
||||
need_count = local_end - local_start
|
||||
|
||||
# 换算成 ES 分页参数(ES page 从1开始)
|
||||
es_page = (local_start // pagesize) + 1
|
||||
es_offset_in_page = local_start % pagesize
|
||||
|
||||
fetched = []
|
||||
while len(fetched) < es_offset_in_page + need_count:
|
||||
_, items = vector_service.search_by_segment(
|
||||
document_id=str(document.id),
|
||||
query=None,
|
||||
pagesize=pagesize,
|
||||
page=page,
|
||||
page=es_page,
|
||||
asc=True
|
||||
)
|
||||
|
||||
if not items:
|
||||
break
|
||||
|
||||
# 提取page_content
|
||||
for item in items:
|
||||
all_contents.append(item.page_content)
|
||||
total_chunks += 1
|
||||
|
||||
# # 如果达到limit限制,直接返回
|
||||
# if limit > 0 and total_chunks >= limit:
|
||||
# business_logger.info(f"已达到limit限制: {limit}")
|
||||
# return {
|
||||
# "total": total_chunks,
|
||||
# "contents": all_contents[:limit]
|
||||
# }
|
||||
|
||||
# 检查是否还有下一页
|
||||
if page * pagesize >= total:
|
||||
break
|
||||
|
||||
page += 1
|
||||
fetched.extend(items)
|
||||
es_page += 1
|
||||
|
||||
business_logger.info(f"文档 {document.id} 获取了 {len(items)} 个chunks")
|
||||
slice_items = fetched[es_offset_in_page: es_offset_in_page + need_count]
|
||||
page_contents.extend([item.page_content for item in slice_items])
|
||||
|
||||
except Exception as e:
|
||||
business_logger.error(f"获取文档 {document.id} 的chunks失败: {str(e)}")
|
||||
@@ -626,11 +640,16 @@ def get_rag_content(
|
||||
|
||||
# 4. 返回结果
|
||||
result = {
|
||||
"total": total_chunks,
|
||||
"contents": all_contents[:limit] if limit > 0 else all_contents
|
||||
"page": {
|
||||
"page": page,
|
||||
"pagesize": pagesize,
|
||||
"total": global_total,
|
||||
"hasnext": offset_end < global_total,
|
||||
},
|
||||
"items": page_contents
|
||||
}
|
||||
|
||||
business_logger.info(f"成功获取RAG内容: total={total_chunks}, 返回={len(result['contents'])} 条")
|
||||
business_logger.info(f"成功获取RAG内容: total={global_total}, page={page}, 返回={len(page_contents)} 条")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
@@ -730,8 +749,8 @@ async def generate_rag_profile(
|
||||
if not end_user:
|
||||
raise ValueError(f"end_user {end_user_id} 不存在")
|
||||
|
||||
rag_content = get_rag_content(end_user_id, limit, db, current_user)
|
||||
chunks = rag_content.get("contents", [])
|
||||
rag_content = get_rag_content(end_user_id, page=1, pagesize=limit, db=db, current_user=current_user)
|
||||
chunks = rag_content.get("items", [])
|
||||
|
||||
if not chunks:
|
||||
business_logger.warning(f"未找到chunk内容,无法生产RAG画像: end_user_id={end_user_id}")
|
||||
|
||||
@@ -1727,6 +1727,150 @@ async def analytics_graph_data(
|
||||
|
||||
# 辅助函数
|
||||
|
||||
def _select_clean_props(raw, keys):
    """Copy the listed keys that are present in ``raw`` into a new dict.

    Each copied value is passed through ``_clean_neo4j_value``. ``raw`` may be
    ``None`` or empty, in which case an empty dict is returned.
    """
    raw = raw or {}
    return {key: _clean_neo4j_value(raw.get(key)) for key in keys if key in raw}


def _empty_community_graph(message: str) -> Dict[str, Any]:
    """Build the empty graph payload (no nodes, no edges) carrying ``message``."""
    return {
        "nodes": [], "edges": [],
        "statistics": {"total_nodes": 0, "total_edges": 0, "node_types": {}, "edge_types": {}},
        "message": message,
    }


async def analytics_community_graph_data(
    db: Session,
    end_user_id: str,
) -> Dict[str, Any]:
    """
    Fetch community graph data: Community nodes, ExtractedEntity nodes and
    the relationships between them.

    Args:
        db: Database session used to look up the end user.
        end_user_id: End user id; must parse as a UUID.

    Returns:
        A dict with ``nodes``, ``edges`` and ``statistics``
        (``total_nodes``, ``total_edges``, ``node_types``, ``edge_types``).
        When ``end_user_id`` is not a valid UUID or the user does not exist,
        an empty graph with an extra ``message`` key is returned instead.

    Raises:
        Exception: any error raised while querying or assembling the graph is
            logged and re-raised unchanged.
    """
    community_keys = (
        "community_id", "end_user_id", "member_count", "updated_at",
        "name", "summary", "core_entities",
    )
    entity_keys = ("name", "end_user_id", "description", "created_at", "entity_type")

    try:
        # Only the UUID parse may be reported as "invalid id format"; a
        # ValueError raised further down is a real failure and must surface.
        try:
            user_uuid = uuid.UUID(end_user_id)
        except ValueError:
            logger.error(f"无效的 end_user_id 格式: {end_user_id}")
            return _empty_community_graph("无效的用户ID格式")

        repo = EndUserRepository(db)
        end_user = repo.get_by_id(user_uuid)
        if not end_user:
            return _empty_community_graph("用户不存在")

        # Query community nodes, entity nodes, BELONGS_TO_COMMUNITY edges and
        # entity-to-entity relationships.
        from app.repositories.neo4j.cypher_queries import GET_COMMUNITY_GRAPH_DATA
        rows = await _neo4j_connector.execute_query(GET_COMMUNITY_GRAPH_DATA, end_user_id=end_user_id)

        nodes_map: Dict[str, dict] = {}
        edges_map: Dict[str, dict] = {}
        # Entity ids attached to each Community, in first-seen order.
        community_members: Dict[str, list] = {}
        # Edge counts per relationship kind. The kind names follow the
        # relationship names used in this function's query handling —
        # NOTE(review): confirm they match what analytics_graph_data reports.
        edge_type_counts: Dict[str, int] = {}

        for row in rows:
            # Community node
            c_id = row["c_id"]
            if c_id and c_id not in nodes_map:
                nodes_map[c_id] = {
                    "id": c_id,
                    "label": "Community",
                    "properties": _select_clean_props(row["c_props"], community_keys),
                }

            # ExtractedEntity node (e)
            e_id = row["e_id"]
            if e_id and e_id not in nodes_map:
                props = _select_clean_props(row["e_props"], entity_keys)
                # c is the community that e belongs to in this row.
                c_raw = row["c_props"] or {}
                props["community_name"] = _clean_neo4j_value(c_raw.get("name")) or ""
                nodes_map[e_id] = {
                    "id": e_id,
                    "label": "ExtractedEntity",
                    "properties": props,
                }

            # ExtractedEntity node (e2, optional)
            e2_id = row.get("e2_id")
            if e2_id and e2_id not in nodes_map:
                props = _select_clean_props(row["e2_props"], entity_keys)
                # Left empty here; filled in below if e2 turns out to be a
                # recorded member of some community.
                props["community_name"] = ""
                nodes_map[e2_id] = {
                    "id": e2_id,
                    "label": "ExtractedEntity",
                    "properties": props,
                }

            # BELONGS_TO_COMMUNITY edge
            b_id = row["b_id"]
            if b_id and b_id not in edges_map:
                edges_map[b_id] = {
                    "id": b_id,
                    "source": e_id,
                    "target": c_id,
                }
                edge_type_counts["BELONGS_TO_COMMUNITY"] = (
                    edge_type_counts.get("BELONGS_TO_COMMUNITY", 0) + 1
                )

            # Record community membership (de-duplicated, order preserved).
            if c_id and e_id:
                members = community_members.setdefault(c_id, [])
                if e_id not in members:
                    members.append(e_id)

            # EXTRACTED_RELATIONSHIP edge (optional)
            r_id = row.get("r_id")
            if r_id and r_id not in edges_map and e2_id:
                # r_from_e tells whether the relationship starts at e or at e2.
                if row.get("r_from_e"):
                    source, target = e_id, e2_id
                else:
                    source, target = e2_id, e_id
                edges_map[r_id] = {
                    "id": r_id,
                    "source": source,
                    "target": target,
                }
                edge_type_counts["EXTRACTED_RELATIONSHIP"] = (
                    edge_type_counts.get("EXTRACTED_RELATIONSHIP", 0) + 1
                )

        nodes = list(nodes_map.values())
        edges = list(edges_map.values())

        # Attach member_entity_ids to every Community node and back-fill the
        # community_name of member entities that still have it empty. The
        # node dicts are shared with ``nodes``, so mutating them here is
        # reflected in the returned list.
        for c_id, member_ids in community_members.items():
            c_node = nodes_map.get(c_id)
            if not c_node:
                continue
            c_node["properties"]["member_entity_ids"] = member_ids
            c_name = c_node["properties"].get("name") or ""
            for eid in member_ids:
                e_node = nodes_map.get(eid)
                if e_node and e_node["label"] == "ExtractedEntity":
                    if not e_node["properties"].get("community_name"):
                        e_node["properties"]["community_name"] = c_name

        node_type_counts: Dict[str, int] = {}
        for n in nodes:
            node_type_counts[n["label"]] = node_type_counts.get(n["label"], 0) + 1

        return {
            "nodes": nodes,
            "edges": edges,
            "statistics": {
                "total_nodes": len(nodes),
                "total_edges": len(edges),
                "node_types": node_type_counts,
                # Same key the empty payload exposes, so consumers can rely on it.
                "edge_types": edge_type_counts,
            }
        }

    except Exception as e:
        logger.error(f"获取社区图谱数据失败: {str(e)}", exc_info=True)
        raise
|
||||
|
||||
|
||||
async def _extract_node_properties(label: str, properties: Dict[str, Any],node_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
根据节点类型提取需要的属性字段
|
||||
|
||||
Reference in New Issue
Block a user