From f2d7479229593ada1da568297f155282ed76c8c1 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 9 Apr 2026 11:01:56 +0800 Subject: [PATCH 01/20] feat(memory): add async user metadata extraction pipeline - Add MetadataExtractor to collect user-related statements post-dedup and extract profile/behavioral metadata via independent LLM call - Add Celery task (extract_user_metadata) routed to memory_tasks queue - Add metadata models (UserMetadata, UserMetadataProfile, etc.) - Add metadata utility functions (clean, validate, merge with _op support) - Add Jinja2 prompt template for metadata extraction (zh/en) - Fix Lucene query parameter naming: rename `q` to `query` across all Cypher queries, graph_search functions, and callers - Escape `/` in Lucene queries to prevent TokenMgrError - Add `speaker` field to ChunkNode and persist it in Neo4j - Remove unused imports (argparse, os, UUID) in search.py - Fix unnecessary db context nesting in interest distribution task --- api/app/celery_app.py | 3 + .../nodes/perceptual_retrieve_node.py | 4 +- api/app/core/memory/models/__init__.py | 12 ++ api/app/core/memory/models/graph_models.py | 2 + api/app/core/memory/models/metadata_models.py | 40 ++++ api/app/core/memory/src/search.py | 12 +- .../extraction_orchestrator.py | 33 ++- .../metadata_extractor.py | 152 ++++++++++++++ .../triplet_extraction.py | 1 - .../storage_services/search/keyword_search.py | 4 +- api/app/core/memory/utils/data/text_utils.py | 4 +- api/app/core/memory/utils/metadata_utils.py | 179 +++++++++++++++++ .../prompt/prompts/extract_triplet.jinja2 | 8 + .../prompts/extract_user_metadata.jinja2 | 74 +++++++ api/app/repositories/neo4j/cypher_queries.py | 26 +-- api/app/repositories/neo4j/graph_search.py | 49 +++-- api/app/tasks.py | 189 +++++++++++++++--- 17 files changed, 714 insertions(+), 78 deletions(-) create mode 100644 api/app/core/memory/models/metadata_models.py create mode 100644 api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py create mode 100644 api/app/core/memory/utils/metadata_utils.py create mode 100644 api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 diff --git a/api/app/celery_app.py b/api/app/celery_app.py index 23fd82ed..0f8a197c 100644 --- a/api/app/celery_app.py +++ b/api/app/celery_app.py @@ -111,6 +111,9 @@ celery_app.conf.update( # Clustering tasks → memory_tasks queue (使用相同的 worker,避免 macOS fork 问题) 'app.tasks.run_incremental_clustering': {'queue': 'memory_tasks'}, + # Metadata extraction → memory_tasks queue + 'app.tasks.extract_user_metadata': {'queue': 'memory_tasks'}, + # Document tasks → document_tasks queue (prefork worker) 'app.core.rag.tasks.parse_document': {'queue': 'document_tasks'}, 'app.core.rag.tasks.build_graphrag_for_kb': {'queue': 'document_tasks'}, diff --git a/api/app/core/memory/agent/langgraph_graph/nodes/perceptual_retrieve_node.py b/api/app/core/memory/agent/langgraph_graph/nodes/perceptual_retrieve_node.py index f248afa5..1cf5e291 100644 --- a/api/app/core/memory/agent/langgraph_graph/nodes/perceptual_retrieve_node.py +++ b/api/app/core/memory/agent/langgraph_graph/nodes/perceptual_retrieve_node.py @@ -153,7 +153,7 @@ class PerceptualSearchService: return [] try: r = await search_perceptual( - connector=connector, q=escaped, + connector=connector, query=escaped, end_user_id=self.end_user_id, limit=limit * 5, # 多查一些以提高命中率 ) @@ -178,7 +178,7 @@ class PerceptualSearchService: if not escaped.strip(): return [] r = await search_perceptual( - connector=connector, q=escaped, + connector=connector, query=escaped, end_user_id=self.end_user_id, limit=limit, ) return r.get("perceptuals", []) diff --git a/api/app/core/memory/models/__init__.py b/api/app/core/memory/models/__init__.py index 41d08908..eed8e8c4 100644 --- a/api/app/core/memory/models/__init__.py +++ b/api/app/core/memory/models/__init__.py @@ -58,6 +58,14 @@ from app.core.memory.models.triplet_models import ( TripletExtractionResponse, ) +# User metadata models +from app.core.memory.models.metadata_models import ( + UserMetadata, + UserMetadataBehavioralHints, + UserMetadataProfile, + MetadataExtractionResponse, +) + # Ontology scenario models (LLM extracted from scenarios) from app.core.memory.models.ontology_scenario_models import ( OntologyClass, @@ -124,6 +132,10 @@ __all__ = [ "Entity", "Triplet", "TripletExtractionResponse", + "UserMetadata", + "UserMetadataBehavioralHints", + "UserMetadataProfile", + "MetadataExtractionResponse", # Ontology models "OntologyClass", "OntologyExtractionResponse", diff --git a/api/app/core/memory/models/graph_models.py b/api/app/core/memory/models/graph_models.py index 1b8c9d52..6e34421c 100644 --- a/api/app/core/memory/models/graph_models.py +++ b/api/app/core/memory/models/graph_models.py @@ -364,12 +364,14 @@ class ChunkNode(Node): Attributes: dialog_id: ID of the parent dialog content: The text content of the chunk + speaker: Speaker identifier ('user' or 'assistant') chunk_embedding: Optional embedding vector for the chunk sequence_number: Order of this chunk within the dialog metadata: Additional chunk metadata as key-value pairs """ dialog_id: str = Field(..., description="ID of the parent dialog") content: str = Field(..., description="The text content of the chunk") + speaker: Optional[str] = Field(None, description="Speaker identifier: 'user' for user messages, 'assistant' for AI responses") chunk_embedding: Optional[List[float]] = Field(None, description="Chunk embedding vector") sequence_number: int = Field(..., description="Order of this chunk within the dialog") metadata: dict = Field(default_factory=dict, description="Additional chunk metadata") diff --git a/api/app/core/memory/models/metadata_models.py b/api/app/core/memory/models/metadata_models.py new file mode 100644 index 00000000..e3184879 --- /dev/null +++ b/api/app/core/memory/models/metadata_models.py @@ -0,0 +1,40 @@ +"""Models for user metadata extraction. + +Independent from triplet_models.py - these models are used by the +standalone metadata extraction pipeline (post-dedup async Celery task). +""" + +from typing import List + +from pydantic import BaseModel, ConfigDict, Field + + +class UserMetadataProfile(BaseModel): + """用户画像信息""" + model_config = ConfigDict(extra='ignore') + role: str = Field(default="", description="用户职业或角色,如 teacher, doctor, software_engineer") + domain: str = Field(default="", description="用户所在领域,如 education, healthcare, software_development") + expertise: List[str] = Field(default_factory=list, description="用户擅长的技能或工具") + interests: List[str] = Field(default_factory=list, description="用户关注的话题或领域标签") + + +class UserMetadataBehavioralHints(BaseModel): + """行为偏好""" + model_config = ConfigDict(extra='ignore') + learning_stage: str = Field(default="", description="学习阶段") + preferred_depth: str = Field(default="", description="偏好深度") + tone_preference: str = Field(default="", description="语气偏好") + + +class UserMetadata(BaseModel): + """用户元数据顶层结构""" + model_config = ConfigDict(extra='ignore') + profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile) + behavioral_hints: UserMetadataBehavioralHints = Field(default_factory=UserMetadataBehavioralHints) + knowledge_tags: List[str] = Field(default_factory=list, description="知识标签") + + +class MetadataExtractionResponse(BaseModel): + """元数据提取 LLM 响应结构""" + model_config = ConfigDict(extra='ignore') + user_metadata: UserMetadata = Field(default_factory=UserMetadata) diff --git a/api/app/core/memory/src/search.py b/api/app/core/memory/src/search.py index ef39a12e..4e2883d5 100644 --- a/api/app/core/memory/src/search.py +++ b/api/app/core/memory/src/search.py @@ -1,4 +1,3 @@ -import argparse import asyncio import json import math @@ -6,7 +5,6 @@ import os import time from datetime import datetime from typing import TYPE_CHECKING, Any, Dict, List, Optional -from uuid import UUID if TYPE_CHECKING: from app.schemas.memory_config_schema import MemoryConfig @@ -23,7 +21,7 @@ from app.core.memory.utils.config.config_utils import ( ) from app.core.memory.utils.data.text_utils import extract_plain_query from app.core.memory.utils.data.time_utils import normalize_date_safe -from app.core.memory.utils.llm.llm_utils import get_reranker_client +# from app.core.memory.utils.llm.llm_utils import get_reranker_client from app.core.models.base import RedBearModelConfig from app.db import get_db_context from app.repositories.neo4j.graph_search import ( @@ -748,11 +746,10 @@ async def run_hybrid_search( if search_type in ["keyword", "hybrid"]: # Keyword-based search logger.info("[PERF] Starting keyword search...") - keyword_start = time.time() keyword_task = asyncio.create_task( search_graph( connector=connector, - q=query_text, + query=query_text, end_user_id=end_user_id, limit=limit, include=include @@ -762,7 +759,6 @@ async def run_hybrid_search( if search_type in ["embedding", "hybrid"]: # Embedding-based search logger.info("[PERF] Starting embedding search...") - embedding_start = time.time() # 从数据库读取嵌入器配置(按 ID)并构建 RedBearModelConfig config_load_start = time.time() @@ -904,10 +900,10 @@ async def run_hybrid_search( else: results["latency_metrics"] = latency_metrics - logger.info(f"[PERF] ===== SEARCH PERFORMANCE SUMMARY =====") + logger.info("[PERF] ===== SEARCH PERFORMANCE SUMMARY =====") logger.info(f"[PERF] Total search completed in {total_latency:.4f}s") logger.info(f"[PERF] Latency breakdown: {json.dumps(latency_metrics, indent=2)}") - logger.info(f"[PERF] =========================================") + logger.info("[PERF] =========================================") # Sanitize results: drop large/unused fields _remove_keys_recursive(results, ["name_embedding"]) # drop entity name embeddings from outputs diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index 3229674d..8f6d9853 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -311,8 +311,35 @@ class ExtractionOrchestrator: dialog_data_list, ) - # 步骤 7: 同步用户别名到数据库表(仅正式模式) + # 步骤 7: 同步用户别名到数据库表 + 触发异步元数据提取(仅正式模式) if not is_pilot_run: + # 收集用户相关 statement 并触发异步元数据提取 + try: + from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor + metadata_extractor = MetadataExtractor(llm_client=self.llm_client, language=self.language) + user_statements = metadata_extractor.collect_user_related_statements( + entity_nodes, statement_nodes, + statement_entity_edges + ) + if user_statements: + # 获取 end_user_id 和 config_id + end_user_id = dialog_data_list[0].end_user_id if dialog_data_list else None + config_id = dialog_data_list[0].config_id if dialog_data_list and hasattr(dialog_data_list[0], 'config_id') else None + if end_user_id: + from app.tasks import extract_user_metadata_task + extract_user_metadata_task.delay( + end_user_id=str(end_user_id), + statements=user_statements, + config_id=str(config_id) if config_id else None, + language=self.language, + ) + logger.info(f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement") + else: + logger.info("未找到用户相关 statement,跳过元数据提取") + except Exception as e: + logger.error(f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True) + + # 同步用户别名到数据库表 logger.info("步骤 7: 同步用户别名到 end_user 和 end_user_info 表") await self._update_end_user_other_name(entity_nodes, dialog_data_list) @@ -1107,6 +1134,7 @@ class ExtractionOrchestrator: end_user_id=dialog_data.end_user_id, run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id content=chunk.content, + speaker=getattr(chunk, 'speaker', None), chunk_embedding=chunk.chunk_embedding, sequence_number=chunk_idx, # 添加必需的 sequence_number 字段 created_at=dialog_data.created_at, @@ -1342,7 +1370,7 @@ class ExtractionOrchestrator: async def _update_end_user_other_name( self, entity_nodes: List[ExtractedEntityNode], - dialog_data_list: List[DialogData] + dialog_data_list: List[DialogData], ) -> None: """ 将本轮提取的用户别名同步到 end_user 和 end_user_info 表。 @@ -1470,7 +1498,6 @@ class ExtractionOrchestrator: end_user_id=end_user_uuid, other_name=first_alias, aliases=merged_aliases, - meta_data={} )) logger.info(f"创建 end_user_info 记录,other_name={first_alias}, aliases={merged_aliases}") diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py new file mode 100644 index 00000000..5e763622 --- /dev/null +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py @@ -0,0 +1,152 @@ +""" +Metadata extractor module. + +Collects user-related statements from post-dedup graph data and +extracts user metadata via an independent LLM call. +""" + +import logging +from typing import List, Optional + +from app.core.memory.models.graph_models import ( + ExtractedEntityNode, + StatementEntityEdge, + StatementNode, +) +from app.core.memory.models.metadata_models import ( + MetadataExtractionResponse, + UserMetadata, +) + +logger = logging.getLogger(__name__) + +# Reuse the same user-entity detection logic from dedup module +_USER_NAMES = {"用户", "我", "user", "i"} +_CANONICAL_USER_TYPE = "用户" + + +def _is_user_entity(ent: ExtractedEntityNode) -> bool: + """判断实体是否为用户实体""" + name = (getattr(ent, "name", "") or "").strip().lower() + etype = (getattr(ent, "entity_type", "") or "").strip() + return name in _USER_NAMES or etype == _CANONICAL_USER_TYPE + + +class MetadataExtractor: + """Extracts user metadata from post-dedup graph data via independent LLM call.""" + + def __init__(self, llm_client, language: str = "zh"): + self.llm_client = llm_client + self.language = language + + @staticmethod + def detect_language(statements: List[str]) -> str: + """根据 statement 文本内容检测语言。 + 如果文本中包含中文字符则返回 "zh",否则返回 "en"。 + """ + import re + combined = " ".join(statements) + if re.search(r'[\u4e00-\u9fff]', combined): + return "zh" + return "en" + + def collect_user_related_statements( + self, + entity_nodes: List[ExtractedEntityNode], + statement_nodes: List[StatementNode], + statement_entity_edges: List[StatementEntityEdge], + ) -> List[str]: + """ + 从去重后的数据中筛选与用户直接相关且由用户发言的 statement 文本。 + + 筛选逻辑: + 1. 用户实体 → StatementEntityEdge → statement(直接关联) + 2. 只保留 speaker="user" 的 statement(过滤 assistant 回复的噪声) + + Returns: + 用户发言的 statement 文本列表 + """ + # Find user entity IDs + user_entity_ids = set() + for ent in entity_nodes: + if _is_user_entity(ent): + user_entity_ids.add(ent.id) + + if not user_entity_ids: + logger.debug("未找到用户实体节点,跳过 statement 收集") + return [] + + # 用户实体 → StatementEntityEdge → statement + target_stmt_ids = set() + for edge in statement_entity_edges: + if edge.target in user_entity_ids: + target_stmt_ids.add(edge.source) + + # Collect: only speaker="user" statements, preserving order + result = [] + seen = set() + total_associated = 0 + skipped_non_user = 0 + for stmt_node in statement_nodes: + if stmt_node.id in target_stmt_ids and stmt_node.id not in seen: + total_associated += 1 + speaker = getattr(stmt_node, 'speaker', None) or 'unknown' + if speaker == "user": + text = (stmt_node.statement or "").strip() + if text: + result.append(text) + else: + skipped_non_user += 1 + seen.add(stmt_node.id) + + logger.info( + f"收集到 {len(result)} 条用户发言 statement " + f"(直接关联: {total_associated}, speaker=user: {len(result)}, " + f"跳过非user: {skipped_non_user})" + ) + if total_associated > 0 and len(result) == 0: + logger.warning( + f"有 {total_associated} 条直接关联 statement 但全部被 speaker 过滤," + f"可能本次写入不包含 user 消息" + ) + return result + + async def extract_metadata(self, statements: List[str]) -> Optional[UserMetadata]: + """ + 对筛选后的 statement 列表调用 LLM 提取元数据。 + 语言根据 statement 内容自动检测,不依赖系统界面语言。 + + Returns: + UserMetadata on success, None on failure + """ + if not statements: + return None + + try: + from app.core.memory.utils.prompt.prompt_utils import prompt_env + + # 根据写入内容的语言自动检测,而非使用系统界面语言 + detected_language = self.detect_language(statements) + logger.info(f"元数据提取语言检测结果: {detected_language}") + + template = prompt_env.get_template("extract_user_metadata.jinja2") + prompt = template.render( + statements=statements, + language=detected_language, + json_schema="", + ) + + response = await self.llm_client.response_structured( + messages=[{"role": "user", "content": prompt}], + response_model=MetadataExtractionResponse, + ) + + if response and response.user_metadata: + return response.user_metadata + + logger.warning("LLM 返回的元数据为空") + return None + + except Exception as e: + logger.error(f"元数据提取 LLM 调用失败: {e}", exc_info=True) + return None diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py index 7fb74b82..ea355ca1 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py @@ -1,4 +1,3 @@ -import os import asyncio from typing import List, Dict, Optional diff --git a/api/app/core/memory/storage_services/search/keyword_search.py b/api/app/core/memory/storage_services/search/keyword_search.py index d2591945..2458cf30 100644 --- a/api/app/core/memory/storage_services/search/keyword_search.py +++ b/api/app/core/memory/storage_services/search/keyword_search.py @@ -5,7 +5,7 @@ 使用Neo4j的全文索引进行高效的文本匹配。 """ -from typing import List, Dict, Any, Optional +from typing import List, Optional from app.core.logging_config import get_memory_logger from app.repositories.neo4j.neo4j_connector import Neo4jConnector from app.core.memory.storage_services.search.search_strategy import SearchStrategy, SearchResult @@ -74,7 +74,7 @@ class KeywordSearchStrategy(SearchStrategy): # 调用底层的关键词搜索函数 results_dict = await search_graph( connector=self.connector, - q=query_text, + query=query_text, end_user_id=end_user_id, limit=limit, include=include_list diff --git a/api/app/core/memory/utils/data/text_utils.py b/api/app/core/memory/utils/data/text_utils.py index d0b10f97..eaed0940 100644 --- a/api/app/core/memory/utils/data/text_utils.py +++ b/api/app/core/memory/utils/data/text_utils.py @@ -22,7 +22,9 @@ def escape_lucene_query(query: str) -> str: s = s.replace("\r", " ").replace("\n", " ").strip() # Lucene reserved tokens/special characters - specials = ['&&', '||', '\\', '+', '-', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':'] + # NOTE: '/' is the regex delimiter in Lucene — must be escaped to prevent + # TokenMgrError when the query contains unmatched slashes. + specials = ['&&', '||', '\\', '+', '-', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/'] # Replace longer tokens first to avoid partial double-escaping for token in sorted(specials, key=len, reverse=True): s = s.replace(token, f"\\{token}") diff --git a/api/app/core/memory/utils/metadata_utils.py b/api/app/core/memory/utils/metadata_utils.py new file mode 100644 index 00000000..ccdd1686 --- /dev/null +++ b/api/app/core/memory/utils/metadata_utils.py @@ -0,0 +1,179 @@ +""" +Metadata utility functions for cleaning, validating, aggregating, and merging +user metadata extracted from conversations. +""" + +import logging +from datetime import datetime, timezone +from typing import Optional + +from app.core.memory.models.metadata_models import UserMetadata + +logger = logging.getLogger(__name__) + + +def clean_metadata(raw: dict) -> dict: + """ + Clean metadata by removing empty string values and empty array fields recursively. + Only keeps fields with actual content. If a nested dict becomes empty after cleaning, + it is removed too. + """ + cleaned = {} + for key, value in raw.items(): + if isinstance(value, dict): + nested = clean_metadata(value) + if nested: + cleaned[key] = nested + elif isinstance(value, list): + if len(value) > 0: + cleaned[key] = value + elif isinstance(value, str): + if value != "": + cleaned[key] = value + else: + cleaned[key] = value + return cleaned + +# TODO 这个函数没有调用的地方 +def validate_metadata(raw: dict) -> Optional[UserMetadata]: + """ + Validate metadata structure using the Pydantic UserMetadata model. + Returns None and logs a WARNING on validation failure. + """ + try: + return UserMetadata.model_validate(raw) + except Exception as e: + logger.warning("Metadata validation failed: %s", e) + return None + + +def merge_metadata(existing: dict, new: dict) -> dict: + """ + Merge new extracted metadata with existing database metadata. + - Scalar fields: new value overwrites old value + - Array fields: support _op marker (append/replace/remove) + - Missing top-level keys in new: preserve existing data + - Auto-update _updated_at timestamp dict with field paths and ISO timestamps + - When existing is None or {}: directly write new + _updated_at (no merge logic) + """ + now = datetime.now(timezone.utc).isoformat() + + if not existing: + # Direct write: new + _updated_at for all fields + result = dict(new) + updated_at = {} + _collect_field_paths(result, "", updated_at, now) + if updated_at: + result["_updated_at"] = updated_at + return result + + result = dict(existing) + updated_at: dict = dict(result.get("_updated_at", {})) + + for key, new_value in new.items(): + if key == "_updated_at": + continue + + old_value = result.get(key) + + if isinstance(new_value, dict) and isinstance(old_value, dict): + # Nested dict merge (e.g. profile, behavioral_hints) + _merge_nested(result, key, old_value, new_value, updated_at, now) + elif isinstance(new_value, list) or (isinstance(new_value, dict) and "_op" in new_value): + # Array field with possible _op + _merge_array_field(result, key, old_value, new_value, updated_at, now) + else: + # Scalar top-level field + if old_value != new_value: + result[key] = new_value + updated_at[key] = now + # If equal, no change needed + + result["_updated_at"] = updated_at + return result + +# TODO 考虑大函数包含小函数,因为只服务于大函数,实现代码文件的结构清楚 +def _collect_field_paths(data: dict, prefix: str, updated_at: dict, now: str) -> None: + """Collect all leaf field paths for _updated_at on direct write.""" + for key, value in data.items(): + if key == "_updated_at": + continue + path = f"{prefix}{key}" if not prefix else f"{prefix}.{key}" + if isinstance(value, dict): + _collect_field_paths(value, path, updated_at, now) + else: + updated_at[path] = now + + +def _merge_nested( + result: dict, key: str, old_dict: dict, new_dict: dict, + updated_at: dict, now: str +) -> None: + """Merge a nested dict (e.g. profile, behavioral_hints).""" + merged = dict(old_dict) + for field, new_val in new_dict.items(): + old_val = merged.get(field) + path = f"{key}.{field}" + + if isinstance(new_val, list) or (isinstance(new_val, dict) and "_op" in new_val): + _merge_array_field_inner(merged, field, old_val, new_val, updated_at, path, now) + else: + # Scalar field + if old_val != new_val: + merged[field] = new_val + updated_at[path] = now + result[key] = merged + + +def _merge_array_field( + result: dict, key: str, old_value, new_value, + updated_at: dict, now: str +) -> None: + """Merge a top-level array field with _op support.""" + _merge_array_field_inner(result, key, old_value, new_value, updated_at, key, now) + + +def _merge_array_field_inner( + container: dict, field: str, old_value, new_value, + updated_at: dict, path: str, now: str +) -> None: + """Core array merge logic with _op support.""" + # Determine op and items + if isinstance(new_value, dict) and "_op" in new_value: + op = new_value.get("_op", "append") + items = new_value.get(field, new_value.get("items", [])) + # If the dict has a key matching the field name, use it; otherwise look for list values + if not isinstance(items, list): + # Try to find the list value in the dict (excluding _op) + for k, v in new_value.items(): + if k != "_op" and isinstance(v, list): + items = v + break + else: + items = [] + elif isinstance(new_value, list): + op = "append" + items = new_value + else: + op = "append" + items = [] + + old_arr = old_value if isinstance(old_value, list) else [] + + if op == "replace": + new_arr = items + elif op == "remove": + new_arr = [x for x in old_arr if x not in items] + else: + # append (default): merge and deduplicate + seen = list(old_arr) + for item in items: + if item not in seen: + seen.append(item) + new_arr = seen + + if old_arr != new_arr: + container[field] = new_arr + updated_at[path] = now + else: + container[field] = new_arr diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 7ded48a4..1a79b482 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -406,4 +406,12 @@ Output: - **⚠️ ALIASES ORDER: preserve temporal order of appearance** - **🚨 MANDATORY FIELD: EVERY entity MUST include "aliases" field, even if empty array []** +**Output JSON structure:** +```json +{ + "triplets": [...], + "entities": [...] +} +``` + {{ json_schema }} diff --git a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 new file mode 100644 index 00000000..9053e57d --- /dev/null +++ b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 @@ -0,0 +1,74 @@ +===Task=== +Extract user metadata from the following conversation statements spoken by the user. + +{% if language == "zh" %} +**"三度原则"判断标准:** +- 复用度:该信息是否会被多个功能模块使用? +- 约束度:该信息是否会影响系统行为? +- 时效性:该信息是长期稳定的还是临时的?仅提取长期稳定信息。 + +**提取规则:** +- **只提取关于"用户本人"的画像信息**,忽略用户提到的第三方人物(如朋友、同事、家人)的信息 +- 仅提取文本中明确提到的信息,不要推测 +- 如果文本中没有可提取的用户画像信息,返回空的 user_metadata 对象 +- **输出语言必须与输入文本的语言一致**(输入中文则输出中文值,输入英文则输出英文值) + +**字段说明:** +- profile.role:用户的职业或角色,如 教师、医生、后端工程师 +- profile.domain:用户所在领域,如 教育、医疗、软件开发 +- profile.expertise:用户擅长的技能或工具(通用,不限于编程),如 Python、心理咨询、高中物理 +- profile.interests:用户主动表达兴趣的话题或领域标签 +- behavioral_hints.learning_stage:学习阶段(初学者/中级/高级) +- behavioral_hints.preferred_depth:偏好深度(概览/技术细节/深入探讨) +- behavioral_hints.tone_preference:语气偏好(轻松随意/专业简洁/学术严谨) +- knowledge_tags:用户涉及的知识领域标签 +{% else %} +**"Three-Degree Principle" criteria:** +- Reusability: Will this information be used by multiple functional modules? +- Constraint: Will this information affect system behavior? +- Timeliness: Is this information long-term stable or temporary? Only extract long-term stable information. + +**Extraction rules:** +- **Only extract profile information about the user themselves**, ignore information about third parties (friends, colleagues, family) mentioned by the user +- Only extract information explicitly mentioned in the text, do not speculate +- If no user profile information can be extracted, return an empty user_metadata object +- **Output language must match the input text language** + +**Field descriptions:** +- profile.role: User's occupation or role, e.g. teacher, doctor, software engineer +- profile.domain: User's domain, e.g. education, healthcare, software development +- profile.expertise: User's skills or tools (general, not limited to programming) +- profile.interests: Topics or domain tags the user actively expressed interest in +- behavioral_hints.learning_stage: Learning stage (beginner/intermediate/advanced) +- behavioral_hints.preferred_depth: Preferred depth (overview/detailed/deep dive) +- behavioral_hints.tone_preference: Tone preference (casual/professional/academic) +- knowledge_tags: Knowledge domain tags related to the user +{% endif %} + +===User Statements=== +{% for stmt in statements %} +- {{ stmt }} +{% endfor %} + +===Output Format=== +Return a JSON object with the following structure: +```json +{ + "user_metadata": { + "profile": { + "role": "", + "domain": "", + "expertise": [], + "interests": [] + }, + "behavioral_hints": { + "learning_stage": "", + "preferred_depth": "", + "tone_preference": "" + }, + "knowledge_tags": [] + } +} +``` + +{{ json_schema }} diff --git a/api/app/repositories/neo4j/cypher_queries.py b/api/app/repositories/neo4j/cypher_queries.py index aa246829..4b5273ac 100644 --- a/api/app/repositories/neo4j/cypher_queries.py +++ b/api/app/repositories/neo4j/cypher_queries.py @@ -23,6 +23,7 @@ SET s += { end_user_id: statement.end_user_id, stmt_type: statement.stmt_type, statement: statement.statement, + speaker: statement.speaker, emotion_intensity: statement.emotion_intensity, emotion_target: statement.emotion_target, emotion_subject: statement.emotion_subject, @@ -56,6 +57,7 @@ SET c += { expired_at: chunk.expired_at, dialog_id: chunk.dialog_id, content: chunk.content, + speaker: chunk.speaker, chunk_embedding: chunk.chunk_embedding, sequence_number: chunk.sequence_number, start_index: chunk.start_index, @@ -283,7 +285,7 @@ LIMIT $limit """ SEARCH_STATEMENTS_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("statementsFulltext", $q) YIELD node AS s, score +CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id) OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s) OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity) @@ -307,7 +309,7 @@ LIMIT $limit """ # 查询实体名称包含指定字符串的实体 SEARCH_ENTITIES_BY_NAME = """ -CALL db.index.fulltext.queryNodes("entitiesFulltext", $q) YIELD node AS e, score +CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id) OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e) OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s) @@ -337,21 +339,21 @@ LIMIT $limit """ SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """ -CALL db.index.fulltext.queryNodes("entitiesFulltext", $q) YIELD node AS e, score +CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id) WITH e, score -WITH collect({entity: e, score: score}) AS fulltextResults +With collect({entity: e, score: score}) AS fulltextResults OPTIONAL MATCH (ae:ExtractedEntity) WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id) AND ae.aliases IS NOT NULL - AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($q)) + AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query)) WITH fulltextResults, collect(ae) AS aliasEntities UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score: CASE - WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($q)) THEN 1.0 - WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($q)) THEN 0.9 + WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0 + WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9 ELSE 0.8 END }]) AS row @@ -384,7 +386,7 @@ LIMIT $limit SEARCH_CHUNKS_BY_CONTENT = """ -CALL db.index.fulltext.queryNodes("chunksFulltext", $q) YIELD node AS c, score +CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id) OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement) OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity) @@ -501,7 +503,7 @@ LIMIT $limit """ SEARCH_STATEMENTS_BY_KEYWORD_TEMPORAL = """ -CALL db.index.fulltext.queryNodes("statementsFulltext", $q) YIELD node AS s, score +CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id) AND ((($start_date IS NULL OR (s.created_at IS NOT NULL AND datetime(s.created_at) >= datetime($start_date))) AND ($end_date IS NULL OR (s.created_at IS NOT NULL AND datetime(s.created_at) <= datetime($end_date)))) @@ -677,7 +679,7 @@ SET n.invalid_at = $new_invalid_at # MemorySummary keyword search using fulltext index SEARCH_MEMORY_SUMMARIES_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("summariesFulltext", $q) YIELD node AS m, score +CALL db.index.fulltext.queryNodes("summariesFulltext", $query) YIELD node AS m, score WHERE ($end_user_id IS NULL OR m.end_user_id = $end_user_id) OPTIONAL MATCH (m)-[:DERIVED_FROM_STATEMENT]->(s:Statement) RETURN m.id AS id, @@ -1363,7 +1365,7 @@ RETURN c.community_id AS community_id # Community keyword search: matches name or summary via fulltext index SEARCH_COMMUNITIES_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("communitiesFulltext", $q) YIELD node AS c, score +CALL db.index.fulltext.queryNodes("communitiesFulltext", $query) YIELD node AS c, score WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id) RETURN c.community_id AS id, c.name AS name, @@ -1451,7 +1453,7 @@ RETURN elementId(r) AS uuid """ SEARCH_PERCEPTUAL_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("perceptualFulltext", $q) YIELD node AS p, score +CALL db.index.fulltext.queryNodes("perceptualFulltext", $query) YIELD node AS p, score WHERE p.end_user_id = $end_user_id RETURN p.id AS id, p.end_user_id AS end_user_id, diff --git a/api/app/repositories/neo4j/graph_search.py b/api/app/repositories/neo4j/graph_search.py index 32ec4474..a191dad6 100644 --- a/api/app/repositories/neo4j/graph_search.py +++ b/api/app/repositories/neo4j/graph_search.py @@ -2,6 +2,7 @@ import asyncio import logging from typing import Any, Dict, List, Optional +from app.core.memory.utils.data.text_utils import escape_lucene_query from app.repositories.neo4j.cypher_queries import ( CHUNK_EMBEDDING_SEARCH, COMMUNITY_EMBEDDING_SEARCH, @@ -87,7 +88,7 @@ async def _update_activation_values_batch( unique_node_ids.append(node_id) if not unique_node_ids: - logger.warning(f"批量更新激活值:没有有效的节点ID") + logger.warning("批量更新激活值:没有有效的节点ID") return nodes # 记录去重信息(仅针对具有有效 ID 的节点) @@ -223,7 +224,7 @@ async def _update_search_results_activation( async def search_graph( connector: Neo4jConnector, - q: str, + query: str, end_user_id: Optional[str] = None, limit: int = 50, include: List[str] = None, @@ -234,14 +235,14 @@ async def search_graph( OPTIMIZED: Runs all queries in parallel using asyncio.gather() INTEGRATED: Updates activation values for knowledge nodes before returning results - - Statements: matches s.statement CONTAINS q - - Entities: matches e.name CONTAINS q - - Chunks: matches s.content CONTAINS q (from Statement nodes) - - Summaries: matches ms.content CONTAINS q + - Statements: matches s.statement CONTAINS query + - Entities: matches e.name CONTAINS query + - Chunks: matches s.content CONTAINS query (from Statement nodes) + - Summaries: matches ms.content CONTAINS query Args: connector: Neo4j connector - q: Query text + query: Query text for full-text search end_user_id: Optional group filter limit: Max results per category include: List of categories to search (default: all) @@ -252,6 +253,9 @@ async def search_graph( if include is None: include = ["statements", "chunks", "entities", "summaries"] + # Escape Lucene special characters to prevent query parse errors + escaped_query = escape_lucene_query(query) + # Prepare tasks for parallel execution tasks = [] task_keys = [] @@ -260,7 +264,7 @@ async def search_graph( tasks.append(connector.execute_query( SEARCH_STATEMENTS_BY_KEYWORD, json_format=True, - q=q, + query=escaped_query, end_user_id=end_user_id, limit=limit, )) @@ -270,7 +274,7 @@ async def search_graph( tasks.append(connector.execute_query( SEARCH_ENTITIES_BY_NAME_OR_ALIAS, json_format=True, - q=q, + query=escaped_query, end_user_id=end_user_id, limit=limit, )) @@ -280,7 +284,7 @@ async def search_graph( tasks.append(connector.execute_query( SEARCH_CHUNKS_BY_CONTENT, json_format=True, - q=q, + query=escaped_query, end_user_id=end_user_id, limit=limit, )) @@ -290,7 +294,7 @@ async def search_graph( tasks.append(connector.execute_query( SEARCH_MEMORY_SUMMARIES_BY_KEYWORD, json_format=True, - q=q, + query=escaped_query, end_user_id=end_user_id, limit=limit, )) @@ -300,7 +304,7 @@ async def search_graph( tasks.append(connector.execute_query( SEARCH_COMMUNITIES_BY_KEYWORD, json_format=True, - q=q, + query=escaped_query, end_user_id=end_user_id, limit=limit, )) @@ -482,7 +486,7 @@ async def search_graph_by_embedding( update_time = time.time() - update_start logger.info(f"[PERF] Activation value updates took: {update_time:.4f}s") else: - logger.info(f"[PERF] Skipping activation updates (only summaries)") + logger.info("[PERF] Skipping activation updates (only summaries)") return results @@ -520,7 +524,7 @@ async def get_dedup_candidates_for_entities( # 适配新版查询:使用全 # 全文索引按名称检索(包含 CONTAINS 语义) rows = await connector.execute_query( SEARCH_ENTITIES_BY_NAME, - q=name, + query=escape_lucene_query(name), end_user_id=end_user_id, limit=100, ) @@ -544,7 +548,7 @@ async def get_dedup_candidates_for_entities( # 适配新版查询:使用全 try: rows = await connector.execute_query( SEARCH_ENTITIES_BY_NAME, - q=name.lower(), + query=escape_lucene_query(name.lower()), end_user_id=end_user_id, limit=100, ) @@ -593,11 +597,12 @@ async def search_graph_by_keyword_temporal( - Returns up to 'limit' statements """ if not query_text: - logger.warning(f"query_text不能为空") + logger.warning("query_text不能为空") return {"statements": []} + escaped_query = escape_lucene_query(query_text) statements = await connector.execute_query( SEARCH_STATEMENTS_BY_KEYWORD_TEMPORAL, - q=query_text, + query=escaped_query, end_user_id=end_user_id, start_date=start_date, end_date=end_date, @@ -671,7 +676,7 @@ async def search_graph_by_dialog_id( - Returns up to 'limit' dialogues """ if not dialog_id: - logger.warning(f"dialog_id不能为空") + logger.warning("dialog_id不能为空") return {"dialogues": []} dialogues = await connector.execute_query( @@ -690,7 +695,7 @@ async def search_graph_by_chunk_id( limit: int = 1, ) -> Dict[str, List[Dict[str, Any]]]: if not chunk_id: - logger.warning(f"chunk_id不能为空") + logger.warning("chunk_id不能为空") return {"chunks": []} chunks = await connector.execute_query( SEARCH_CHUNK_BY_CHUNK_ID, @@ -968,7 +973,7 @@ async def search_graph_l_valid_at( async def search_perceptual( connector: Neo4jConnector, - q: str, + query: str, end_user_id: Optional[str] = None, limit: int = 10, ) -> Dict[str, List[Dict[str, Any]]]: @@ -979,7 +984,7 @@ async def search_perceptual( Args: connector: Neo4j connector - q: Query text + query: Query text for full-text search end_user_id: Optional user filter limit: Max results @@ -989,7 +994,7 @@ async def search_perceptual( try: perceptuals = await connector.execute_query( SEARCH_PERCEPTUAL_BY_KEYWORD, - q=q, + query=escape_lucene_query(query), end_user_id=end_user_id, limit=limit, ) diff --git a/api/app/tasks.py b/api/app/tasks.py index f918743c..4914e142 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -1001,7 +1001,7 @@ def sync_knowledge_for_kb(kb_id: uuid.UUID): except Exception as e: print(f"\n\nError during fetch feishu: {e}") case _: # General - print(f"General: No synchronization needed\n") + print("General: No synchronization needed\n") result = f"sync knowledge '{db_knowledge.name}' processed successfully." return result @@ -1510,6 +1510,7 @@ def write_all_workspaces_memory_task(self) -> Dict[str, Any]: "status": "SUCCESS", "total_num": total_num, "end_user_count": len(end_users), + "end_user_details": end_user_details, "memory_increment_id": str(memory_increment.id), "created_at": memory_increment.created_at.isoformat(), }) @@ -2602,35 +2603,34 @@ def init_interest_distribution_for_users(self, end_user_ids: List[str]) -> Dict[ service = MemoryAgentService() - with get_db_context() as db: - for end_user_id in end_user_ids: - # 存在性检查:缓存有数据则跳过 - cached = await InterestMemoryCache.get_interest_distribution( + for end_user_id in end_user_ids: + # 存在性检查:缓存有数据则跳过 + cached = await InterestMemoryCache.get_interest_distribution( + end_user_id=end_user_id, + language=language, + ) + if cached is not None: + skipped += 1 + continue + + logger.info(f"用户 {end_user_id} 无兴趣分布缓存,开始生成") + try: + result = await service.get_interest_distribution_by_user( end_user_id=end_user_id, + limit=5, language=language, ) - if cached is not None: - skipped += 1 - continue - - logger.info(f"用户 {end_user_id} 无兴趣分布缓存,开始生成") - try: - result = await service.get_interest_distribution_by_user( - end_user_id=end_user_id, - limit=5, - language=language, - ) - await InterestMemoryCache.set_interest_distribution( - end_user_id=end_user_id, - language=language, - data=result, - expire=INTEREST_CACHE_EXPIRE, - ) - initialized += 1 - logger.info(f"用户 {end_user_id} 兴趣分布缓存生成成功") - except Exception as e: - failed += 1 - logger.error(f"用户 {end_user_id} 兴趣分布缓存生成失败: {e}") + await InterestMemoryCache.set_interest_distribution( + end_user_id=end_user_id, + language=language, + data=result, + expire=INTEREST_CACHE_EXPIRE, + ) + initialized += 1 + logger.info(f"用户 {end_user_id} 兴趣分布缓存生成成功") + except Exception as e: + failed += 1 + logger.error(f"用户 {end_user_id} 兴趣分布缓存生成失败: {e}") logger.info(f"兴趣分布按需初始化完成: 初始化={initialized}, 跳过={skipped}, 失败={failed}") return { @@ -2914,4 +2914,139 @@ def init_community_clustering_for_users(self, end_user_ids: List[str], workspace } +# ─── User Metadata Extraction Task ─────────────────────────────────────────── + +@celery_app.task( + bind=True, + name='app.tasks.extract_user_metadata', + ignore_result=False, + max_retries=0, + acks_late=True, + time_limit=300, + soft_time_limit=240, +) +def extract_user_metadata_task( + self, + end_user_id: str, + statements: List[str], + config_id: Optional[str] = None, + language: str = "zh", +) -> Dict[str, Any]: + """异步提取用户元数据并写入数据库。 + + 在去重消歧完成后由编排器触发,使用独立 LLM 调用提取元数据。 + LLM 配置优先使用 config_id 对应的应用配置,失败时回退到工作空间默认配置。 + + Args: + end_user_id: 终端用户 ID + statements: 用户相关的 statement 文本列表 + config_id: 应用配置 ID(可选) + language: 语言类型 ("zh" 中文, "en" 英文) + + Returns: + 包含任务执行结果的字典 + """ + start_time = time.time() + logger.info( + f"[CELERY METADATA] Starting metadata extraction - end_user_id={end_user_id}, " + f"statements_count={len(statements)}, config_id={config_id}, language={language}" + ) + + async def _run() -> Dict[str, Any]: + from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor + from app.core.memory.utils.metadata_utils import clean_metadata, merge_metadata, validate_metadata + from app.repositories.end_user_info_repository import EndUserInfoRepository + from app.repositories.end_user_repository import EndUserRepository + from app.services.memory_config_service import MemoryConfigService + + # 1. 获取 LLM 配置(应用配置 → 工作空间配置兜底)并创建 LLM client + with get_db_context() as db: + end_user_uuid = uuid.UUID(end_user_id) + + # 获取 workspace_id from end_user + end_user = EndUserRepository(db).get_by_id(end_user_uuid) + if not end_user: + return {"status": "FAILURE", "error": f"End user not found: {end_user_id}"} + + workspace_id = end_user.workspace_id + + config_service = MemoryConfigService(db) + memory_config = config_service.get_config_with_fallback( + memory_config_id=uuid.UUID(config_id) if config_id else None, + workspace_id=workspace_id, + ) + if not memory_config: + return {"status": "FAILURE", "error": "No LLM config available (app + workspace fallback failed)"} + + # 2. 创建 LLM client + from app.core.memory.utils.llm.llm_utils import MemoryClientFactory + factory = MemoryClientFactory(db) + if not memory_config.llm_id: + return {"status": "FAILURE", "error": "Memory config has no LLM model configured"} + llm_client = factory.get_llm_client(memory_config.llm_id) + + # 3. 提取元数据 + extractor = MetadataExtractor(llm_client=llm_client, language=language) + user_metadata = await extractor.extract_metadata(statements) + + if not user_metadata: + logger.info(f"[CELERY METADATA] No metadata extracted for end_user_id={end_user_id}") + return {"status": "SUCCESS", "result": "no_metadata_extracted"} + + # 4. 清洗、校验、合并、写入 + raw_dict = user_metadata.model_dump() + cleaned = clean_metadata(raw_dict) + if not cleaned: + logger.info(f"[CELERY METADATA] Cleaned metadata is empty for end_user_id={end_user_id}") + return {"status": "SUCCESS", "result": "empty_after_cleaning"} + + validated = validate_metadata(cleaned) + if not validated: + return {"status": "FAILURE", "error": "Metadata validation failed after cleaning"} + + with get_db_context() as db: + end_user_uuid = uuid.UUID(end_user_id) + info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) + + if info: + existing_meta = info.meta_data if info.meta_data else {} + info.meta_data = merge_metadata(existing_meta, cleaned) + logger.info(f"[CELERY METADATA] Updated metadata for end_user_id={end_user_id}") + else: + # No end_user_info record yet - metadata will be written when alias sync creates it, + # or we create a minimal record here + logger.info( + f"[CELERY METADATA] No end_user_info record for end_user_id={end_user_id}, " + f"skipping metadata write (will be created by alias sync)" + ) + return {"status": "SUCCESS", "result": "no_info_record"} + + db.commit() + + return {"status": "SUCCESS", "result": "metadata_written"} + + loop = None + try: + loop = set_asyncio_event_loop() + result = loop.run_until_complete(_run()) + elapsed = time.time() - start_time + result["elapsed_time"] = elapsed + result["task_id"] = self.request.id + logger.info(f"[CELERY METADATA] Task completed - elapsed={elapsed:.2f}s, result={result.get('result')}") + return result + + except Exception as e: + elapsed = time.time() - start_time + logger.error(f"[CELERY METADATA] Task failed - elapsed={elapsed:.2f}s, error={e}", exc_info=True) + return { + "status": "FAILURE", + "error": str(e), + "elapsed_time": elapsed, + "task_id": self.request.id, + } + finally: + if loop: + _shutdown_loop_gracefully(loop) + + # unused task \ No newline at end of file From e0546e01ef3422afb88e6ee8c34db06948dd983a Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 9 Apr 2026 15:10:29 +0800 Subject: [PATCH 02/20] refactor(memory): delegate metadata merging to LLM instead of code-based merge - Remove merge_metadata and its helper functions from metadata_utils.py - Pass existing_metadata to MetadataExtractor.extract_metadata() as LLM context - Add merge instructions to extract_user_metadata.jinja2 prompt (zh/en) - Update Celery task to read existing metadata before extraction and overwrite - Simplify field descriptions in UserMetadataProfile model - Add _update_timestamps helper to track changed fields --- api/app/core/memory/models/metadata_models.py | 4 +- .../metadata_extractor.py | 8 +- api/app/core/memory/utils/metadata_utils.py | 138 +----------------- .../prompts/extract_user_metadata.jinja2 | 27 ++++ api/app/tasks.py | 58 ++++++-- 5 files changed, 87 insertions(+), 148 deletions(-) diff --git a/api/app/core/memory/models/metadata_models.py b/api/app/core/memory/models/metadata_models.py index e3184879..a5c70ec6 100644 --- a/api/app/core/memory/models/metadata_models.py +++ b/api/app/core/memory/models/metadata_models.py @@ -12,8 +12,8 @@ from pydantic import BaseModel, ConfigDict, Field class UserMetadataProfile(BaseModel): """用户画像信息""" model_config = ConfigDict(extra='ignore') - role: str = Field(default="", description="用户职业或角色,如 teacher, doctor, software_engineer") - domain: str = Field(default="", description="用户所在领域,如 education, healthcare, software_development") + role: str = Field(default="", description="用户职业或角色") + domain: str = Field(default="", description="用户所在领域") expertise: List[str] = Field(default_factory=list, description="用户擅长的技能或工具") interests: List[str] = Field(default_factory=list, description="用户关注的话题或领域标签") diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py index 5e763622..af3331b9 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py @@ -111,10 +111,15 @@ class MetadataExtractor: ) return result - async def extract_metadata(self, statements: List[str]) -> Optional[UserMetadata]: + async def extract_metadata(self, statements: List[str], existing_metadata: Optional[dict] = None) -> Optional[UserMetadata]: """ 对筛选后的 statement 列表调用 LLM 提取元数据。 语言根据 statement 内容自动检测,不依赖系统界面语言。 + 传入已有元数据作为上下文,让 LLM 能判断 replace/remove 操作。 + + Args: + statements: 用户发言的 statement 文本列表 + existing_metadata: 数据库已有的元数据(可选),用于 LLM 对比判断变更 Returns: UserMetadata on success, None on failure @@ -133,6 +138,7 @@ class MetadataExtractor: prompt = template.render( statements=statements, language=detected_language, + existing_metadata=existing_metadata, json_schema="", ) diff --git a/api/app/core/memory/utils/metadata_utils.py b/api/app/core/memory/utils/metadata_utils.py index ccdd1686..69bd8edf 100644 --- a/api/app/core/memory/utils/metadata_utils.py +++ b/api/app/core/memory/utils/metadata_utils.py @@ -1,10 +1,8 @@ """ -Metadata utility functions for cleaning, validating, aggregating, and merging -user metadata extracted from conversations. +Metadata utility functions for cleaning and validating user metadata. """ import logging -from datetime import datetime, timezone from typing import Optional from app.core.memory.models.metadata_models import UserMetadata @@ -34,7 +32,7 @@ def clean_metadata(raw: dict) -> dict: cleaned[key] = value return cleaned -# TODO 这个函数没有调用的地方 + def validate_metadata(raw: dict) -> Optional[UserMetadata]: """ Validate metadata structure using the Pydantic UserMetadata model. @@ -45,135 +43,3 @@ def validate_metadata(raw: dict) -> Optional[UserMetadata]: except Exception as e: logger.warning("Metadata validation failed: %s", e) return None - - -def merge_metadata(existing: dict, new: dict) -> dict: - """ - Merge new extracted metadata with existing database metadata. - - Scalar fields: new value overwrites old value - - Array fields: support _op marker (append/replace/remove) - - Missing top-level keys in new: preserve existing data - - Auto-update _updated_at timestamp dict with field paths and ISO timestamps - - When existing is None or {}: directly write new + _updated_at (no merge logic) - """ - now = datetime.now(timezone.utc).isoformat() - - if not existing: - # Direct write: new + _updated_at for all fields - result = dict(new) - updated_at = {} - _collect_field_paths(result, "", updated_at, now) - if updated_at: - result["_updated_at"] = updated_at - return result - - result = dict(existing) - updated_at: dict = dict(result.get("_updated_at", {})) - - for key, new_value in new.items(): - if key == "_updated_at": - continue - - old_value = result.get(key) - - if isinstance(new_value, dict) and isinstance(old_value, dict): - # Nested dict merge (e.g. profile, behavioral_hints) - _merge_nested(result, key, old_value, new_value, updated_at, now) - elif isinstance(new_value, list) or (isinstance(new_value, dict) and "_op" in new_value): - # Array field with possible _op - _merge_array_field(result, key, old_value, new_value, updated_at, now) - else: - # Scalar top-level field - if old_value != new_value: - result[key] = new_value - updated_at[key] = now - # If equal, no change needed - - result["_updated_at"] = updated_at - return result - -# TODO 考虑大函数包含小函数,因为只服务于大函数,实现代码文件的结构清楚 -def _collect_field_paths(data: dict, prefix: str, updated_at: dict, now: str) -> None: - """Collect all leaf field paths for _updated_at on direct write.""" - for key, value in data.items(): - if key == "_updated_at": - continue - path = f"{prefix}{key}" if not prefix else f"{prefix}.{key}" - if isinstance(value, dict): - _collect_field_paths(value, path, updated_at, now) - else: - updated_at[path] = now - - -def _merge_nested( - result: dict, key: str, old_dict: dict, new_dict: dict, - updated_at: dict, now: str -) -> None: - """Merge a nested dict (e.g. profile, behavioral_hints).""" - merged = dict(old_dict) - for field, new_val in new_dict.items(): - old_val = merged.get(field) - path = f"{key}.{field}" - - if isinstance(new_val, list) or (isinstance(new_val, dict) and "_op" in new_val): - _merge_array_field_inner(merged, field, old_val, new_val, updated_at, path, now) - else: - # Scalar field - if old_val != new_val: - merged[field] = new_val - updated_at[path] = now - result[key] = merged - - -def _merge_array_field( - result: dict, key: str, old_value, new_value, - updated_at: dict, now: str -) -> None: - """Merge a top-level array field with _op support.""" - _merge_array_field_inner(result, key, old_value, new_value, updated_at, key, now) - - -def _merge_array_field_inner( - container: dict, field: str, old_value, new_value, - updated_at: dict, path: str, now: str -) -> None: - """Core array merge logic with _op support.""" - # Determine op and items - if isinstance(new_value, dict) and "_op" in new_value: - op = new_value.get("_op", "append") - items = new_value.get(field, new_value.get("items", [])) - # If the dict has a key matching the field name, use it; otherwise look for list values - if not isinstance(items, list): - # Try to find the list value in the dict (excluding _op) - for k, v in new_value.items(): - if k != "_op" and isinstance(v, list): - items = v - break - else: - items = [] - elif isinstance(new_value, list): - op = "append" - items = new_value - else: - op = "append" - items = [] - - old_arr = old_value if isinstance(old_value, list) else [] - - if op == "replace": - new_arr = items - elif op == "remove": - new_arr = [x for x in old_arr if x not in items] - else: - # append (default): merge and deduplicate - seen = list(old_arr) - for item in items: - if item not in seen: - seen.append(item) - new_arr = seen - - if old_arr != new_arr: - container[field] = new_arr - updated_at[path] = now - else: - container[field] = new_arr diff --git a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 index 9053e57d..c280e5f6 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 @@ -13,6 +13,16 @@ Extract user metadata from the following conversation statements spoken by the u - 如果文本中没有可提取的用户画像信息,返回空的 user_metadata 对象 - **输出语言必须与输入文本的语言一致**(输入中文则输出中文值,输入英文则输出英文值) +{% if existing_metadata %} +**重要:合并已有元数据** +下方提供了数据库中已有的用户元数据。请结合用户最新发言,输出**合并后的完整元数据**: +- 如果用户明确否定了已有信息(如"我不再教高中物理了"),在输出中**移除**该信息 +- 如果用户提到了新信息,**添加**到对应字段中 +- 如果已有信息未被用户否定,**保留**在输出中 +- 标量字段(如 role、domain):如果用户提到了新值,用新值替换;否则保留已有值 +- 最终输出应该是完整的、合并后的元数据,不是增量 +{% endif %} + **字段说明:** - profile.role:用户的职业或角色,如 教师、医生、后端工程师 - profile.domain:用户所在领域,如 教育、医疗、软件开发 @@ -34,6 +44,16 @@ Extract user metadata from the following conversation statements spoken by the u - If no user profile information can be extracted, return an empty user_metadata object - **Output language must match the input text language** +{% if existing_metadata %} +**Important: Merge with existing metadata** +Existing user metadata from the database is provided below. Combine with the user's latest statements to output the **complete merged metadata**: +- If the user explicitly negates existing info (e.g. "I no longer teach high school physics"), **remove** it from output +- If the user mentions new info, **add** it to the corresponding field +- If existing info is not negated by the user, **keep** it in the output +- Scalar fields (e.g. role, domain): replace with new value if user mentions one; otherwise keep existing +- The final output should be the complete, merged metadata — not an incremental update +{% endif %} + **Field descriptions:** - profile.role: User's occupation or role, e.g. teacher, doctor, software engineer - profile.domain: User's domain, e.g. education, healthcare, software development @@ -50,6 +70,13 @@ Extract user metadata from the following conversation statements spoken by the u - {{ stmt }} {% endfor %} +{% if existing_metadata %} +===Existing User Metadata=== +```json +{{ existing_metadata | tojson }} +``` +{% endif %} + ===Output Format=== Return a JSON object with the following structure: ```json diff --git a/api/app/tasks.py b/api/app/tasks.py index 4914e142..3eb1a52c 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -1,4 +1,5 @@ import asyncio +import json import os import re import shutil @@ -2916,6 +2917,20 @@ def init_community_clustering_for_users(self, end_user_ids: List[str], workspace # ─── User Metadata Extraction Task ─────────────────────────────────────────── + +def _update_timestamps(existing: dict, new: dict, updated_at: dict, now: str, prefix: str = "") -> None: + """对比新旧元数据,更新变更字段的 _updated_at 时间戳。""" + for key, new_val in new.items(): + if key == "_updated_at": + continue + path = f"{prefix}.{key}" if prefix else key + old_val = existing.get(key) + + if isinstance(new_val, dict) and isinstance(old_val, dict): + _update_timestamps(old_val, new_val, updated_at, now, prefix=path) + elif old_val != new_val: + updated_at[path] = now + @celery_app.task( bind=True, name='app.tasks.extract_user_metadata', @@ -2954,7 +2969,7 @@ def extract_user_metadata_task( async def _run() -> Dict[str, Any]: from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor - from app.core.memory.utils.metadata_utils import clean_metadata, merge_metadata, validate_metadata + from app.core.memory.utils.metadata_utils import clean_metadata, validate_metadata from app.repositories.end_user_info_repository import EndUserInfoRepository from app.repositories.end_user_repository import EndUserRepository from app.services.memory_config_service import MemoryConfigService @@ -2985,36 +3000,61 @@ def extract_user_metadata_task( return {"status": "FAILURE", "error": "Memory config has no LLM model configured"} llm_client = factory.get_llm_client(memory_config.llm_id) - # 3. 提取元数据 + # 2.5 读取已有元数据,传给 extractor 作为上下文 + existing_metadata = None + try: + info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) + if info and info.meta_data: + existing_metadata = info.meta_data + logger.info("[CELERY METADATA] 已读取数据库已有元数据作为 LLM 上下文") + except Exception as e: + logger.warning(f"[CELERY METADATA] 读取已有元数据失败(继续无上下文提取): {e}") + + # 3. 提取元数据(传入已有元数据作为上下文) extractor = MetadataExtractor(llm_client=llm_client, language=language) - user_metadata = await extractor.extract_metadata(statements) + user_metadata = await extractor.extract_metadata(statements, existing_metadata=existing_metadata) if not user_metadata: logger.info(f"[CELERY METADATA] No metadata extracted for end_user_id={end_user_id}") return {"status": "SUCCESS", "result": "no_metadata_extracted"} - # 4. 清洗、校验、合并、写入 - raw_dict = user_metadata.model_dump() + # 4. 清洗、校验、覆盖写入 + raw_dict = user_metadata.model_dump(exclude_none=True) + logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}") + cleaned = clean_metadata(raw_dict) if not cleaned: logger.info(f"[CELERY METADATA] Cleaned metadata is empty for end_user_id={end_user_id}") return {"status": "SUCCESS", "result": "empty_after_cleaning"} + logger.info(f"[CELERY METADATA] 清洗后元数据: {json.dumps(cleaned, ensure_ascii=False)}") + validated = validate_metadata(cleaned) if not validated: return {"status": "FAILURE", "error": "Metadata validation failed after cleaning"} + # 直接覆盖写入(LLM 已完成语义合并,输出的是完整结果) + # 保留 _updated_at 时间戳追踪 + from datetime import datetime as dt, timezone as tz + now = dt.now(tz.utc).isoformat() + with get_db_context() as db: end_user_uuid = uuid.UUID(end_user_id) info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) if info: existing_meta = info.meta_data if info.meta_data else {} - info.meta_data = merge_metadata(existing_meta, cleaned) - logger.info(f"[CELERY METADATA] Updated metadata for end_user_id={end_user_id}") + logger.info(f"[CELERY METADATA] 数据库已有元数据: {json.dumps(existing_meta, ensure_ascii=False)}") + + # 保留已有的 _updated_at,更新变更字段的时间戳 + updated_at = dict(existing_meta.get("_updated_at", {})) + _update_timestamps(existing_meta, cleaned, updated_at, now) + + final = dict(cleaned) + final["_updated_at"] = updated_at + info.meta_data = final + logger.info(f"[CELERY METADATA] 覆盖写入元数据: {json.dumps(final, ensure_ascii=False)}") else: - # No end_user_info record yet - metadata will be written when alias sync creates it, - # or we create a minimal record here logger.info( f"[CELERY METADATA] No end_user_info record for end_user_id={end_user_id}, " f"skipping metadata write (will be created by alias sync)" From 15a863b41a2a4c7506f232eae382d1ea6abdd5c2 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 9 Apr 2026 21:55:59 +0800 Subject: [PATCH 03/20] feat(memory): unify alias extraction into metadata pipeline and deduplicate user entity nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Merge alias add/remove into MetadataExtractionResponse and Celery metadata task, removing the separate sync step from extraction_orchestrator - Replace first-person pronouns ("我") with "用户" in statement extraction to preserve identity semantics for downstream metadata/alias extraction - Update extract_statement.jinja2 prompt to enforce "用户" as subject for user statements instead of resolving to real names - Add alias change instructions (aliases_to_add/aliases_to_remove) to extract_user_metadata.jinja2 with incremental merge logic - Deduplicate special entities ("用户", "AI助手") in graph_saver by reusing existing Neo4j node IDs per end_user_id - Sync final aliases from PgSQL to Neo4j user entity nodes after metadata write --- api/app/core/memory/models/metadata_models.py | 8 + .../extraction_orchestrator.py | 8 +- .../metadata_extractor.py | 62 +++++-- .../statement_extraction.py | 21 ++- .../prompt/prompts/extract_statement.jinja2 | 42 ++--- .../prompts/extract_user_metadata.jinja2 | 36 ++++- api/app/repositories/neo4j/graph_saver.py | 52 ++++++ api/app/tasks.py | 151 +++++++++++++----- 8 files changed, 304 insertions(+), 76 deletions(-) diff --git a/api/app/core/memory/models/metadata_models.py b/api/app/core/memory/models/metadata_models.py index a5c70ec6..f08d18ed 100644 --- a/api/app/core/memory/models/metadata_models.py +++ b/api/app/core/memory/models/metadata_models.py @@ -38,3 +38,11 @@ class MetadataExtractionResponse(BaseModel): """元数据提取 LLM 响应结构""" model_config = ConfigDict(extra='ignore') user_metadata: UserMetadata = Field(default_factory=UserMetadata) + aliases_to_add: List[str] = Field( + default_factory=list, + description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)" + ) + aliases_to_remove: List[str] = Field( + default_factory=list, + description="用户明确否认的别名(如'我不叫XX了')" + ) diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index 8f6d9853..b8a36e44 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -311,9 +311,8 @@ class ExtractionOrchestrator: dialog_data_list, ) - # 步骤 7: 同步用户别名到数据库表 + 触发异步元数据提取(仅正式模式) + # 步骤 7: 触发异步元数据和别名提取(仅正式模式) if not is_pilot_run: - # 收集用户相关 statement 并触发异步元数据提取 try: from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor metadata_extractor = MetadataExtractor(llm_client=self.llm_client, language=self.language) @@ -322,7 +321,6 @@ class ExtractionOrchestrator: statement_entity_edges ) if user_statements: - # 获取 end_user_id 和 config_id end_user_id = dialog_data_list[0].end_user_id if dialog_data_list else None config_id = dialog_data_list[0].config_id if dialog_data_list and hasattr(dialog_data_list[0], 'config_id') else None if end_user_id: @@ -339,9 +337,7 @@ class ExtractionOrchestrator: except Exception as e: logger.error(f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True) - # 同步用户别名到数据库表 - logger.info("步骤 7: 同步用户别名到 end_user 和 end_user_info 表") - await self._update_end_user_other_name(entity_nodes, dialog_data_list) + # 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行 logger.info(f"知识提取流水线运行完成({mode_str})") return ( diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py index af3331b9..cc8c6073 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py @@ -14,9 +14,9 @@ from app.core.memory.models.graph_models import ( StatementNode, ) from app.core.memory.models.metadata_models import ( - MetadataExtractionResponse, UserMetadata, ) +from app.core.memory.models.message_models import DialogData logger = logging.getLogger(__name__) @@ -50,6 +50,35 @@ class MetadataExtractor: return "zh" return "en" + # def collect_user_raw_messages( + # self, + # dialog_data_list: List[DialogData], + # ) -> List[str]: + # """ + # 从原始对话数据中提取 speaker="user" 的消息原文。 + + # 直接使用用户的原始输入,不经过陈述句提取阶段的 LLM 改写, + # 避免第一人称被替换为第三人称导致元数据/别名提取错误。 + + # Returns: + # 用户原始消息文本列表 + # """ + # result = [] + # for dialog in dialog_data_list: + # if not dialog.context or not dialog.context.msgs: + # continue + # for msg in dialog.context.msgs: + # if getattr(msg, 'role', '') == 'user': + # text = (getattr(msg, 'msg', '') or '').strip() + # if text: + # result.append(text) + + # logger.info(f"收集到 {len(result)} 条用户原始消息") + # if result: + # for i, text in enumerate(result): + # logger.info(f" [user message {i+1}] {text[:200]}") + # return result + def collect_user_related_statements( self, entity_nodes: List[ExtractedEntityNode], @@ -104,6 +133,9 @@ class MetadataExtractor: f"(直接关联: {total_associated}, speaker=user: {len(result)}, " f"跳过非user: {skipped_non_user})" ) + if result: + for i, text in enumerate(result): + logger.info(f" [user statement {i+1}] {text}") if total_associated > 0 and len(result) == 0: logger.warning( f"有 {total_associated} 条直接关联 statement 但全部被 speaker 过滤," @@ -111,18 +143,22 @@ class MetadataExtractor: ) return result - async def extract_metadata(self, statements: List[str], existing_metadata: Optional[dict] = None) -> Optional[UserMetadata]: + async def extract_metadata( + self, + statements: List[str], + existing_metadata: Optional[dict] = None, + existing_aliases: Optional[List[str]] = None, + ) -> Optional[tuple]: """ - 对筛选后的 statement 列表调用 LLM 提取元数据。 - 语言根据 statement 内容自动检测,不依赖系统界面语言。 - 传入已有元数据作为上下文,让 LLM 能判断 replace/remove 操作。 + 对筛选后的 statement 列表调用 LLM 提取元数据和用户别名。 Args: statements: 用户发言的 statement 文本列表 - existing_metadata: 数据库已有的元数据(可选),用于 LLM 对比判断变更 + existing_metadata: 数据库已有的元数据(可选) + existing_aliases: 数据库已有的用户别名列表(可选) Returns: - UserMetadata on success, None on failure + (UserMetadata, List[str], List[str]) tuple: (metadata, aliases_to_add, aliases_to_remove) on success, None on failure """ if not statements: return None @@ -130,7 +166,6 @@ class MetadataExtractor: try: from app.core.memory.utils.prompt.prompt_utils import prompt_env - # 根据写入内容的语言自动检测,而非使用系统界面语言 detected_language = self.detect_language(statements) logger.info(f"元数据提取语言检测结果: {detected_language}") @@ -139,18 +174,23 @@ class MetadataExtractor: statements=statements, language=detected_language, existing_metadata=existing_metadata, + existing_aliases=existing_aliases, json_schema="", ) + from app.core.memory.models.metadata_models import MetadataExtractionResponse response = await self.llm_client.response_structured( messages=[{"role": "user", "content": prompt}], response_model=MetadataExtractionResponse, ) - if response and response.user_metadata: - return response.user_metadata + if response: + metadata = response.user_metadata if response.user_metadata else None + to_add = response.aliases_to_add if response.aliases_to_add else [] + to_remove = response.aliases_to_remove if response.aliases_to_remove else [] + return metadata, to_add, to_remove - logger.warning("LLM 返回的元数据为空") + logger.warning("LLM 返回的响应为空") return None except Exception as e: diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py index b06bd70f..684ad556 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py @@ -82,6 +82,18 @@ class StatementExtractor: logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty") return None + @staticmethod + def _replace_first_person_with_user(text: str) -> str: + """将用户消息中的第一人称代词"我"替换为"用户"。 + + 替换规则: + - 所有独立的"我"都替换为"用户"(包括"我的"→"用户的"、"叫我"→"叫用户") + - 不替换"我们"中的"我"("我们"是复数,不指代用户个人) + """ + import re + result = re.sub(r'我(?!们)', '用户', text) + return result + async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]: """Process a single chunk and return extracted statements @@ -99,6 +111,13 @@ class StatementExtractor: logger.warning(f"Chunk {chunk.id} content too short or empty, skipping") return [] + # 对 speaker="user" 的 chunk,将第一人称"我"替换为"用户", + # 避免 LLM 在陈述句提取时将"我"替换为具体名字(如"齐齐"), + # 导致下游元数据/别名提取无法识别第一人称语义。 + chunk_speaker = self._get_speaker_from_chunk(chunk) + if chunk_speaker == "user": + chunk_content = self._replace_first_person_with_user(chunk_content) + prompt_content = await render_statement_extraction_prompt( chunk_content=chunk_content, definitions=LABEL_DEFINITIONS, @@ -149,8 +168,6 @@ class StatementExtractor: relevence_info = RelevenceInfo[relevence_str] if relevence_str in RelevenceInfo.__members__ else RelevenceInfo.RELEVANT except (KeyError, ValueError): relevence_info = RelevenceInfo.RELEVANT - - chunk_speaker = self._get_speaker_from_chunk(chunk) chunk_statement = Statement( statement=extracted_stmt.statement, diff --git a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 index 3cdb5fd0..611bd6df 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 @@ -43,8 +43,9 @@ Each statement must be labeled as per the criteria mentioned below. 对话上下文和共指消解: - 将每个陈述句归属于说出它的参与者。 -- 如果参与者列表为说话者提供了名称(例如,"李雪(用户)"),请在提取的陈述句中使用具体名称("李雪"),而不是通用角色("用户")。 -- 将所有代词解析为对话上下文中的具体人物或实体。 +- **对于用户的发言:必须使用"用户"作为主语**,禁止将"用户"或"我"替换为用户的真实姓名或别名。例如,用户说"我叫张三"应提取为"用户叫张三",而不是"张三叫张三"。 +- 对于 AI 助手的发言:使用"助手"或"AI助手"作为主语。 +- 将所有代词解析为对话上下文中的具体人物或实体,但"我"必须解析为"用户"。 - 识别并将抽象引用解析为其具体名称(如果提到)。 - 将缩写和首字母缩略词扩展为其完整形式。 {% else %} @@ -68,8 +69,9 @@ Context Resolution Requirements: Conversational Context & Co-reference Resolution: - Attribute every statement to the participant who uttered it. -- If the participant list provides a name for a speaker (e.g., "李雪 (用户)"), use the specific name ("李雪") in the extracted statement, not the generic role ("用户"). -- Resolve all pronouns to the specific person or entity from the conversation's context. +- **For user's statements: always use "用户" (User) as the subject**. Do NOT replace "用户" or "I" with the user's real name or alias. For example, if the user says "I'm John", extract as "用户 is John", not "John is John". +- For AI assistant's statements: use "助手" or "AI助手" as the subject. +- Resolve all pronouns to the specific person or entity from the conversation's context, but "I"/"我" must always resolve to "用户". - Identify and resolve abstract references to their specific names if mentioned. - Expand abbreviations and acronyms to their full form. {% endif %} @@ -139,13 +141,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合 示例输出: { "statements": [ { - "statement": "Sarah Chen 最近一直在尝试水彩画。", + "statement": "用户最近一直在尝试水彩画。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" }, { - "statement": "Sarah Chen 画了一些花朵。", + "statement": "用户画了一些花朵。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" @@ -157,13 +159,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合 "relevance": "IRRELEVANT" }, { - "statement": "Sarah Chen 认为她的水彩画中的色彩组合可以改进。", + "statement": "用户认为她的水彩画中的色彩组合可以改进。", "statement_type": "OPINION", "temporal_type": "STATIC", "relevance": "RELEVANT" }, { - "statement": "Sarah Chen 真的很喜欢玫瑰和百合。", + "statement": "用户真的很喜欢玫瑰和百合。", "statement_type": "FACT", "temporal_type": "STATIC", "relevance": "RELEVANT" @@ -186,13 +188,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合 示例输出: { "statements": [ { - "statement": "张曼婷最近在尝试水彩画。", + "statement": "用户最近在尝试水彩画。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" }, { - "statement": "张曼婷画了一些花朵。", + "statement": "用户画了一些花朵。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" @@ -204,13 +206,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合 "relevance": "IRRELEVANT" }, { - "statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。", + "statement": "用户觉得水彩画的色彩搭配还有提升的空间。", "statement_type": "OPINION", "temporal_type": "STATIC", "relevance": "RELEVANT" }, { - "statement": "张曼婷很喜欢玫瑰和百合。", + "statement": "用户很喜欢玫瑰和百合。", "statement_type": "FACT", "temporal_type": "STATIC", "relevance": "RELEVANT" @@ -233,13 +235,13 @@ User: "I think the color combinations could use some improvement, but I really l Example Output: { "statements": [ { - "statement": "Sarah Chen has been trying watercolor painting recently.", + "statement": "用户 has been trying watercolor painting recently.", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" }, { - "statement": "Sarah Chen painted some flowers.", + "statement": "用户 painted some flowers.", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" @@ -251,13 +253,13 @@ Example Output: { "relevance": "IRRELEVANT" }, { - "statement": "Sarah Chen thinks the color combinations in her watercolor paintings could use some improvement.", + "statement": "用户 thinks the color combinations in her watercolor paintings could use some improvement.", "statement_type": "OPINION", "temporal_type": "STATIC", "relevance": "RELEVANT" }, { - "statement": "Sarah Chen really likes roses and lilies.", + "statement": "用户 really likes roses and lilies.", "statement_type": "FACT", "temporal_type": "STATIC", "relevance": "RELEVANT" @@ -280,13 +282,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合 Example Output: { "statements": [ { - "statement": "张曼婷最近在尝试水彩画。", + "statement": "用户最近在尝试水彩画。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" }, { - "statement": "张曼婷画了一些花朵。", + "statement": "用户画了一些花朵。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "relevance": "RELEVANT" @@ -298,13 +300,13 @@ Example Output: { "relevance": "IRRELEVANT" }, { - "statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。", + "statement": "用户觉得水彩画的色彩搭配还有提升的空间。", "statement_type": "OPINION", "temporal_type": "STATIC", "relevance": "RELEVANT" }, { - "statement": "张曼婷很喜欢玫瑰和百合。", + "statement": "用户很喜欢玫瑰和百合。", "statement_type": "FACT", "temporal_type": "STATIC", "relevance": "RELEVANT" diff --git a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 index c280e5f6..5d019b12 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2 @@ -32,6 +32,22 @@ Extract user metadata from the following conversation statements spoken by the u - behavioral_hints.preferred_depth:偏好深度(概览/技术细节/深入探讨) - behavioral_hints.tone_preference:语气偏好(轻松随意/专业简洁/学术严谨) - knowledge_tags:用户涉及的知识领域标签 + +**用户别名变更(增量模式):** +- **aliases_to_add**:本次新发现的用户别名,包括: + * 用户主动自我介绍:如"我叫张三"、"我的名字是XX"、"我的网名是XX" + * 他人对用户的称呼:如"同事叫我陈哥"、"大家叫我小张"、"领导叫我老陈" + * 只提取原文中逐字出现的名字,严禁推测或创造 + * 禁止提取:用户给 AI 取的名字、第三方人物自身的名字、"用户"/"我" 等占位词 + * 如果没有新别名,返回空数组 `[]` +- **aliases_to_remove**:用户明确否认的别名,包括: + * 用户说"我不叫XX了"、"别叫我XX"、"我改名了,不叫XX" → 将 XX 放入此数组 + * **严格限制**:只将用户原文中**逐字提到**的被否认名字放入,不要推断关联的其他别名 + * 例如:用户说"我不叫陈小刀了" → 只移除"陈小刀",不要移除"陈哥"、"老陈"等未被提及的别名 + * 如果没有要移除的别名,返回空数组 `[]` +{% if existing_aliases %} +- 已有别名:{{ existing_aliases | tojson }}(仅供参考,不需要在输出中重复) +{% endif %} {% else %} **"Three-Degree Principle" criteria:** - Reusability: Will this information be used by multiple functional modules? @@ -63,6 +79,22 @@ Existing user metadata from the database is provided below. Combine with the use - behavioral_hints.preferred_depth: Preferred depth (overview/detailed/deep dive) - behavioral_hints.tone_preference: Tone preference (casual/professional/academic) - knowledge_tags: Knowledge domain tags related to the user + +**User alias changes (incremental mode):** +- **aliases_to_add**: Newly discovered user aliases from this conversation, including: + * User self-introductions: e.g. "I'm John", "My name is XX", "My username is XX" + * How others address the user: e.g. "My colleagues call me Johnny", "People call me Mike" + * Only extract names that appear VERBATIM in the text — never infer or fabricate + * Do NOT extract: names the user gives to the AI, third-party people's own names, placeholder words like "User"/"I" + * If no new aliases, return empty array `[]` +- **aliases_to_remove**: Aliases the user explicitly denies, including: + * User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array + * **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases + * Example: User says "I'm not called John anymore" → only remove "John", do NOT remove "Johnny", "J" or other related aliases not mentioned + * If no aliases to remove, return empty array `[]` +{% if existing_aliases %} +- Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output) +{% endif %} {% endif %} ===User Statements=== @@ -94,7 +126,9 @@ Return a JSON object with the following structure: "tone_preference": "" }, "knowledge_tags": [] - } + }, + "aliases_to_add": [], + "aliases_to_remove": [] } ``` diff --git a/api/app/repositories/neo4j/graph_saver.py b/api/app/repositories/neo4j/graph_saver.py index adc266fe..ae76b9f6 100644 --- a/api/app/repositories/neo4j/graph_saver.py +++ b/api/app/repositories/neo4j/graph_saver.py @@ -187,6 +187,58 @@ async def save_dialog_and_statements_to_neo4j( bool: True if successful, False otherwise """ + # 预处理:对特殊实体("用户"、"AI助手")复用 Neo4j 中已有节点的 ID, + # 确保同一个 end_user_id 下只有一个"用户"节点和一个"AI助手"节点。 + if entity_nodes: + _SPECIAL_NAMES = {"用户", "我", "user", "i", "ai助手", "助手", "ai assistant", "assistant"} + end_user_id = entity_nodes[0].end_user_id if entity_nodes else None + if end_user_id: + try: + # 查询已有的特殊实体 + cypher = """ + MATCH (e:ExtractedEntity) + WHERE e.end_user_id = $end_user_id AND toLower(e.name) IN $names + RETURN e.id AS id, e.name AS name + """ + existing = await connector.execute_query( + cypher, + end_user_id=end_user_id, + names=list(_SPECIAL_NAMES), + ) + # 建立 name(lower) → existing_id 映射 + existing_id_map = {} + for record in (existing or []): + name_lower = (record.get("name") or "").strip().lower() + if name_lower and record.get("id"): + existing_id_map[name_lower] = record["id"] + + if existing_id_map: + # 替换新实体的 ID 为已有 ID,同时更新所有引用该 ID 的边 + for ent in entity_nodes: + name_lower = (ent.name or "").strip().lower() + if name_lower in existing_id_map: + old_id = ent.id + new_id = existing_id_map[name_lower] + if old_id != new_id: + ent.id = new_id + # 更新 statement_entity_edges 中的引用 + for edge in statement_entity_edges: + if edge.target == old_id: + edge.target = new_id + if edge.source == old_id: + edge.source = new_id + # 更新 entity_edges 中的引用 + for edge in entity_edges: + if edge.source == old_id: + edge.source = new_id + if edge.target == old_id: + edge.target = new_id + logger.info( + f"特殊实体 '{ent.name}' ID 复用: {old_id[:8]}... → {new_id[:8]}..." + ) + except Exception as e: + logger.warning(f"特殊实体 ID 复用查询失败(不影响写入): {e}") + # 定义事务函数,将所有写操作放在一个事务中 async def _save_all_in_transaction(tx): """在单个事务中执行所有保存操作,避免死锁""" diff --git a/api/app/tasks.py b/api/app/tasks.py index 3eb1a52c..3641f438 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -3000,70 +3000,149 @@ def extract_user_metadata_task( return {"status": "FAILURE", "error": "Memory config has no LLM model configured"} llm_client = factory.get_llm_client(memory_config.llm_id) - # 2.5 读取已有元数据,传给 extractor 作为上下文 + # 2.5 读取已有元数据和别名,传给 extractor 作为上下文 existing_metadata = None + existing_aliases = None try: info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) - if info and info.meta_data: - existing_metadata = info.meta_data - logger.info("[CELERY METADATA] 已读取数据库已有元数据作为 LLM 上下文") + if info: + if info.meta_data: + existing_metadata = info.meta_data + existing_aliases = info.aliases if info.aliases else [] + logger.info(f"[CELERY METADATA] 已读取已有元数据和别名(aliases={existing_aliases})") except Exception as e: - logger.warning(f"[CELERY METADATA] 读取已有元数据失败(继续无上下文提取): {e}") + logger.warning(f"[CELERY METADATA] 读取已有数据失败(继续无上下文提取): {e}") - # 3. 提取元数据(传入已有元数据作为上下文) + # 3. 提取元数据和别名(传入已有数据作为上下文) extractor = MetadataExtractor(llm_client=llm_client, language=language) - user_metadata = await extractor.extract_metadata(statements, existing_metadata=existing_metadata) + extract_result = await extractor.extract_metadata( + statements, + existing_metadata=existing_metadata, + existing_aliases=existing_aliases, + ) - if not user_metadata: + if not extract_result: logger.info(f"[CELERY METADATA] No metadata extracted for end_user_id={end_user_id}") return {"status": "SUCCESS", "result": "no_metadata_extracted"} - # 4. 清洗、校验、覆盖写入 - raw_dict = user_metadata.model_dump(exclude_none=True) + user_metadata, aliases_to_add, aliases_to_remove = extract_result + logger.info(f"[CELERY METADATA] LLM 别名新增: {aliases_to_add}, 移除: {aliases_to_remove}") + + # 4. 清洗元数据、覆盖写入元数据和别名 + raw_dict = user_metadata.model_dump(exclude_none=True) if user_metadata else {} logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}") - cleaned = clean_metadata(raw_dict) - if not cleaned: - logger.info(f"[CELERY METADATA] Cleaned metadata is empty for end_user_id={end_user_id}") - return {"status": "SUCCESS", "result": "empty_after_cleaning"} - + cleaned = clean_metadata(raw_dict) if raw_dict else {} logger.info(f"[CELERY METADATA] 清洗后元数据: {json.dumps(cleaned, ensure_ascii=False)}") - validated = validate_metadata(cleaned) - if not validated: - return {"status": "FAILURE", "error": "Metadata validation failed after cleaning"} - - # 直接覆盖写入(LLM 已完成语义合并,输出的是完整结果) - # 保留 _updated_at 时间戳追踪 from datetime import datetime as dt, timezone as tz now = dt.now(tz.utc).isoformat() + # 过滤别名中的占位名称,执行增量增删 + _PLACEHOLDER_NAMES = {"用户", "我", "user", "i"} + + def _filter_aliases(aliases_list): + seen = set() + result = [] + for a in aliases_list: + a_stripped = a.strip() + if a_stripped and a_stripped.lower() not in _PLACEHOLDER_NAMES and a_stripped.lower() not in seen: + result.append(a_stripped) + seen.add(a_stripped.lower()) + return result + + filtered_add = _filter_aliases(aliases_to_add) + filtered_remove = _filter_aliases(aliases_to_remove) + remove_lower = {a.lower() for a in filtered_remove} + with get_db_context() as db: end_user_uuid = uuid.UUID(end_user_id) info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) + end_user = EndUserRepository(db).get_by_id(end_user_uuid) if info: - existing_meta = info.meta_data if info.meta_data else {} - logger.info(f"[CELERY METADATA] 数据库已有元数据: {json.dumps(existing_meta, ensure_ascii=False)}") + # 元数据覆盖写入 + if cleaned: + existing_meta = info.meta_data if info.meta_data else {} + updated_at = dict(existing_meta.get("_updated_at", {})) + _update_timestamps(existing_meta, cleaned, updated_at, now) + final = dict(cleaned) + final["_updated_at"] = updated_at + info.meta_data = final + logger.info("[CELERY METADATA] 覆盖写入元数据") - # 保留已有的 _updated_at,更新变更字段的时间戳 - updated_at = dict(existing_meta.get("_updated_at", {})) - _update_timestamps(existing_meta, cleaned, updated_at, now) + # 别名增量增删:(已有 - remove) + add + old_aliases = info.aliases if info.aliases else [] + # 先移除 + merged = [a for a in old_aliases if a.strip().lower() not in remove_lower] + # 再追加(去重) + existing_lower = {a.strip().lower() for a in merged} + for a in filtered_add: + if a.lower() not in existing_lower: + merged.append(a) + existing_lower.add(a.lower()) - final = dict(cleaned) - final["_updated_at"] = updated_at - info.meta_data = final - logger.info(f"[CELERY METADATA] 覆盖写入元数据: {json.dumps(final, ensure_ascii=False)}") + if merged != old_aliases: + info.aliases = merged + # other_name 更新逻辑 + if merged and ( + not info.other_name + or info.other_name.strip().lower() in _PLACEHOLDER_NAMES + or info.other_name.strip().lower() in remove_lower + ): + info.other_name = merged[0] + if end_user and merged and ( + not end_user.other_name + or end_user.other_name.strip().lower() in _PLACEHOLDER_NAMES + or end_user.other_name.strip().lower() in remove_lower + ): + end_user.other_name = merged[0] + logger.info( + f"[CELERY METADATA] 别名增量更新: {old_aliases} - {filtered_remove} + {filtered_add} → {merged}" + ) else: - logger.info( - f"[CELERY METADATA] No end_user_info record for end_user_id={end_user_id}, " - f"skipping metadata write (will be created by alias sync)" - ) - return {"status": "SUCCESS", "result": "no_info_record"} + # 没有 end_user_info 记录,创建一条 + from app.models.end_user_info_model import EndUserInfo + initial_aliases = filtered_add # 新记录只有 add,没有 remove + first_alias = initial_aliases[0] if initial_aliases else "" + if first_alias or cleaned: + new_info = EndUserInfo( + end_user_id=end_user_uuid, + other_name=first_alias or "", + aliases=initial_aliases, + meta_data=cleaned if cleaned else None, + ) + db.add(new_info) + if end_user and first_alias and ( + not end_user.other_name or end_user.other_name.strip().lower() in _PLACEHOLDER_NAMES + ): + end_user.other_name = first_alias + logger.info(f"[CELERY METADATA] 创建 end_user_info: other_name={first_alias}, aliases={initial_aliases}") + else: + return {"status": "SUCCESS", "result": "no_data_to_write"} db.commit() - return {"status": "SUCCESS", "result": "metadata_written"} + # 同步 PgSQL aliases 到 Neo4j 用户实体(PgSQL 为权威源) + final_aliases = info.aliases if info else initial_aliases + if final_aliases: + try: + from app.repositories.neo4j.neo4j_connector import Neo4jConnector + neo4j_connector = Neo4jConnector() + cypher = """ + MATCH (e:ExtractedEntity) + WHERE e.end_user_id = $end_user_id AND e.name IN ['用户', '我', 'User', 'I'] + SET e.aliases = $aliases + """ + await neo4j_connector.execute_query( + cypher, end_user_id=end_user_id, aliases=final_aliases + ) + await neo4j_connector.close() + logger.info(f"[CELERY METADATA] Neo4j 用户实体 aliases 已同步: {final_aliases}") + except Exception as neo4j_err: + logger.warning(f"[CELERY METADATA] Neo4j aliases 同步失败(不影响主流程): {neo4j_err}") + + return {"status": "SUCCESS", "result": "metadata_and_aliases_written"} loop = None try: From fc5ce63e440bcbcd30a4cd6c222652b096baaf1a Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 9 Apr 2026 21:57:17 +0800 Subject: [PATCH 04/20] fix:Remove "total" --- api/app/services/memory_dashboard_service.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/app/services/memory_dashboard_service.py b/api/app/services/memory_dashboard_service.py index b390aa10..a01b1d00 100644 --- a/api/app/services/memory_dashboard_service.py +++ b/api/app/services/memory_dashboard_service.py @@ -803,7 +803,6 @@ def get_rag_content( "page": { "page": page, "pagesize": pagesize, - "total": 0, "hasnext": False, }, "items": [] @@ -897,13 +896,12 @@ def get_rag_content( "page": { "page": page, "pagesize": pagesize, - "total": global_total, "hasnext": offset_end < global_total, }, "items": conversations } - business_logger.info(f"成功获取RAG内容: total={global_total}, page={page}, 返回={len(conversations)} 条对话") + business_logger.info(f"成功获取RAG内容: page={page}, 返回={len(conversations)} 条对话") return result except Exception as e: From e0b7e95af6501c9f00cc9cbea64de7a1b2179de8 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 10 Apr 2026 00:29:18 +0800 Subject: [PATCH 05/20] refactor(memory): remove first-person pronoun replacement and inline metadata utils - Remove _replace_first_person_with_user from StatementExtractor to preserve original user text for downstream metadata/alias extraction - Delete metadata_utils.py module, inline clean_metadata into Celery task - Remove unused imports and commented-out collect_user_raw_messages method - Apply formatting cleanup across metadata models and extraction orchestrator --- api/app/core/memory/models/metadata_models.py | 29 +++++++---- .../extraction_orchestrator.py | 42 +++++++++++----- .../metadata_extractor.py | 49 +++++-------------- .../statement_extraction.py | 22 +-------- api/app/core/memory/utils/metadata_utils.py | 45 ----------------- api/app/tasks.py | 9 +++- 6 files changed, 69 insertions(+), 127 deletions(-) delete mode 100644 api/app/core/memory/utils/metadata_utils.py diff --git a/api/app/core/memory/models/metadata_models.py b/api/app/core/memory/models/metadata_models.py index f08d18ed..55c2359e 100644 --- a/api/app/core/memory/models/metadata_models.py +++ b/api/app/core/memory/models/metadata_models.py @@ -11,16 +11,22 @@ from pydantic import BaseModel, ConfigDict, Field class UserMetadataProfile(BaseModel): """用户画像信息""" - model_config = ConfigDict(extra='ignore') + + model_config = ConfigDict(extra="ignore") role: str = Field(default="", description="用户职业或角色") domain: str = Field(default="", description="用户所在领域") - expertise: List[str] = Field(default_factory=list, description="用户擅长的技能或工具") - interests: List[str] = Field(default_factory=list, description="用户关注的话题或领域标签") + expertise: List[str] = Field( + default_factory=list, description="用户擅长的技能或工具" + ) + interests: List[str] = Field( + default_factory=list, description="用户关注的话题或领域标签" + ) class UserMetadataBehavioralHints(BaseModel): """行为偏好""" - model_config = ConfigDict(extra='ignore') + + model_config = ConfigDict(extra="ignore") learning_stage: str = Field(default="", description="学习阶段") preferred_depth: str = Field(default="", description="偏好深度") tone_preference: str = Field(default="", description="语气偏好") @@ -28,21 +34,24 @@ class UserMetadataBehavioralHints(BaseModel): class UserMetadata(BaseModel): """用户元数据顶层结构""" - model_config = ConfigDict(extra='ignore') + + model_config = ConfigDict(extra="ignore") profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile) - behavioral_hints: UserMetadataBehavioralHints = Field(default_factory=UserMetadataBehavioralHints) + behavioral_hints: UserMetadataBehavioralHints = Field( + default_factory=UserMetadataBehavioralHints + ) knowledge_tags: List[str] = Field(default_factory=list, description="知识标签") class MetadataExtractionResponse(BaseModel): """元数据提取 LLM 响应结构""" - model_config = ConfigDict(extra='ignore') + + model_config = ConfigDict(extra="ignore") user_metadata: UserMetadata = Field(default_factory=UserMetadata) aliases_to_add: List[str] = Field( default_factory=list, - description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)" + description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)", ) aliases_to_remove: List[str] = Field( - default_factory=list, - description="用户明确否认的别名(如'我不叫XX了')" + default_factory=list, description="用户明确否认的别名(如'我不叫XX了')" ) diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index b8a36e44..5636dcb5 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -314,28 +314,48 @@ class ExtractionOrchestrator: # 步骤 7: 触发异步元数据和别名提取(仅正式模式) if not is_pilot_run: try: - from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor - metadata_extractor = MetadataExtractor(llm_client=self.llm_client, language=self.language) - user_statements = metadata_extractor.collect_user_related_statements( - entity_nodes, statement_nodes, - statement_entity_edges + from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import ( + MetadataExtractor, + ) + + metadata_extractor = MetadataExtractor( + llm_client=self.llm_client, language=self.language + ) + user_statements = ( + metadata_extractor.collect_user_related_statements( + entity_nodes, statement_nodes, statement_entity_edges + ) ) if user_statements: - end_user_id = dialog_data_list[0].end_user_id if dialog_data_list else None - config_id = dialog_data_list[0].config_id if dialog_data_list and hasattr(dialog_data_list[0], 'config_id') else None + end_user_id = ( + dialog_data_list[0].end_user_id + if dialog_data_list + else None + ) + config_id = ( + dialog_data_list[0].config_id + if dialog_data_list + and hasattr(dialog_data_list[0], "config_id") + else None + ) if end_user_id: from app.tasks import extract_user_metadata_task + extract_user_metadata_task.delay( end_user_id=str(end_user_id), statements=user_statements, config_id=str(config_id) if config_id else None, language=self.language, ) - logger.info(f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement") + logger.info( + f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement" + ) else: logger.info("未找到用户相关 statement,跳过元数据提取") except Exception as e: - logger.error(f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True) + logger.error( + f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True + ) # 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行 @@ -1501,9 +1521,6 @@ class ExtractionOrchestrator: except Exception as e: logger.error(f"更新 end_user other_name 失败: {e}", exc_info=True) - - - # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 # 复用 deduped_and_disamb 模块级常量,避免重复维护 USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES @@ -1610,7 +1627,6 @@ class ExtractionOrchestrator: if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES: return None return candidate - return None async def _run_dedup_and_write_summary( diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py index cc8c6073..8b749c40 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py @@ -13,10 +13,6 @@ from app.core.memory.models.graph_models import ( StatementEntityEdge, StatementNode, ) -from app.core.memory.models.metadata_models import ( - UserMetadata, -) -from app.core.memory.models.message_models import DialogData logger = logging.getLogger(__name__) @@ -45,40 +41,12 @@ class MetadataExtractor: 如果文本中包含中文字符则返回 "zh",否则返回 "en"。 """ import re + combined = " ".join(statements) - if re.search(r'[\u4e00-\u9fff]', combined): + if re.search(r"[\u4e00-\u9fff]", combined): return "zh" return "en" - # def collect_user_raw_messages( - # self, - # dialog_data_list: List[DialogData], - # ) -> List[str]: - # """ - # 从原始对话数据中提取 speaker="user" 的消息原文。 - - # 直接使用用户的原始输入,不经过陈述句提取阶段的 LLM 改写, - # 避免第一人称被替换为第三人称导致元数据/别名提取错误。 - - # Returns: - # 用户原始消息文本列表 - # """ - # result = [] - # for dialog in dialog_data_list: - # if not dialog.context or not dialog.context.msgs: - # continue - # for msg in dialog.context.msgs: - # if getattr(msg, 'role', '') == 'user': - # text = (getattr(msg, 'msg', '') or '').strip() - # if text: - # result.append(text) - - # logger.info(f"收集到 {len(result)} 条用户原始消息") - # if result: - # for i, text in enumerate(result): - # logger.info(f" [user message {i+1}] {text[:200]}") - # return result - def collect_user_related_statements( self, entity_nodes: List[ExtractedEntityNode], @@ -119,7 +87,7 @@ class MetadataExtractor: for stmt_node in statement_nodes: if stmt_node.id in target_stmt_ids and stmt_node.id not in seen: total_associated += 1 - speaker = getattr(stmt_node, 'speaker', None) or 'unknown' + speaker = getattr(stmt_node, "speaker", None) or "unknown" if speaker == "user": text = (stmt_node.statement or "").strip() if text: @@ -135,7 +103,7 @@ class MetadataExtractor: ) if result: for i, text in enumerate(result): - logger.info(f" [user statement {i+1}] {text}") + logger.info(f" [user statement {i + 1}] {text}") if total_associated > 0 and len(result) == 0: logger.warning( f"有 {total_associated} 条直接关联 statement 但全部被 speaker 过滤," @@ -178,7 +146,10 @@ class MetadataExtractor: json_schema="", ) - from app.core.memory.models.metadata_models import MetadataExtractionResponse + from app.core.memory.models.metadata_models import ( + MetadataExtractionResponse, + ) + response = await self.llm_client.response_structured( messages=[{"role": "user", "content": prompt}], response_model=MetadataExtractionResponse, @@ -187,7 +158,9 @@ class MetadataExtractor: if response: metadata = response.user_metadata if response.user_metadata else None to_add = response.aliases_to_add if response.aliases_to_add else [] - to_remove = response.aliases_to_remove if response.aliases_to_remove else [] + to_remove = ( + response.aliases_to_remove if response.aliases_to_remove else [] + ) return metadata, to_add, to_remove logger.warning("LLM 返回的响应为空") diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py index 684ad556..d90a49ba 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py @@ -1,6 +1,5 @@ import asyncio import logging -import os from datetime import datetime from typing import Any, Dict, List, Optional @@ -82,17 +81,6 @@ class StatementExtractor: logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty") return None - @staticmethod - def _replace_first_person_with_user(text: str) -> str: - """将用户消息中的第一人称代词"我"替换为"用户"。 - - 替换规则: - - 所有独立的"我"都替换为"用户"(包括"我的"→"用户的"、"叫我"→"叫用户") - - 不替换"我们"中的"我"("我们"是复数,不指代用户个人) - """ - import re - result = re.sub(r'我(?!们)', '用户', text) - return result async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]: """Process a single chunk and return extracted statements @@ -106,18 +94,12 @@ class StatementExtractor: List of ExtractedStatement objects extracted from the chunk """ chunk_content = chunk.content - + chunk_speaker = self._get_speaker_from_chunk(chunk) + if not chunk_content or len(chunk_content.strip()) < 5: logger.warning(f"Chunk {chunk.id} content too short or empty, skipping") return [] - # 对 speaker="user" 的 chunk,将第一人称"我"替换为"用户", - # 避免 LLM 在陈述句提取时将"我"替换为具体名字(如"齐齐"), - # 导致下游元数据/别名提取无法识别第一人称语义。 - chunk_speaker = self._get_speaker_from_chunk(chunk) - if chunk_speaker == "user": - chunk_content = self._replace_first_person_with_user(chunk_content) - prompt_content = await render_statement_extraction_prompt( chunk_content=chunk_content, definitions=LABEL_DEFINITIONS, diff --git a/api/app/core/memory/utils/metadata_utils.py b/api/app/core/memory/utils/metadata_utils.py deleted file mode 100644 index 69bd8edf..00000000 --- a/api/app/core/memory/utils/metadata_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Metadata utility functions for cleaning and validating user metadata. -""" - -import logging -from typing import Optional - -from app.core.memory.models.metadata_models import UserMetadata - -logger = logging.getLogger(__name__) - - -def clean_metadata(raw: dict) -> dict: - """ - Clean metadata by removing empty string values and empty array fields recursively. - Only keeps fields with actual content. If a nested dict becomes empty after cleaning, - it is removed too. - """ - cleaned = {} - for key, value in raw.items(): - if isinstance(value, dict): - nested = clean_metadata(value) - if nested: - cleaned[key] = nested - elif isinstance(value, list): - if len(value) > 0: - cleaned[key] = value - elif isinstance(value, str): - if value != "": - cleaned[key] = value - else: - cleaned[key] = value - return cleaned - - -def validate_metadata(raw: dict) -> Optional[UserMetadata]: - """ - Validate metadata structure using the Pydantic UserMetadata model. - Returns None and logs a WARNING on validation failure. - """ - try: - return UserMetadata.model_validate(raw) - except Exception as e: - logger.warning("Metadata validation failed: %s", e) - return None diff --git a/api/app/tasks.py b/api/app/tasks.py index 3641f438..6fd5f8d6 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -2969,7 +2969,6 @@ def extract_user_metadata_task( async def _run() -> Dict[str, Any]: from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor - from app.core.memory.utils.metadata_utils import clean_metadata, validate_metadata from app.repositories.end_user_info_repository import EndUserInfoRepository from app.repositories.end_user_repository import EndUserRepository from app.services.memory_config_service import MemoryConfigService @@ -3029,6 +3028,14 @@ def extract_user_metadata_task( logger.info(f"[CELERY METADATA] LLM 别名新增: {aliases_to_add}, 移除: {aliases_to_remove}") # 4. 清洗元数据、覆盖写入元数据和别名 + def clean_metadata(raw: dict) -> dict: + """递归移除空字符串、空列表、空字典。""" + return { + k: (cleaned if isinstance(v, dict) and (cleaned := clean_metadata(v)) else v) + for k, v in raw.items() + if not (v == "" or v == [] or (isinstance(v, dict) and not clean_metadata(v))) + } + raw_dict = user_metadata.model_dump(exclude_none=True) if user_metadata else {} logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}") From cd018814feba3fca3ac3d5f6beadb555fc272739 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 10 Apr 2026 00:42:11 +0800 Subject: [PATCH 06/20] fix(memory): improve metadata language detection and clean_metadata logic - Make MetadataExtractor language param optional (default None) to support auto-detection fallback when no language is explicitly set - Refactor clean_metadata from walrus-operator dict comprehension to explicit loop for correctness and readability --- .../knowledge_extraction/metadata_extractor.py | 10 +++++++--- api/app/tasks.py | 16 +++++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py index 8b749c40..19f1e533 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py @@ -31,7 +31,7 @@ def _is_user_entity(ent: ExtractedEntityNode) -> bool: class MetadataExtractor: """Extracts user metadata from post-dedup graph data via independent LLM call.""" - def __init__(self, llm_client, language: str = "zh"): + def __init__(self, llm_client, language: Optional[str] = None): self.llm_client = llm_client self.language = language @@ -134,8 +134,12 @@ class MetadataExtractor: try: from app.core.memory.utils.prompt.prompt_utils import prompt_env - detected_language = self.detect_language(statements) - logger.info(f"元数据提取语言检测结果: {detected_language}") + if self.language: + detected_language = self.language + logger.info(f"元数据提取使用显式指定语言: {detected_language}") + else: + detected_language = self.detect_language(statements) + logger.info(f"元数据提取语言自动检测结果: {detected_language}") template = prompt_env.get_template("extract_user_metadata.jinja2") prompt = template.render( diff --git a/api/app/tasks.py b/api/app/tasks.py index 6fd5f8d6..9afb6225 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -3030,11 +3030,17 @@ def extract_user_metadata_task( # 4. 清洗元数据、覆盖写入元数据和别名 def clean_metadata(raw: dict) -> dict: """递归移除空字符串、空列表、空字典。""" - return { - k: (cleaned if isinstance(v, dict) and (cleaned := clean_metadata(v)) else v) - for k, v in raw.items() - if not (v == "" or v == [] or (isinstance(v, dict) and not clean_metadata(v))) - } + result = {} + for k, v in raw.items(): + if v == "" or v == []: + continue + if isinstance(v, dict): + cleaned = clean_metadata(v) + if cleaned: + result[k] = cleaned + else: + result[k] = v + return result raw_dict = user_metadata.model_dump(exclude_none=True) if user_metadata else {} logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}") From 627d6a038153b6919f9fe5f69ebb82400bf06b0a Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 10 Apr 2026 10:43:43 +0800 Subject: [PATCH 07/20] fix : add comments --- api/app/repositories/neo4j/graph_saver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/app/repositories/neo4j/graph_saver.py b/api/app/repositories/neo4j/graph_saver.py index ae76b9f6..56feece2 100644 --- a/api/app/repositories/neo4j/graph_saver.py +++ b/api/app/repositories/neo4j/graph_saver.py @@ -186,7 +186,7 @@ async def save_dialog_and_statements_to_neo4j( Returns: bool: True if successful, False otherwise """ - + # TODO 需要在去重消歧节阶段,做以下逻辑的处理 # 预处理:对特殊实体("用户"、"AI助手")复用 Neo4j 中已有节点的 ID, # 确保同一个 end_user_id 下只有一个"用户"节点和一个"AI助手"节点。 if entity_nodes: From fd05c000f6524f3658cb4046ee9f412eda710b05 Mon Sep 17 00:00:00 2001 From: wxy Date: Fri, 10 Apr 2026 11:04:59 +0800 Subject: [PATCH 08/20] feat(api): Support specifying app version for chat --- .../controllers/service/app_api_controller.py | 26 +++++++++++++++++++ api/app/schemas/app_schema.py | 1 + 2 files changed, 27 insertions(+) diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py index 93caa200..f52c9338 100644 --- a/api/app/controllers/service/app_api_controller.py +++ b/api/app/controllers/service/app_api_controller.py @@ -86,10 +86,36 @@ async def chat( app_service: Annotated[AppService, Depends(get_app_service)] = None, message: str = Body(..., description="聊天消息内容"), ): + """ + Agent/Workflow 聊天接口 + + - 不传 version:使用当前发布版本(current_release) + - 传 version=N:使用指定版本号的历史快照,例如 /v1/app/chat?version=2 + """ body = await request.json() payload = AppChatRequest(**body) app = app_service.get_app(api_key_auth.resource_id, api_key_auth.workspace_id) + + # 版本切换:指定 version 时查找对应历史快照 + if payload.version is not None: + from sqlalchemy import select as _select + from app.models.app_release_model import AppRelease as _AppRelease + release = db.scalars( + _select(_AppRelease).where( + _AppRelease.app_id == app.id, + _AppRelease.version == payload.version, + _AppRelease.is_active.is_(True), + ) + ).first() + if not release: + raise BusinessException( + f"版本 {payload.version} 不存在或已下线", + BizCode.AGENT_CONFIG_MISSING, + ) + # 临时替换 current_release,后续逻辑无需改动 + app.current_release = release + app.current_release_id = release.id other_id = payload.user_id workspace_id = api_key_auth.workspace_id end_user_repo = EndUserRepository(db) diff --git a/api/app/schemas/app_schema.py b/api/app/schemas/app_schema.py index 85cff671..130bff91 100644 --- a/api/app/schemas/app_schema.py +++ b/api/app/schemas/app_schema.py @@ -616,6 +616,7 @@ class AppChatRequest(BaseModel): stream: bool = Field(default=False, description="是否流式返回") thinking: bool = Field(default=False, description="是否启用深度思考(需Agent配置支持)") files: List[FileInput] = Field(default_factory=list, description="附件列表(支持多文件)") + version: Optional[int] = Field(default=None, description="指定发布版本号,不传则使用当前发布版本") class DraftRunRequest(BaseModel): From 90e8e9052861c58dd9e2b5906c08076c64eda559 Mon Sep 17 00:00:00 2001 From: wxy Date: Fri, 10 Apr 2026 11:11:39 +0800 Subject: [PATCH 09/20] feat(api): Support specifying app version for chat --- api/app/controllers/service/app_api_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py index f52c9338..75c8798d 100644 --- a/api/app/controllers/service/app_api_controller.py +++ b/api/app/controllers/service/app_api_controller.py @@ -89,8 +89,8 @@ async def chat( """ Agent/Workflow 聊天接口 - - 不传 version:使用当前发布版本(current_release) - - 传 version=N:使用指定版本号的历史快照,例如 /v1/app/chat?version=2 + - 不传 version:使用当前生效版本(current_release,回滚后为回滚目标版本) + - 传 version=N:使用指定版本号的历史快照,例如 {"version": 2} """ body = await request.json() payload = AppChatRequest(**body) From 412183c3590507efc22c142f859b25385bf2fe49 Mon Sep 17 00:00:00 2001 From: wxy Date: Fri, 10 Apr 2026 11:44:50 +0800 Subject: [PATCH 10/20] feat(api): Support specifying app version for chat --- api/app/controllers/service/app_api_controller.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py index 75c8798d..3130a888 100644 --- a/api/app/controllers/service/app_api_controller.py +++ b/api/app/controllers/service/app_api_controller.py @@ -111,7 +111,7 @@ async def chat( if not release: raise BusinessException( f"版本 {payload.version} 不存在或已下线", - BizCode.AGENT_CONFIG_MISSING, + BizCode.RELEASE_NOT_FOUND, ) # 临时替换 current_release,后续逻辑无需改动 app.current_release = release @@ -328,6 +328,4 @@ async def chat( msg="工作流任务执行成功" ) else: - from app.core.exceptions import BusinessException - from app.core.error_codes import BizCode raise BusinessException(f"不支持的应用类型: {app_type}", BizCode.APP_TYPE_NOT_SUPPORTED) From d517bceda28d62e61367eabfbc87fcf1d139302b Mon Sep 17 00:00:00 2001 From: zhaoying Date: Fri, 10 Apr 2026 12:03:02 +0800 Subject: [PATCH 11/20] fix(web): object/array[object] add format check --- web/src/i18n/en.ts | 1 + web/src/i18n/zh.ts | 1 + .../AddChatVariable/ChatVariableModal.tsx | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/web/src/i18n/en.ts b/web/src/i18n/en.ts index 3d24c0d0..d7b7c617 100644 --- a/web/src/i18n/en.ts +++ b/web/src/i18n/en.ts @@ -2239,6 +2239,7 @@ Memory Bear: After the rebellion, regional warlordism intensified for several re addvariable: 'Chat Variables', addChatVariable: 'Add Chat Variable', editChatVariable: 'Edit Chat Variable', + invalidJSON: 'Invalid JSON format', config: { llm: { diff --git a/web/src/i18n/zh.ts b/web/src/i18n/zh.ts index f5e3653a..55860e46 100644 --- a/web/src/i18n/zh.ts +++ b/web/src/i18n/zh.ts @@ -2200,6 +2200,7 @@ export const zh = { addvariable: '会话变量', addChatVariable: '添加会话变量', editChatVariable: '编辑会话变量', + invalidJSON: 'JSON 格式不正确', config: { llm: { diff --git a/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx b/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx index ff773a9e..e948aabf 100644 --- a/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx +++ b/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx @@ -124,7 +124,7 @@ const ChatVariableModal = forwardRef ) : ( - + { + if (!value) return Promise.resolve(); + try { JSON.parse(value); return Promise.resolve(); } + catch { return Promise.reject(t('workflow.invalidJSON')); } + } + } : {} + ]} + > {type === 'number' ? : type === 'boolean' From c253968aa8b516d1b904ad326082d0d995c63fd3 Mon Sep 17 00:00:00 2001 From: wxy Date: Fri, 10 Apr 2026 12:10:24 +0800 Subject: [PATCH 12/20] feat(api): Support specifying app version for chat --- .../controllers/service/app_api_controller.py | 52 ++++++------------- api/app/repositories/app_repository.py | 12 +++++ api/app/services/app_service.py | 22 ++++++++ 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py index 3130a888..b7cc6942 100644 --- a/api/app/controllers/service/app_api_controller.py +++ b/api/app/controllers/service/app_api_controller.py @@ -14,6 +14,7 @@ from app.core.response_utils import success from app.db import get_db from app.models.app_model import App from app.models.app_model import AppType +from app.models.app_release_model import AppRelease from app.repositories import knowledge_repository from app.repositories.end_user_repository import EndUserRepository from app.schemas import AppChatRequest, conversation_schema @@ -61,18 +62,11 @@ async def list_apps(): # return success(data={"received": True}, msg="消息已接收") -def _checkAppConfig(app: App): - if app.type == AppType.AGENT: - if not app.current_release.config: - raise BusinessException("Agent 应用未配置模型", BizCode.AGENT_CONFIG_MISSING) - elif app.type == AppType.MULTI_AGENT: - if not app.current_release.config: - raise BusinessException("Multi-Agent 应用未配置模型", BizCode.AGENT_CONFIG_MISSING) - elif app.type == AppType.WORKFLOW: - if not app.current_release.config: - raise BusinessException("工作流应用未配置模型", BizCode.AGENT_CONFIG_MISSING) - else: - raise BusinessException("不支持的应用类型", BizCode.AGENT_CONFIG_MISSING) +def _checkAppConfig(release: AppRelease): + if not release.config: + raise BusinessException("应用未配置,无法使用", BizCode.AGENT_CONFIG_MISSING) + if release.type not in (AppType.AGENT, AppType.MULTI_AGENT, AppType.WORKFLOW): + raise BusinessException("不支持的应用类型", BizCode.APP_TYPE_NOT_SUPPORTED) @router.post("/chat") @@ -97,25 +91,11 @@ async def chat( app = app_service.get_app(api_key_auth.resource_id, api_key_auth.workspace_id) - # 版本切换:指定 version 时查找对应历史快照 + # 版本切换:指定 version 时查找对应历史快照,否则使用当前激活版本 if payload.version is not None: - from sqlalchemy import select as _select - from app.models.app_release_model import AppRelease as _AppRelease - release = db.scalars( - _select(_AppRelease).where( - _AppRelease.app_id == app.id, - _AppRelease.version == payload.version, - _AppRelease.is_active.is_(True), - ) - ).first() - if not release: - raise BusinessException( - f"版本 {payload.version} 不存在或已下线", - BizCode.RELEASE_NOT_FOUND, - ) - # 临时替换 current_release,后续逻辑无需改动 - app.current_release = release - app.current_release_id = release.id + active_release = app_service.get_release_by_version(app.id, payload.version) + else: + active_release = app.current_release other_id = payload.user_id workspace_id = api_key_auth.workspace_id end_user_repo = EndUserRepository(db) @@ -153,7 +133,7 @@ async def chat( storage_type = 'neo4j' app_type = app.type # check app config - _checkAppConfig(app) + _checkAppConfig(active_release) # 获取或创建会话(提前验证) conversation = conversation_service.create_or_get_conversation( @@ -168,7 +148,7 @@ async def chat( # print("="*50) # print(app.current_release.default_model_config_id) - agent_config = agent_config_4_app_release(app.current_release) + agent_config = agent_config_4_app_release(active_release) # print(agent_config.default_model_config_id) # thinking 开关:仅当 agent 配置了 deep_thinking 且请求 thinking=True 时才启用 @@ -220,7 +200,7 @@ async def chat( return success(data=conversation_schema.ChatResponse(**result).model_dump(mode="json")) elif app_type == AppType.MULTI_AGENT: # 多 Agent 流式返回 - config = multi_agent_config_4_app_release(app.current_release) + config = multi_agent_config_4_app_release(active_release) if payload.stream: async def event_generator(): async for event in app_chat_service.multi_agent_chat_stream( @@ -263,7 +243,7 @@ async def chat( return success(data=conversation_schema.ChatResponse(**result).model_dump(mode="json")) elif app_type == AppType.WORKFLOW: # 多 Agent 流式返回 - config = workflow_config_4_app_release(app.current_release) + config = workflow_config_4_app_release(active_release) if payload.stream: async def event_generator(): async for event in app_chat_service.workflow_chat_stream( @@ -279,7 +259,7 @@ async def chat( user_rag_memory_id=user_rag_memory_id, app_id=app.id, workspace_id=workspace_id, - release_id=app.current_release.id, + release_id=active_release.id, public=True ): event_type = event.get("event", "message") @@ -314,7 +294,7 @@ async def chat( files=payload.files, app_id=app.id, workspace_id=workspace_id, - release_id=app.current_release.id + release_id=active_release.id ) logger.debug( "工作流试运行返回结果", diff --git a/api/app/repositories/app_repository.py b/api/app/repositories/app_repository.py index 75a91fd6..3eef99f8 100644 --- a/api/app/repositories/app_repository.py +++ b/api/app/repositories/app_repository.py @@ -61,3 +61,15 @@ def get_apps_by_id(db: Session, app_id: uuid.UUID) -> App: """根据工作空间ID查询应用""" repo = AppRepository(db) return repo.get_apps_by_id(app_id) + + +def get_release_by_version(db: Session, app_id: uuid.UUID, version: int): + """根据版本号查询发布快照(仅返回激活状态)""" + from app.models.app_release_model import AppRelease + return db.scalars( + select(AppRelease).where( + AppRelease.app_id == app_id, + AppRelease.version == version, + AppRelease.is_active.is_(True), + ) + ).first() diff --git a/api/app/services/app_service.py b/api/app/services/app_service.py index 5e26a629..b05e9621 100644 --- a/api/app/services/app_service.py +++ b/api/app/services/app_service.py @@ -619,6 +619,28 @@ class AppService: self._validate_app_accessible(app, workspace_id) return app + def get_release_by_version(self, app_id: uuid.UUID, version: int) -> AppRelease: + """按版本号获取发布快照 + + Args: + app_id: 应用ID + version: 版本号(整数,按应用内递增) + + Returns: + AppRelease: 发布快照 + + Raises: + BusinessException: 版本不存在或已下线 + """ + from app.repositories.app_repository import get_release_by_version + release = get_release_by_version(self.db, app_id, version) + if not release: + raise BusinessException( + f"版本 {version} 不存在或已下线", + BizCode.RELEASE_NOT_FOUND, + ) + return release + def create_app( self, *, From 72fe3962cf40e9562c5807786ea1580a967f2cec Mon Sep 17 00:00:00 2001 From: wxy Date: Fri, 10 Apr 2026 12:18:11 +0800 Subject: [PATCH 13/20] feat(api): Support specifying app version for chat --- api/app/controllers/service/app_api_controller.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py index b7cc6942..d9539a4d 100644 --- a/api/app/controllers/service/app_api_controller.py +++ b/api/app/controllers/service/app_api_controller.py @@ -63,9 +63,16 @@ async def list_apps(): def _checkAppConfig(release: AppRelease): - if not release.config: - raise BusinessException("应用未配置,无法使用", BizCode.AGENT_CONFIG_MISSING) - if release.type not in (AppType.AGENT, AppType.MULTI_AGENT, AppType.WORKFLOW): + if release.type == AppType.AGENT: + if not release.config: + raise BusinessException("Agent 应用未配置模型", BizCode.AGENT_CONFIG_MISSING) + elif release.type == AppType.MULTI_AGENT: + if not release.config: + raise BusinessException("Multi-Agent 应用未配置模型", BizCode.AGENT_CONFIG_MISSING) + elif release.type == AppType.WORKFLOW: + if not release.config: + raise BusinessException("工作流应用未配置模型", BizCode.AGENT_CONFIG_MISSING) + else: raise BusinessException("不支持的应用类型", BizCode.APP_TYPE_NOT_SUPPORTED) From 4ce6fede67f6251a8260aadd87d60532e61825d9 Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Fri, 10 Apr 2026 14:08:51 +0800 Subject: [PATCH 14/20] fix(workflow): update cycle graph node output type validation --- api/app/core/workflow/nodes/cycle_graph/node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/app/core/workflow/nodes/cycle_graph/node.py b/api/app/core/workflow/nodes/cycle_graph/node.py index fc80939f..68c83025 100644 --- a/api/app/core/workflow/nodes/cycle_graph/node.py +++ b/api/app/core/workflow/nodes/cycle_graph/node.py @@ -55,9 +55,9 @@ class CycleGraphNode(BaseNode): if config.output_type in [ VariableType.ARRAY_FILE, VariableType.ARRAY_STRING, - VariableType.NUMBER, + VariableType.ARRAY_NUMBER, VariableType.ARRAY_OBJECT, - VariableType.BOOLEAN + VariableType.ARRAY_BOOLEAN ]: if config.flatten: outputs['output'] = config.output_type From 068e2bfb7e0f266103d9698218e7e05d84e41771 Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Fri, 10 Apr 2026 15:24:18 +0800 Subject: [PATCH 15/20] fix(workflow): update output pattern to handle standalone curly braces --- api/app/core/workflow/engine/graph_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/app/core/workflow/engine/graph_builder.py b/api/app/core/workflow/engine/graph_builder.py index daef6e82..4f76d694 100644 --- a/api/app/core/workflow/engine/graph_builder.py +++ b/api/app/core/workflow/engine/graph_builder.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) # Example: # "Hello {{user.name}}!" -> # ["Hello ", "{{user.name}}", "!"] -_OUTPUT_PATTERN = re.compile(r'\{\{.*?}}|[^{}]+') +_OUTPUT_PATTERN = re.compile(r'\{\{.*?}}|[^{]+|{') # Strict variable format: {{ node_id.field_name }} _VARIABLE_PATTERN = re.compile(r'\{\{\s*[a-zA-Z0-9_]+\.[a-zA-Z0-9_]+\s*}}') From 9a926a8398cd1d512d2ace38f9bccd2854c720a4 Mon Sep 17 00:00:00 2001 From: zhaoying Date: Fri, 10 Apr 2026 16:24:36 +0800 Subject: [PATCH 16/20] feat(web): workflow variable type --- web/src/i18n/en.ts | 5 ++ web/src/i18n/zh.ts | 5 ++ .../AddChatVariable/ChatVariableModal.tsx | 2 +- .../Workflow/components/PortClickHandler.tsx | 2 +- .../components/Properties/CaseList/index.tsx | 78 +++++++++++++---- .../Properties/ConditionList/index.tsx | 54 ++++++++++-- .../Properties/CycleVarsList/index.tsx | 14 ++++ .../Properties/HttpRequest/EditableTable.tsx | 45 +++++++++- .../Properties/HttpRequest/index.tsx | 76 +++++++++++++++-- .../Properties/ToolConfig/index.tsx | 59 +++++++------ .../Workflow/components/Properties/index.tsx | 83 +++++++++++++++++-- 11 files changed, 356 insertions(+), 67 deletions(-) diff --git a/web/src/i18n/en.ts b/web/src/i18n/en.ts index d7b7c617..6e7f1465 100644 --- a/web/src/i18n/en.ts +++ b/web/src/i18n/en.ts @@ -2342,6 +2342,11 @@ Memory Bear: After the rebellion, regional warlordism intensified for several re "eq": 'Is', "ne": 'Is Not', }, + file: { + "empty": 'Not Exist', + "not_empty": 'Exists', + eq: 'All Are' + }, else_desc: 'Used to define the logic that should be executed when the if condition is not met.', unset: 'Condition Not Set', set: 'Set', diff --git a/web/src/i18n/zh.ts b/web/src/i18n/zh.ts index 55860e46..acd06cdc 100644 --- a/web/src/i18n/zh.ts +++ b/web/src/i18n/zh.ts @@ -2303,6 +2303,11 @@ export const zh = { "eq": '是', "ne": '不是', }, + file: { + "empty": '不存在', + "not_empty": '存在', + eq: '全都是' + }, else_desc: '用于定义当 if 条件不满足时应执行的逻辑。', unset: '条件未设置', set: '已设置', diff --git a/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx b/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx index e948aabf..5666f3ab 100644 --- a/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx +++ b/web/src/views/Workflow/components/AddChatVariable/ChatVariableModal.tsx @@ -5,7 +5,7 @@ * @Last Modified time: 2026-04-08 11:05:34 */ import { forwardRef, useImperativeHandle, useState, useRef, useMemo } from 'react'; -import { Form, Input, Select, InputNumber, Button, Row, Col, Flex, Spin } from 'antd'; +import { Form, Input, Select, InputNumber, Button, Row, Col, Flex } from 'antd'; import { PlusOutlined } from '@ant-design/icons'; import { useTranslation } from 'react-i18next'; diff --git a/web/src/views/Workflow/components/PortClickHandler.tsx b/web/src/views/Workflow/components/PortClickHandler.tsx index b556ffab..cb3e16c4 100644 --- a/web/src/views/Workflow/components/PortClickHandler.tsx +++ b/web/src/views/Workflow/components/PortClickHandler.tsx @@ -328,7 +328,7 @@ const PortClickHandler: React.FC = ({ graph }) => { }; const content = ( - + {nodeLibrary.map((category) => { const sourceNodeData = sourceNode?.getData(); const isChildOfLoop = sourceNodeData?.cycle && graph?.getNodes().find((n: any) => n.getData()?.id === sourceNodeData.cycle && n.getData()?.type === 'loop'); diff --git a/web/src/views/Workflow/components/Properties/CaseList/index.tsx b/web/src/views/Workflow/components/Properties/CaseList/index.tsx index e1583ca0..40353f64 100644 --- a/web/src/views/Workflow/components/Properties/CaseList/index.tsx +++ b/web/src/views/Workflow/components/Properties/CaseList/index.tsx @@ -4,7 +4,7 @@ * @Last Modified by: ZhaoYing * @Last Modified time: 2026-03-25 15:23:45 */ -import { type FC } from 'react' +import { useMemo, type FC } from 'react' import clsx from 'clsx' import { useTranslation } from 'react-i18next'; import { Form, Button, Select, Space, Divider, InputNumber, type SelectProps, Flex, Row, Col } from 'antd' @@ -15,7 +15,7 @@ import Editor from '../../Editor' import { edgeAttrs, nodeWidth } from '../../../constant' import RbButton from '@/components/RbButton'; import RadioGroupBtn from '../RadioGroupBtn' -import { calcConditionNodeTotalHeight, getConditionNodeCasePortY } from '../../../utils' +import { calcConditionNodeTotalHeight, getConditionNodeCasePortY } from '../../../utils'; interface CaseListProps { value?: Array<{ logical_operator: 'and' | 'or'; expressions: { left: string; operator: string; right: string; input_type?: string; }[] }>; @@ -49,6 +49,34 @@ const operatorsObj: { [key: string]: SelectProps['options'] } = { boolean: [ { value: 'eq', label: 'workflow.config.if-else.boolean.eq' }, { value: 'ne', label: 'workflow.config.if-else.boolean.ne' }, + ], + object: [ + { value: 'eq', label: 'workflow.config.if-else.boolean.eq' }, + { value: 'ne', label: 'workflow.config.if-else.boolean.ne' }, + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, + ], + file: [ + { value: 'empty', label: 'workflow.config.if-else.file.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.file.not_empty' }, + ], + // TODO:包含、不包含、全都是 + 'array[file]': [ + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, + // { value: 'eq', label: 'workflow.config.if-else.eq' }, + // { value: 'contains', label: 'workflow.config.if-else.contains' }, + // { value: 'not_contains', label: 'workflow.config.if-else.not_contains' }, + ], + 'array': [ + { value: 'contains', label: 'workflow.config.if-else.contains' }, + { value: 'not_contains', label: 'workflow.config.if-else.not_contains' }, + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, + ], + 'array[object]': [ + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, ] } @@ -247,6 +275,22 @@ const CaseList: FC = ({ form.setFieldValue([name, caseIndex, 'expressions', conditionIndex, 'right'], undefined); }; + const filterNumberOptions = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(vo => { + if (vo.children && vo.children?.length > 0) { + filterList.push({ + ...vo, + children: vo.children.filter(child => child.dataType === 'number') + }) + } else if (vo.dataType === 'number') { + filterList.push(vo) + } + }) + + return filterList + }, [options]) + return ( <> @@ -284,11 +328,15 @@ const CaseList: FC = ({ const currentCase = cases[caseIndex] || {}; const currentExpression = currentCase.expressions?.[conditionIndex] || {}; const currentOperator = currentExpression.operator; - const hideRightField = currentOperator === 'empty' || currentOperator === 'not_empty'; const leftFieldValue = currentExpression.left; const leftFieldOption = options.find(option => `{{${option.value}}}` === leftFieldValue); const leftFieldType = leftFieldOption?.dataType; - const operatorList = operatorsObj[leftFieldType || 'default'] || operatorsObj.default || []; + const hideRightField = currentOperator === 'empty' || currentOperator === 'not_empty' || leftFieldType === 'file' || leftFieldType === 'array[object]' || leftFieldType === 'array[file]'; + const operatorList = leftFieldType && operatorsObj[leftFieldType] + ? operatorsObj[leftFieldType] + : leftFieldType && leftFieldType?.includes('array') + ? operatorsObj.array + : operatorsObj.default; const inputType = leftFieldType === 'number' ? currentExpression.input_type : undefined; return ( @@ -312,7 +360,7 @@ const CaseList: FC = ({ = ({ {inputType === 'variable' ? vo.dataType === 'number')} + options={filterNumberOptions} allowClear={false} variant="borderless" size="small" /> : form.setFieldValue([name, caseIndex, 'expressions', conditionIndex, 'right'], value)} - /> + placeholder={t('common.pleaseEnter')} + variant="borderless" + className="rb:w-full!" + onChange={(value) => form.setFieldValue([name, caseIndex, 'expressions', conditionIndex, 'right'], value)} + /> } : ( - {leftFieldType === 'boolean' - ? + {['boolean', 'array[boolean]'].includes(leftFieldType as string) + ? : } diff --git a/web/src/views/Workflow/components/Properties/ConditionList/index.tsx b/web/src/views/Workflow/components/Properties/ConditionList/index.tsx index cad32e37..ddf92971 100644 --- a/web/src/views/Workflow/components/Properties/ConditionList/index.tsx +++ b/web/src/views/Workflow/components/Properties/ConditionList/index.tsx @@ -1,4 +1,4 @@ -import { type FC } from 'react' +import { type FC, useMemo } from 'react' import clsx from 'clsx' import { useTranslation } from 'react-i18next'; import { Form, Button, Select, InputNumber, Input, Divider, type SelectProps, Flex, Space, Row, Col } from 'antd' @@ -47,6 +47,18 @@ const operatorsObj: { [key: string]: SelectProps['options'] } = { { value: 'ne', label: 'workflow.config.if-else.boolean.ne' }, { value: 'empty', label: 'workflow.config.if-else.empty' }, { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, + ], + // 为空、不为空 + object: [ + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, + ], + // 包含、不包含、为空、不为空 + 'array': [ + { value: 'contains', label: 'workflow.config.if-else.contains' }, + { value: 'not_contains', label: 'workflow.config.if-else.not_contains' }, + { value: 'empty', label: 'workflow.config.if-else.empty' }, + { value: 'not_empty', label: 'workflow.config.if-else.not_empty' }, ] } @@ -81,6 +93,23 @@ const ConditionList: FC = ({ const currentValue = form.getFieldValue([parentName, 'logical_operator']); form.setFieldValue([parentName, 'logical_operator'], currentValue === 'and' ? 'or' : 'and'); }; + + const getNumVariable = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(variable => { + if (variable.dataType === 'number') { + filterList.push(variable) + } else if (variable.dataType === 'file') { + filterList.push({ + ...variable, + disabled: true, + children: variable.children?.filter(child => child.dataType === 'number') + }) + } + }) + + return filterList + }, [options]) return ( <> @@ -125,11 +154,19 @@ const ConditionList: FC = ({ const expressions = form.getFieldValue([parentName, 'expressions']) || []; const currentExpression = expressions[index] || {}; const currentOperator = currentExpression.operator; - const hideRightField = currentOperator === 'empty' || currentOperator === 'not_empty'; const leftFieldValue = currentExpression.left; const leftFieldOption = options.find(option => `{{${option.value}}}` === leftFieldValue); const leftFieldType = leftFieldOption?.dataType; - const operatorList = operatorsObj[leftFieldType || 'default'] || operatorsObj.default || []; + const hideRightField = currentOperator === 'empty' || currentOperator === 'not_empty' || ['array[object]', 'object'].includes(leftFieldType as string); + const operatorList = leftFieldType && ['array[object]', 'object'].includes(leftFieldType) + ? operatorsObj.object + : leftFieldType && ['array[boolean]', 'boolean'].includes(leftFieldType) + ? operatorsObj.boolean + : leftFieldType && operatorsObj[leftFieldType] + ? operatorsObj[leftFieldType] + : leftFieldType?.includes('array') + ? operatorsObj.array + : operatorsObj.default const inputType = leftFieldType === 'number' ? currentExpression.input_type : undefined; return ( = ({ - vo.value.includes('sys.') || + !['file', 'array[file]'].includes(vo.dataType) && + (vo.value.includes('sys.') || vo.value.includes('conv.') || vo.nodeData.type === 'loop' || - (vo.nodeData.cycle && vo.nodeData.cycle === selectedNode?.id) + (vo.nodeData.cycle && vo.nodeData.cycle === selectedNode?.id)) )} size="small" allowClear={false} @@ -163,7 +201,7 @@ const ConditionList: FC = ({ } diff --git a/web/src/views/Workflow/components/Properties/CycleVarsList/index.tsx b/web/src/views/Workflow/components/Properties/CycleVarsList/index.tsx index 09106a77..5d1138f0 100644 --- a/web/src/views/Workflow/components/Properties/CycleVarsList/index.tsx +++ b/web/src/views/Workflow/components/Properties/CycleVarsList/index.tsx @@ -6,6 +6,7 @@ import VariableSelect from '../VariableSelect' import type { Suggestion } from '../../Editor/plugin/AutocompletePlugin' import RadioGroupBtn from '../RadioGroupBtn' import { getChildNodeVariables } from '../hooks/useVariableList' +import CodeMirrorEditor from '@/components/CodeMirrorEditor'; interface CycleVar { name: string; @@ -28,11 +29,17 @@ const types = [ 'string', 'number', 'boolean', + 'object', 'array[string]', 'array[number]', 'array[boolean]', 'array[object]' ] +const object_placeholder = `# example +# { +# "name": "redbear", +# "age": 2 +# }` const CycleVarsList: FC = ({ value = [], @@ -144,6 +151,13 @@ const CycleVarsList: FC = ({ { value: true, label: 'True' }, { value: false, label: 'False' }]} /> + : currentType === 'object' + ? : ( = ({ : options }, [options, filterBooleanType]) + const namefilterOptions = useMemo(() => { + const filterList: Suggestion[] = []; + options.forEach(vo => { + if (vo.dataType === 'file') { + filterList.push({ + ...vo, + disabled: true, + children: vo.children?.filter(child => child.dataType !== 'boolean') + }) + } else if (vo.dataType !== 'array[file]') { + filterList.push(vo) + } + }) + + return filterList + }, [options]) + const valueFilterOptions = (type?: string) => { + let filterOptions: Suggestion[] = [] + options.forEach(vo => { + if (type === 'file' && vo.dataType === 'file') { + filterOptions.push({ + ...vo, + children: [] + }) + } else if (type === 'file' && vo.dataType === 'array[file]') { + filterOptions.push(vo) + } else if (vo.dataType === 'file') { + filterOptions.push({ + ...vo, + disabled: true + }) + } else { + filterOptions.push(vo) + } + }) + + return filterOptions + } + const getColumns = (remove: (index: number) => void): TableProps['columns'] => { const hasType = typeOptions.length > 0; const contentClassName = hasType ? 'rb:w-[110px]!' : "rb:w-[148px]!" @@ -57,7 +96,7 @@ const EditableTable: FC = ({ render: (_: any, __: TableRow, index: number) => ( !option.dataType.includes('file'))} + options={namefilterOptions} type="input" className={contentClassName} size={size} @@ -105,9 +144,7 @@ const EditableTable: FC = ({ > {(form) => { const currentType = form.getFieldValue([...Array.isArray(parentName) ? parentName : [parentName], index, 'type']); - const filteredOptions = currentType === 'file' - ? booleanFilterOptions.filter(option => option.dataType.includes('file')) - : booleanFilterOptions.filter(option => !option.dataType.includes('file')); + const filteredOptions = valueFilterOptions(currentType) return ( diff --git a/web/src/views/Workflow/components/Properties/HttpRequest/index.tsx b/web/src/views/Workflow/components/Properties/HttpRequest/index.tsx index a02549bd..b032016b 100644 --- a/web/src/views/Workflow/components/Properties/HttpRequest/index.tsx +++ b/web/src/views/Workflow/components/Properties/HttpRequest/index.tsx @@ -4,7 +4,7 @@ * @Last Modified by: ZhaoYing * @Last Modified time: 2026-04-02 17:17:06 */ -import { type FC, useRef, useState } from "react"; +import { type FC, useMemo, useRef, useState } from "react"; import { useTranslation } from 'react-i18next' import { Form, Row, Col, Select, Button, Divider, InputNumber, Switch, Input, Flex, Radio } from 'antd' import { CaretDownOutlined, CaretRightOutlined, SettingOutlined } from '@ant-design/icons'; @@ -84,6 +84,64 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an setCollapsed((prev: boolean) => !prev) } + const filterVariables = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(variable => { + if (['number', 'string'].includes(variable.dataType)) { + filterList.push(variable) + } else if (variable.dataType === 'file') { + filterList.push({ + ...variable, + disabled: true, + children: variable.children?.filter(child => ['number', 'string'].includes(child.dataType)) + }) + } + }) + + return filterList + }, [options]) + const filterVariablesWithFile = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(variable => { + if (['number', 'string', 'file', 'array[file]'].includes(variable.dataType)) { + filterList.push(variable) + } + }) + + return filterList + }, [options]) + const jsonRawFilterVariables = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(variable => { + if (['number', 'string', 'array[string]', 'array[number]'].includes(variable.dataType)) { + filterList.push(variable) + } else if (variable.dataType === 'file') { + filterList.push({ + ...variable, + disabled: true, + children: variable.children?.filter(child => ['number', 'string', 'file', 'array[string]', 'array[number]'].includes(child.dataType)) + }) + } + }) + + return filterList + }, [options]) + const fileFilterVariables = useMemo(() => { + const filterList: Suggestion[] = [] + options.forEach(variable => { + if (['array[file]'].includes(variable.dataType)) { + filterList.push(variable) + } else if (variable.dataType === 'file') { + filterList.push({ + ...variable, + children: [] + }) + } + }) + + return filterList + }, [options]) + return ( <> @@ -117,7 +175,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType === 'string' || vo.dataType === 'number')} + options={filterVariables} variant="outlined" type="input" size="small" @@ -134,7 +192,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an size="small" parentName="headers" title="HEADERS" - options={options.filter(vo => vo.dataType === 'string' || vo.dataType === 'number')} + options={filterVariables} /> @@ -143,7 +201,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an size="small" parentName="params" title="PARAMS" - options={options.filter(vo => vo.dataType === 'string' || vo.dataType === 'number')} + options={filterVariables} /> @@ -167,7 +225,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType === 'string' || vo.dataType === 'number' || vo.dataType.includes('file'))} + options={filterVariablesWithFile} typeOptions={[ { label: 'text', value: 'text' }, { label: 'file', value: 'file' } @@ -180,7 +238,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType === 'string' || vo.dataType === 'number')} + options={filterVariablesWithFile} filterBooleanType={true} /> @@ -190,7 +248,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType === 'string' || vo.dataType === 'number')} + options={jsonRawFilterVariables} isArray={false} title="JSON" titleVariant="borderless" @@ -204,7 +262,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType === 'string' || vo.dataType === 'number')} + options={jsonRawFilterVariables} isArray={false} title="RAW TEXT" titleVariant="borderless" @@ -220,7 +278,7 @@ const HttpRequest: FC<{ options: Suggestion[]; selectedNode?: any; graphRef?: an vo.dataType.includes('file'))} + options={fileFilterVariables} type="input" size="small" height={28} diff --git a/web/src/views/Workflow/components/Properties/ToolConfig/index.tsx b/web/src/views/Workflow/components/Properties/ToolConfig/index.tsx index ba1e9a5f..ce30ee8f 100644 --- a/web/src/views/Workflow/components/Properties/ToolConfig/index.tsx +++ b/web/src/views/Workflow/components/Properties/ToolConfig/index.tsx @@ -163,25 +163,45 @@ const ToolConfig: FC<{ options: Suggestion[]; }> = ({ form.setFieldsValue(inititalValue) } - const getNumberOptions = useMemo(() => { - const list: Suggestion[] = [] + // string -> string + // integer -> number + // number -> number + // boolean -> boolean【只能选true/false】 + // array -> array[file]/array[object]/array[string]/array[number]/array[boolean] + // object -> object/file + const getFilterOptions = (type: string) => { + const filterList: Suggestion[] = []; options.forEach(vo => { - if (vo.children && vo?.children?.length > 0) { - const filterChild = vo.children.filter(child => child.dataType === 'number') + if (vo.children && vo.children?.length > 0) { + const childOptions = vo.children?.filter(child => child.dataType === type || (type === 'integer' && child.dataType === 'number')) - if (filterChild.length > 0) { - list.push({ ...vo, disabled: vo.dataType !== 'number', children: filterChild }) - } else if (vo.dataType === 'number') { - list.push({ ...vo, children: [] }) + if (vo.dataType === type + || (type === 'integer' && vo.dataType === 'number') + || (type === 'array' && vo.dataType.includes(type)) + || (type === 'object' && vo.dataType === 'object') + ) { + filterList.push({ + ...vo, + children: childOptions + }) + } else if (childOptions.length > 0) { + filterList.push({ + ...vo, + disabled: true, + children: childOptions + }) } - } else if (vo.dataType === 'number') { - list.push({ ...vo }) + } else if (vo.dataType === type + || (type === 'integer' && vo.dataType === 'number') + || (type === 'array' && vo.dataType.includes(type)) + || (type === 'object' && vo.dataType === 'object')) { + filterList.push(vo) } }) - console.log('options', options, list) - return list - }, [options]) + + return filterList + } return ( <> @@ -205,7 +225,7 @@ const ToolConfig: FC<{ options: Suggestion[]; }> = ({ - {parameter.name} + {parameter.name} ({parameter.type})
@@ -220,21 +240,12 @@ const ToolConfig: FC<{ options: Suggestion[]; }> = ({ ?