Merge branch 'develop' into refactor/memory_search

# Conflicts: # api/app/core/memory/storage_services/search/__init__.py
2026-04-20 17:49:29 +08:00
parent 688503a1ca c50969dea4
commit dc3207b1d3
202 changed files with 6621 additions and 1690 deletions
--- a/api/app/core/memory/agent/utils/write_tools.py
+++ b/api/app/core/memory/agent/utils/write_tools.py
@@ -14,6 +14,7 @@ from dotenv import load_dotenv

 from app.core.logging_config import get_agent_logger
 from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs
+from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import _USER_PLACEHOLDER_NAMES
 from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import ExtractionOrchestrator
 from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import \
    memory_summary_generation
@@ -191,15 +192,37 @@ async def write(
            if success:
                logger.info("Successfully saved all data to Neo4j")
                
-                # 使用 Celery 异步任务触发聚类（不阻塞主流程）
                if all_entity_nodes:
+                    end_user_id = all_entity_nodes[0].end_user_id
+
+                    # Neo4j 写入完成后，用 PgSQL 权威 aliases 覆盖 Neo4j 用户实体
+                    try:
+                        from app.repositories.end_user_info_repository import EndUserInfoRepository
+                        if end_user_id:
+                            with get_db_context() as db_session:
+                                info = EndUserInfoRepository(db_session).get_by_end_user_id(uuid.UUID(end_user_id))
+                                pg_aliases = info.aliases if info and info.aliases else []
+                            if info is not None:
+                                # 将 Python 侧占位名集合作为参数传入，避免 Cypher 硬编码
+                                placeholder_names = list(_USER_PLACEHOLDER_NAMES)
+                                await neo4j_connector.execute_query(
+                                    """
+                                    MATCH (e:ExtractedEntity)
+                                    WHERE e.end_user_id = $end_user_id AND toLower(e.name) IN $placeholder_names
+                                    SET e.aliases = $aliases
+                                    """,
+                                    end_user_id=end_user_id, aliases=pg_aliases,
+                                    placeholder_names=placeholder_names,
+                                )
+                                logger.info(f"[AliasSync] Neo4j 用户实体 aliases 已用 PgSQL 权威源覆盖: {pg_aliases}")
+                    except Exception as sync_err:
+                        logger.warning(f"[AliasSync] PgSQL→Neo4j aliases 同步失败（不影响主流程）: {sync_err}")
+
+                    # 使用 Celery 异步任务触发聚类（不阻塞主流程）
                    try:
                        from app.tasks import run_incremental_clustering
                        
-                        end_user_id = all_entity_nodes[0].end_user_id
                        new_entity_ids = [e.id for e in all_entity_nodes]
-                        
-                        # 异步提交 Celery 任务
                        task = run_incremental_clustering.apply_async(
                            kwargs={
                                "end_user_id": end_user_id,
@@ -207,7 +230,6 @@ async def write(
                                "llm_model_id": str(memory_config.llm_model_id) if memory_config.llm_model_id else None,
                                "embedding_model_id": str(memory_config.embedding_model_id) if memory_config.embedding_model_id else None,
                            },
-                            # 设置任务优先级（低优先级，不影响主业务）
                            priority=3,
                        )
                        logger.info(
@@ -215,7 +237,6 @@ async def write(
                            f"task_id={task.id}, end_user_id={end_user_id}, entity_count={len(new_entity_ids)}"
                        )
                    except Exception as e:
-                        # 聚类任务提交失败不影响主流程
                        logger.error(f"[Clustering] 提交聚类任务失败（不影响主流程）: {e}", exc_info=True)
                
                break
--- a/api/app/core/memory/models/init.py
+++ b/api/app/core/memory/models/init.py
@@ -61,9 +61,9 @@ from app.core.memory.models.triplet_models import (
 # User metadata models
 from app.core.memory.models.metadata_models import (
    UserMetadata,
-    UserMetadataBehavioralHints,
    UserMetadataProfile,
    MetadataExtractionResponse,
+    MetadataFieldChange,
 )

 # Ontology scenario models (LLM extracted from scenarios)
@@ -133,9 +133,9 @@ __all__ = [
    "Triplet",
    "TripletExtractionResponse",
    "UserMetadata",
-    "UserMetadataBehavioralHints",
    "UserMetadataProfile",
    "MetadataExtractionResponse",
+    "MetadataFieldChange",
    # Ontology models
    "OntologyClass",
    "OntologyExtractionResponse",
--- a/api/app/core/memory/models/metadata_models.py
+++ b/api/app/core/memory/models/metadata_models.py
@@ -4,7 +4,7 @@ Independent from triplet_models.py - these models are used by the
 standalone metadata extraction pipeline (post-dedup async Celery task).
 """

-from typing import List
+from typing import List, Literal, Optional

 from pydantic import BaseModel, ConfigDict, Field

@@ -13,8 +13,8 @@ class UserMetadataProfile(BaseModel):
    """用户画像信息"""

    model_config = ConfigDict(extra="ignore")
-    role: str = Field(default="", description="用户职业或角色")
-    domain: str = Field(default="", description="用户所在领域")
+    role: List[str] = Field(default_factory=list, description="用户职业或角色")
+    domain: List[str] = Field(default_factory=list, description="用户所在领域")
    expertise: List[str] = Field(
        default_factory=list, description="用户擅长的技能或工具"
    )
@@ -23,31 +23,37 @@ class UserMetadataProfile(BaseModel):
    )


-class UserMetadataBehavioralHints(BaseModel):
-    """行为偏好"""
-
-    model_config = ConfigDict(extra="ignore")
-    learning_stage: str = Field(default="", description="学习阶段")
-    preferred_depth: str = Field(default="", description="偏好深度")
-    tone_preference: str = Field(default="", description="语气偏好")
-
-
 class UserMetadata(BaseModel):
    """用户元数据顶层结构"""

    model_config = ConfigDict(extra="ignore")
    profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile)
-    behavioral_hints: UserMetadataBehavioralHints = Field(
-        default_factory=UserMetadataBehavioralHints
+
+
+class MetadataFieldChange(BaseModel):
+    """单个元数据字段的变更操作"""
+
+    model_config = ConfigDict(extra="ignore")
+    field_path: str = Field(
+        description="字段路径，用点号分隔，如 'profile.role'、'profile.expertise'"
+    )
+    action: Literal["set", "remove"] = Field(
+        description="操作类型：'set' 表示新增或修改，'remove' 表示移除"
+    )
+    value: Optional[str] = Field(
+        default=None,
+        description="字段的值。action='set' 时必填，填单个要新增的元素；action='remove' 时填要移除的元素值（所有字段均为列表类型）"
    )
-    knowledge_tags: List[str] = Field(default_factory=list, description="知识标签")


 class MetadataExtractionResponse(BaseModel):
-    """元数据提取 LLM 响应结构"""
+    """元数据提取 LLM 响应结构（增量模式）"""

    model_config = ConfigDict(extra="ignore")
-    user_metadata: UserMetadata = Field(default_factory=UserMetadata)
+    metadata_changes: List[MetadataFieldChange] = Field(
+        default_factory=list,
+        description="元数据的增量变更列表，每项描述一个字段的新增、修改或移除操作",
+    )
    aliases_to_add: List[str] = Field(
        default_factory=list,
        description="本次新发现的用户别名（用户自我介绍或他人对用户的称呼）",
--- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py
+++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py
@@ -82,51 +82,38 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):
            canonical.connect_strength = next(iter(pair))

    # 别名合并（去重保序，使用标准化工具）
+    # 用户实体的 aliases 由 PgSQL end_user_info 作为唯一权威源，去重合并时不修改
    try:
        canonical_name = (getattr(canonical, "name", "") or "").strip()
-        incoming_name = (getattr(ent, "name", "") or "").strip()
-        
-        # 收集所有需要合并的别名
-        all_aliases = []
-        
-        # 1. 添加canonical现有的别名
-        existing = getattr(canonical, "aliases", []) or []
-        all_aliases.extend(existing)
-        
-        # 2. 添加incoming实体的名称（如果不同于canonical的名称）
-        if incoming_name and incoming_name != canonical_name:
-            all_aliases.append(incoming_name)
-        
-        # 3. 添加incoming实体的所有别名
-        incoming = getattr(ent, "aliases", []) or []
-        all_aliases.extend(incoming)
-        
-        # 4. 标准化并去重（优先使用alias_utils工具函数）
-        try:
-            from app.core.memory.utils.alias_utils import normalize_aliases
-            canonical.aliases = normalize_aliases(canonical_name, all_aliases)
-        except Exception:
-            # 如果导入失败，使用增强的去重逻辑
-            seen_normalized = set()
-            unique_aliases = []
+        if canonical_name.lower() not in _USER_PLACEHOLDER_NAMES:
+            incoming_name = (getattr(ent, "name", "") or "").strip()
            
-            for alias in all_aliases:
-                if not alias:
-                    continue
-                
-                alias_stripped = str(alias).strip()
-                if not alias_stripped or alias_stripped == canonical_name:
-                    continue
-                
-                # 标准化：转小写用于去重判断
-                alias_normalized = alias_stripped.lower()
-                
-                if alias_normalized not in seen_normalized:
-                    seen_normalized.add(alias_normalized)
-                    unique_aliases.append(alias_stripped)
+            # 收集所有需要合并的别名，过滤掉用户占位名避免污染非用户实体
+            all_aliases = list(getattr(canonical, "aliases", []) or [])
+            if incoming_name and incoming_name != canonical_name and incoming_name.lower() not in _USER_PLACEHOLDER_NAMES:
+                all_aliases.append(incoming_name)
+            all_aliases.extend(
+                a for a in (getattr(ent, "aliases", []) or [])
+                if a and a.strip().lower() not in _USER_PLACEHOLDER_NAMES
+            )
            
-            # 排序并赋值
-            canonical.aliases = sorted(unique_aliases)
+            try:
+                from app.core.memory.utils.alias_utils import normalize_aliases
+                canonical.aliases = normalize_aliases(canonical_name, all_aliases)
+            except Exception:
+                seen_normalized = set()
+                unique_aliases = []
+                for alias in all_aliases:
+                    if not alias:
+                        continue
+                    alias_stripped = str(alias).strip()
+                    if not alias_stripped or alias_stripped == canonical_name:
+                        continue
+                    alias_normalized = alias_stripped.lower()
+                    if alias_normalized not in seen_normalized:
+                        seen_normalized.add(alias_normalized)
+                        unique_aliases.append(alias_stripped)
+                canonical.aliases = sorted(unique_aliases)
    except Exception:
        pass

@@ -733,66 +720,37 @@ def fuzzy_match(


    def _merge_entities_with_aliases(canonical: ExtractedEntityNode, losing: ExtractedEntityNode):
-        """ 模糊匹配中的实体合并。
+        """模糊匹配中的实体合并（别名部分）。
        
-        合并策略：
-        1. 保留canonical的主名称不变
-        2. 将losing的主名称添加为alias（如果不同）
-        3. 合并两个实体的所有aliases
-        4. 自动去重（case-insensitive）并排序
-        
-        Args:
-            canonical: 规范实体（保留）
-            losing: 被合并实体（删除）
-            
-        Note:
-            使用alias_utils.normalize_aliases进行标准化去重
+        用户实体的 aliases 由 PgSQL end_user_info 作为唯一权威源，跳过合并。
        """
-        # 获取规范实体的名称
        canonical_name = (getattr(canonical, "name", "") or "").strip()
+        if canonical_name.lower() in _USER_PLACEHOLDER_NAMES:
+            return
+
        losing_name = (getattr(losing, "name", "") or "").strip()
        
-        # 收集所有需要合并的别名
-        all_aliases = []
-        
-        # 1. 添加canonical现有的别名
-        current_aliases = getattr(canonical, "aliases", []) or []
-        all_aliases.extend(current_aliases)
-        
-        # 2. 添加losing实体的名称（如果不同于canonical的名称）
+        all_aliases = list(getattr(canonical, "aliases", []) or [])
        if losing_name and losing_name != canonical_name:
            all_aliases.append(losing_name)
+        all_aliases.extend(getattr(losing, "aliases", []) or [])
        
-        # 3. 添加losing实体的所有别名
-        losing_aliases = getattr(losing, "aliases", []) or []
-        all_aliases.extend(losing_aliases)
-        
-        # 4. 标准化并去重（使用标准化后的字符串进行去重）
        try:
            from app.core.memory.utils.alias_utils import normalize_aliases
            canonical.aliases = normalize_aliases(canonical_name, all_aliases)
        except Exception:
-            # 如果导入失败，使用增强的去重逻辑
-            # 使用标准化后的字符串作为key进行去重
            seen_normalized = set()
            unique_aliases = []
-            
            for alias in all_aliases:
                if not alias:
                    continue
-                
                alias_stripped = str(alias).strip()
                if not alias_stripped or alias_stripped == canonical_name:
                    continue
-                
-                # 标准化：转小写用于去重判断
                alias_normalized = alias_stripped.lower()
-                
                if alias_normalized not in seen_normalized:
                    seen_normalized.add(alias_normalized)
                    unique_aliases.append(alias_stripped)
-            
-            # 排序并赋值
            canonical.aliases = sorted(unique_aliases)
    
    # ========== 主循环：遍历所有实体对进行模糊匹配 ==========
--- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py
+++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py
@@ -1391,18 +1391,18 @@ class ExtractionOrchestrator:
        """
        将本轮提取的用户别名同步到 end_user 和 end_user_info 表。

-        注意：此方法在 Neo4j 写入之前调用，因此不能依赖 Neo4j 作为别名的权威数据源。
-        改为直接使用内存中去重后的 entity_nodes 的 aliases，与 PgSQL 已有的 aliases 合并。
+        PgSQL end_user_info.aliases 是用户别名的唯一权威源。
+        此方法仅将本轮 LLM 从对话中新提取的别名增量追加到 PgSQL，
+        不再从 Neo4j 二层去重合并历史别名，避免脏数据反向污染 PgSQL。

        策略：
-        1. 从内存中的 entity_nodes 提取本轮用户别名（current_aliases）
-        2. 从去重后的 entity_nodes 中提取完整别名（含 Neo4j 二层去重合并的历史别名）
-        3. 从 PgSQL end_user_info 读取已有的 aliases（db_aliases）
-        4. 合并 db_aliases + deduped_aliases + current_aliases，去重保序
-        5. 写回 PgSQL
+        1. 从本轮对话原始发言中提取用户别名（current_aliases）
+        2. 从 PgSQL end_user_info 读取已有的 aliases（db_aliases）
+        3. 合并 db_aliases + current_aliases，去重保序
+        4. 写回 PgSQL

        Args:
-            entity_nodes: 去重后的实体节点列表（内存中，含二层去重合并结果）
+            entity_nodes: 去重后的实体节点列表（内存中）
            dialog_data_list: 对话数据列表
        """
        try:
@@ -1418,11 +1418,6 @@ class ExtractionOrchestrator:
            # 1. 提取本轮对话的用户别名（保持 LLM 提取的原始顺序，不排序）
            current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list)

-            # 1.5 从去重后的 entity_nodes 中提取完整别名
-            # 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 中，
-            # 这里提取出来确保 PgSQL 与 Neo4j 的别名保持同步
-            deduped_aliases = self._extract_deduped_entity_aliases(entity_nodes)
-
            # 1.6 从 Neo4j 查询已有的 AI 助手别名，作为额外的排除源
            # （防止 LLM 未提取出 AI 助手实体时，AI 别名泄漏到用户别名中）
            neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id)
@@ -1434,19 +1429,12 @@ class ExtractionOrchestrator:
                ]
                if len(current_aliases) < before_count:
                    logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名")
-                # 同样过滤 deduped_aliases
-                deduped_aliases = [
-                    a for a in deduped_aliases
-                    if a.strip().lower() not in neo4j_assistant_aliases
-                ]

-            if not current_aliases and not deduped_aliases:
+            if not current_aliases:
                logger.debug(f"本轮未提取到用户别名，跳过同步: end_user_id={end_user_id}")
                return

            logger.info(f"本轮对话提取的 aliases: {current_aliases}")
-            if deduped_aliases:
-                logger.info(f"去重后实体的完整 aliases（含历史）: {deduped_aliases}")

            # 2. 同步到数据库
            end_user_uuid = uuid.UUID(end_user_id)
@@ -1457,21 +1445,15 @@ class ExtractionOrchestrator:
                    logger.warning(f"未找到 end_user_id={end_user_id} 的用户记录")
                    return

-                # 3. 从 PgSQL 读取已有 aliases 并与本轮合并
+                # 3. 从 PgSQL 读取已有 aliases 并与本轮新增合并
                info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
                db_aliases = (info.aliases if info and info.aliases else [])
                # 过滤掉占位名称
                db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES]

-                # 合并：已有 + 去重后完整别名 + 本轮新增，去重保序
+                # 合并：PgSQL 已有 + 本轮新增，去重保序（不再合并 Neo4j 历史别名）
                merged_aliases = list(db_aliases)
                seen_lower = {a.strip().lower() for a in merged_aliases}
-                # 先合并去重后实体的完整别名（含 Neo4j 历史别名）
-                for alias in deduped_aliases:
-                    if alias.strip().lower() not in seen_lower:
-                        merged_aliases.append(alias)
-                        seen_lower.add(alias.strip().lower())
-                # 再合并本轮新提取的别名
                for alias in current_aliases:
                    if alias.strip().lower() not in seen_lower:
                        merged_aliases.append(alias)
@@ -1505,9 +1487,7 @@ class ExtractionOrchestrator:
                        info.aliases = merged_aliases
                        logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}")
                else:
-                    first_alias = current_aliases[0].strip() if current_aliases else (
-                        deduped_aliases[0].strip() if deduped_aliases else ""
-                    )
+                    first_alias = current_aliases[0].strip() if current_aliases else ""
                    # 确保 first_alias 不是占位名称
                    if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES:
                        db.add(EndUserInfo(
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/metadata_extractor.py
@@ -118,7 +118,7 @@ class MetadataExtractor:
        existing_aliases: Optional[List[str]] = None,
    ) -> Optional[tuple]:
        """
-        对筛选后的 statement 列表调用 LLM 提取元数据和用户别名。
+        对筛选后的 statement 列表调用 LLM 提取元数据增量变更和用户别名。

        Args:
            statements: 用户发言的 statement 文本列表
@@ -126,7 +126,8 @@ class MetadataExtractor:
            existing_aliases: 数据库已有的用户别名列表（可选）

        Returns:
-            (UserMetadata, List[str], List[str]) tuple: (metadata, aliases_to_add, aliases_to_remove) on success, None on failure
+            (List[MetadataFieldChange], List[str], List[str]) tuple:
+            (metadata_changes, aliases_to_add, aliases_to_remove) on success, None on failure
        """
        if not statements:
            return None
@@ -160,12 +161,12 @@ class MetadataExtractor:
            )

            if response:
-                metadata = response.user_metadata if response.user_metadata else None
+                changes = response.metadata_changes if response.metadata_changes else []
                to_add = response.aliases_to_add if response.aliases_to_add else []
                to_remove = (
                    response.aliases_to_remove if response.aliases_to_remove else []
                )
-                return metadata, to_add, to_remove
+                return changes, to_add, to_remove

            logger.warning("LLM 返回的响应为空")
            return None
--- a/api/app/core/memory/storage_services/search/keyword_search.py
+++ b/api/app/core/memory/storage_services/search/keyword_search.py
--- a/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/extract_user_metadata.jinja2
@@ -1,5 +1,5 @@
 ===Task===
-Extract user metadata from the following conversation statements spoken by the user.
+Extract user metadata changes from the following conversation statements spoken by the user.

 {% if language == "zh" %}
 **"三度原则"判断标准：**
@@ -10,28 +10,36 @@ Extract user metadata from the following conversation statements spoken by the u
 **提取规则：**
 - **只提取关于"用户本人"的画像信息**，忽略用户提到的第三方人物（如朋友、同事、家人）的信息
 - 仅提取文本中明确提到的信息，不要推测
-- 如果文本中没有可提取的用户画像信息，返回空的 user_metadata 对象
 - **输出语言必须与输入文本的语言一致**（输入中文则输出中文值，输入英文则输出英文值）

+**增量模式（重要）：**
+你只需要输出**本次对话引起的变更操作**，不要输出完整的元数据。每个变更是一个对象，包含：
+- `field_path`：字段路径，用点号分隔（如 `profile.role`、`profile.expertise`）
+- `action`：操作类型
+  * `set`：新增或修改一个字段的值
+  * `remove`：移除一个字段的值
+- `value`：字段的新值（`action="set"` 时必填，`action="remove"` 时填要移除的元素值）
+  * 所有字段均为列表类型，每个元素一条变更记录
+
+**判断规则：**
+- 用户提到新信息 → `action="set"`，填入新值
+- 用户明确否定已有信息（如"我不再做老师了"、"我已经不学Python了"）→ `action="remove"`，`value` 填要移除的元素值
+- 如果本次对话没有任何可提取的变更，返回空的 `metadata_changes` 数组 `[]`
+- **不要为未被提及的字段生成任何变更操作**
+
 {% if existing_metadata %}
-**重要：合并已有元数据**
-下方提供了数据库中已有的用户元数据。请结合用户最新发言，输出**合并后的完整元数据**：
-- 如果用户明确否定了已有信息（如"我不再教高中物理了"），在输出中**移除**该信息
-- 如果用户提到了新信息，**添加**到对应字段中
-- 如果已有信息未被用户否定，**保留**在输出中
-- 标量字段（如 role、domain）：如果用户提到了新值，用新值替换；否则保留已有值
-- 最终输出应该是完整的、合并后的元数据，不是增量
+**已有元数据（仅供参考，用于判断是否需要变更）：**
+请对比已有数据和用户最新发言，只输出差异部分的变更操作。
+- 如果用户说的信息和已有数据一致，不需要输出变更
+- 如果用户否定了已有数据中的某个值，输出 `remove` 操作
+- 如果用户提到了新信息，输出 `set` 操作
 {% endif %}

 **字段说明：**
-- profile.role：用户的职业或角色，如 教师、医生、后端工程师
-- profile.domain：用户所在领域，如 教育、医疗、软件开发
-- profile.expertise：用户擅长的技能或工具（通用，不限于编程），如 Python、心理咨询、高中物理
-- profile.interests：用户主动表达兴趣的话题或领域标签
-- behavioral_hints.learning_stage：学习阶段（初学者/中级/高级）
-- behavioral_hints.preferred_depth：偏好深度（概览/技术细节/深入探讨）
-- behavioral_hints.tone_preference：语气偏好（轻松随意/专业简洁/学术严谨）
-- knowledge_tags：用户涉及的知识领域标签
+- profile.role：用户的职业或角色（列表），如 教师、医生、后端工程师，一个人可以有多个角色
+- profile.domain：用户所在领域（列表），如 教育、医疗、软件开发，一个人可以涉及多个领域
+- profile.expertise：用户擅长的技能或工具（列表），如 Python、心理咨询、高中物理
+- profile.interests：用户主动表达兴趣的话题或领域标签（列表）

 **用户别名变更（增量模式）：**
 - **aliases_to_add**：本次新发现的用户别名，包括：
@@ -43,7 +51,6 @@ Extract user metadata from the following conversation statements spoken by the u
 - **aliases_to_remove**：用户明确否认的别名，包括：
  * 用户说"我不叫XX了"、"别叫我XX"、"我改名了，不叫XX" → 将 XX 放入此数组
  * **严格限制**：只将用户原文中**逐字提到**的被否认名字放入，不要推断关联的其他别名
-  * 例如：用户说"我不叫陈小刀了" → 只移除"陈小刀"，不要移除"陈哥"、"老陈"等未被提及的别名
  * 如果没有要移除的别名，返回空数组 `[]`
 {% if existing_aliases %}
 - 已有别名：{{ existing_aliases | tojson }}（仅供参考，不需要在输出中重复）
@@ -57,28 +64,36 @@ Extract user metadata from the following conversation statements spoken by the u
 **Extraction rules:**
 - **Only extract profile information about the user themselves**, ignore information about third parties (friends, colleagues, family) mentioned by the user
 - Only extract information explicitly mentioned in the text, do not speculate
-- If no user profile information can be extracted, return an empty user_metadata object
 - **Output language must match the input text language**

+**Incremental mode (important):**
+You should only output **the change operations caused by this conversation**, not the complete metadata. Each change is an object containing:
+- `field_path`: Field path separated by dots (e.g. `profile.role`, `profile.expertise`)
+- `action`: Operation type
+  * `set`: Add or update a field value
+  * `remove`: Remove a field value
+- `value`: The element value — required when `action="set"` (the value to add); when `action="remove"`, fill in the element value to remove
+  * All fields are list types, one change record per element
+
+**Decision rules:**
+- User mentions new information → `action="set"`, fill in the new value
+- User explicitly negates existing info (e.g. "I'm no longer a teacher", "I stopped learning Python") → `action="remove"`, `value` is the element to remove
+- If this conversation has no extractable changes, return an empty `metadata_changes` array `[]`
+- **Do NOT generate any change operations for fields not mentioned in the conversation**
+
 {% if existing_metadata %}
-**Important: Merge with existing metadata**
-Existing user metadata from the database is provided below. Combine with the user's latest statements to output the **complete merged metadata**:
-- If the user explicitly negates existing info (e.g. "I no longer teach high school physics"), **remove** it from output
-- If the user mentions new info, **add** it to the corresponding field
-- If existing info is not negated by the user, **keep** it in the output
-- Scalar fields (e.g. role, domain): replace with new value if user mentions one; otherwise keep existing
-- The final output should be the complete, merged metadata — not an incremental update
+**Existing metadata (for reference only, to determine if changes are needed):**
+Compare existing data with the user's latest statements, and only output change operations for the differences.
+- If the user's statement matches existing data, no change is needed
+- If the user negates a value in existing data, output a `remove` operation
+- If the user mentions new information, output a `set` operation
 {% endif %}

 **Field descriptions:**
-- profile.role: User's occupation or role, e.g. teacher, doctor, software engineer
-- profile.domain: User's domain, e.g. education, healthcare, software development
-- profile.expertise: User's skills or tools (general, not limited to programming)
-- profile.interests: Topics or domain tags the user actively expressed interest in
-- behavioral_hints.learning_stage: Learning stage (beginner/intermediate/advanced)
-- behavioral_hints.preferred_depth: Preferred depth (overview/detailed/deep dive)
-- behavioral_hints.tone_preference: Tone preference (casual/professional/academic)
-- knowledge_tags: Knowledge domain tags related to the user
+- profile.role: User's occupation or role (list), e.g. teacher, doctor, software engineer. A person can have multiple roles
+- profile.domain: User's domain (list), e.g. education, healthcare, software development. A person can span multiple domains
+- profile.expertise: User's skills or tools (list), e.g. Python, counseling, physics
+- profile.interests: Topics or domain tags the user actively expressed interest in (list)

 **User alias changes (incremental mode):**
 - **aliases_to_add**: Newly discovered user aliases from this conversation, including:
@@ -90,7 +105,6 @@ Existing user metadata from the database is provided below. Combine with the use
 - **aliases_to_remove**: Aliases the user explicitly denies, including:
  * User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array
  * **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases
-  * Example: User says "I'm not called John anymore" → only remove "John", do NOT remove "Johnny", "J" or other related aliases not mentioned
  * If no aliases to remove, return empty array `[]`
 {% if existing_aliases %}
 - Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output)
@@ -113,20 +127,11 @@ Existing user metadata from the database is provided below. Combine with the use
 Return a JSON object with the following structure:
 ```json
 {
-  "user_metadata": {
-    "profile": {
-      "role": "",
-      "domain": "",
-      "expertise": [],
-      "interests": []
-    },
-    "behavioral_hints": {
-      "learning_stage": "",
-      "preferred_depth": "",
-      "tone_preference": ""
-    },
-    "knowledge_tags": []
-  },
+  "metadata_changes": [
+    {"field_path": "profile.role", "action": "set", "value": "后端工程师"},
+    {"field_path": "profile.expertise", "action": "set", "value": "Python"},
+    {"field_path": "profile.expertise", "action": "remove", "value": "Java"}
+  ],
  "aliases_to_add": [],
  "aliases_to_remove": []
 }