From 7890970a39ce8e459c8ec17f1e9011fdb5ed4eef Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 3 Apr 2026 10:57:30 +0800 Subject: [PATCH 1/4] feat(memory): prevent cross-role alias contamination between user and AI entities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add speaker context to triplet extraction prompt to distinguish alias ownership - Add explicit examples and rules in extract_triplet.jinja2 for user vs AI alias attribution - Introduce cross-role merge protection in dedup (accurate, fuzzy, and LLM stages) - Normalize special entity names (用户/AI助手) before deduplication - Add clean_cross_role_aliases() to sanitize aliases before Neo4j write - Refactor _update_end_user_other_name to merge aliases from PgSQL instead of Neo4j - Filter AI assistant aliases from user alias extraction in orchestrator --- .../core/memory/agent/utils/write_tools.py | 25 ++++ .../deduplication/deduped_and_disamb.py | 136 ++++++++++++++++++ .../extraction_orchestrator.py | 134 +++++++++++++---- .../triplet_extraction.py | 1 + .../core/memory/utils/prompt/prompt_utils.py | 7 +- .../prompt/prompts/extract_triplet.jinja2 | 127 +++++++++++++++- 6 files changed, 396 insertions(+), 34 deletions(-) diff --git a/api/app/core/memory/agent/utils/write_tools.py b/api/app/core/memory/agent/utils/write_tools.py index 1f437973..22b8138d 100644 --- a/api/app/core/memory/agent/utils/write_tools.py +++ b/api/app/core/memory/agent/utils/write_tools.py @@ -152,6 +152,31 @@ async def write( # Step 3: Save all data to Neo4j database step_start = time.time() + # Neo4j 写入前:清洗用户/AI助手实体之间的别名交叉污染 + # 从 Neo4j 查询已有的 AI 助手别名,与本轮实体中的 AI 助手别名合并, + # 确保用户实体的 aliases 不包含 AI 助手的名字 + try: + from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( + clean_cross_role_aliases, + ) + neo4j_assistant_aliases = set() + if all_entity_nodes: + _eu_id = all_entity_nodes[0].end_user_id + if _eu_id: + _cypher = """ + MATCH (e:ExtractedEntity) + WHERE e.end_user_id = $end_user_id AND e.name IN ['AI助手', '助手', 'AI Assistant', 'Assistant'] + RETURN e.aliases AS aliases + """ + _result = await neo4j_connector.execute_query(_cypher, end_user_id=_eu_id) + for _record in (_result or []): + for _alias in (_record.get('aliases') or []): + neo4j_assistant_aliases.add(_alias.strip().lower()) + clean_cross_role_aliases(all_entity_nodes, external_assistant_aliases=neo4j_assistant_aliases) + logger.info(f"Neo4j 写入前别名清洗完成,AI助手别名排除集大小: {len(neo4j_assistant_aliases)}") + except Exception as e: + logger.warning(f"Neo4j 写入前别名清洗失败(不影响主流程): {e}") + # 添加死锁重试机制 max_retries = 3 retry_delay = 1 # 秒 diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index 622f6e05..ae906aa8 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -198,6 +198,124 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode): except Exception: pass +# 用户和AI助手的占位名称集合(用于名称标准化) +_USER_PLACEHOLDER_NAMES = {"用户", "我", "user", "i"} +_ASSISTANT_PLACEHOLDER_NAMES = {"ai助手", "助手", "人工智能助手", "智能助手", "智能体", "ai assistant", "assistant"} + +# 标准化后的规范名称和类型 +_CANONICAL_USER_NAME = "用户" +_CANONICAL_USER_TYPE = "用户" +_CANONICAL_ASSISTANT_NAME = "AI助手" +_CANONICAL_ASSISTANT_TYPE = "Agent" + +# 用户和AI助手的所有可能名称(用于判断实体是否为特殊角色实体) +_ALL_USER_NAMES = _USER_PLACEHOLDER_NAMES +_ALL_ASSISTANT_NAMES = _ASSISTANT_PLACEHOLDER_NAMES + + +def _is_user_entity(ent: ExtractedEntityNode) -> bool: + """判断实体是否为用户实体(name 或 entity_type 匹配)""" + name = (getattr(ent, "name", "") or "").strip().lower() + etype = (getattr(ent, "entity_type", "") or "").strip() + return name in _ALL_USER_NAMES or etype == _CANONICAL_USER_TYPE + + +def _is_assistant_entity(ent: ExtractedEntityNode) -> bool: + """判断实体是否为AI助手实体(name 或 entity_type 匹配)""" + name = (getattr(ent, "name", "") or "").strip().lower() + etype = (getattr(ent, "entity_type", "") or "").strip() + return name in _ALL_ASSISTANT_NAMES or etype == _CANONICAL_ASSISTANT_TYPE + + +def _would_merge_cross_role(a: ExtractedEntityNode, b: ExtractedEntityNode) -> bool: + """判断两个实体的合并是否会跨越用户/AI助手角色边界。 + + 用户实体和AI助手实体永远不应该被合并在一起。 + 如果一方是用户实体、另一方是AI助手实体,返回 True(阻止合并)。 + """ + a_is_user = _is_user_entity(a) + a_is_assistant = _is_assistant_entity(a) + b_is_user = _is_user_entity(b) + b_is_assistant = _is_assistant_entity(b) + + # 用户 + AI助手 → 阻止 + if (a_is_user and b_is_assistant) or (a_is_assistant and b_is_user): + return True + return False + + +def _normalize_special_entity_names( + entity_nodes: List[ExtractedEntityNode], +) -> None: + """标准化用户和AI助手实体的名称和类型。 + + 多轮对话中,LLM 对同一角色可能使用不同的名称变体(如"用户"/"我"/"User", + "AI助手"/"助手"/"Assistant"),导致精确匹配无法合并。 + 此函数在去重前将这些变体统一为规范名称,并强制绑定 entity_type,确保: + - name="用户" 的实体 entity_type 一定为 "用户" + - name="AI助手" 的实体 entity_type 一定为 "Agent" + + Args: + entity_nodes: 实体节点列表(原地修改) + """ + for ent in entity_nodes: + name = (getattr(ent, "name", "") or "").strip() + name_lower = name.lower() + + if name_lower in _USER_PLACEHOLDER_NAMES: + ent.name = _CANONICAL_USER_NAME + ent.entity_type = _CANONICAL_USER_TYPE + elif name_lower in _ASSISTANT_PLACEHOLDER_NAMES: + ent.name = _CANONICAL_ASSISTANT_NAME + ent.entity_type = _CANONICAL_ASSISTANT_TYPE + + +def clean_cross_role_aliases( + entity_nodes: List[ExtractedEntityNode], + external_assistant_aliases: set = None, +) -> None: + """清洗用户实体和AI助手实体之间的别名交叉污染。 + + 在 Neo4j 写入前调用,确保: + - 用户实体的 aliases 不包含 AI 助手的别名 + - AI 助手实体的 aliases 不包含用户的别名 + + Args: + entity_nodes: 实体节点列表(原地修改) + external_assistant_aliases: 外部传入的 AI 助手别名集合(如从 Neo4j 查询), + 与本轮实体中的 AI 助手别名合并使用 + """ + # 收集本轮 AI 助手实体的所有别名 + assistant_aliases = set(external_assistant_aliases or set()) + user_aliases = set() + + for ent in entity_nodes: + if _is_assistant_entity(ent): + for alias in (getattr(ent, "aliases", []) or []): + assistant_aliases.add(alias.strip().lower()) + elif _is_user_entity(ent): + for alias in (getattr(ent, "aliases", []) or []): + user_aliases.add(alias.strip().lower()) + + # 从用户实体的 aliases 中移除 AI 助手别名 + if assistant_aliases: + for ent in entity_nodes: + if _is_user_entity(ent): + original = getattr(ent, "aliases", []) or [] + cleaned = [a for a in original if a.strip().lower() not in assistant_aliases] + if len(cleaned) < len(original): + ent.aliases = cleaned + + # 从 AI 助手实体的 aliases 中移除用户别名 + if user_aliases: + for ent in entity_nodes: + if _is_assistant_entity(ent): + original = getattr(ent, "aliases", []) or [] + cleaned = [a for a in original if a.strip().lower() not in user_aliases] + if len(cleaned) < len(original): + ent.aliases = cleaned + + def accurate_match( entity_nodes: List[ExtractedEntityNode] ) -> Tuple[List[ExtractedEntityNode], Dict[str, str], Dict[str, Dict]]: @@ -261,6 +379,10 @@ def accurate_match( canonical = alias_index.get((ent_uid, ent_name)) # 确保不是自身 if canonical is not None and canonical.id != ent.id: + # 保护:禁止跨角色合并(用户实体和AI助手实体不能互相合并) + if _would_merge_cross_role(canonical, ent): + i += 1 + continue _merge_attribute(canonical, ent) id_redirect[ent.id] = canonical.id for k, v in list(id_redirect.items()): @@ -704,6 +826,11 @@ def fuzzy_match( # 条件A(快速通道):alias_match_merge = True # 条件B(标准通道):s_name ≥ tn AND s_type ≥ type_threshold AND overall ≥ tover if alias_match_merge or (s_name >= tn and s_type >= type_threshold and overall >= tover): + # 保护:禁止跨角色合并(用户实体和AI助手实体不能互相合并) + if _would_merge_cross_role(a, b): + j += 1 + continue + # ========== 第六步:执行实体合并 ========== # 6.1 合并别名 @@ -813,6 +940,12 @@ async def LLM_decision( # 决策中包含去重和消歧的功能 b = entity_by_id.get(losing_id) if not a or not b: # 若不存在 a 或 b,可能已在精确或模糊阶段合并,在之前阶段合并之后,不会再处理但是处于审计的目的会记录 continue + # 保护:禁止跨角色合并(用户实体和AI助手实体不能互相合并) + if _would_merge_cross_role(a, b): + llm_records.append( + f"[LLM阻断] 跨角色合并被阻止: {a.id} ({a.name}) 与 {b.id} ({b.name})" + ) + continue _merge_attribute(a, b) # ID 重定向 try: @@ -934,6 +1067,9 @@ async def deduplicate_entities_and_edges( 返回:去重后的实体、语句→实体边、实体↔实体边。 """ local_llm_records: List[str] = [] # 作为“审计日志”的本地收集器 初始化,保留为了之后对于LLM决策追溯 + # 0) 标准化用户和AI助手实体名称(确保多轮对话中的变体名称统一) + _normalize_special_entity_names(entity_nodes) + # 1) 精确匹配 deduped_entities, id_redirect, exact_merge_map = accurate_match(entity_nodes) diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index b20112a2..225852d6 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -1341,14 +1341,19 @@ class ExtractionOrchestrator: dialog_data_list: List[DialogData] ) -> None: """ - 从 Neo4j 读取用户实体的最终 aliases,同步到 end_user 和 end_user_info 表 + 将本轮提取的用户别名同步到 end_user 和 end_user_info 表。 - 注意: - 1. other_name 使用本次对话提取的第一个别名(保持时间顺序) - 2. aliases 从 Neo4j 读取(保持完整性) + 注意:此方法在 Neo4j 写入之前调用,因此不能依赖 Neo4j 作为别名的权威数据源。 + 改为直接使用内存中去重后的 entity_nodes 的 aliases,与 PgSQL 已有的 aliases 合并。 + + 策略: + 1. 从内存中的 entity_nodes 提取本轮用户别名(current_aliases) + 2. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) + 3. 合并 db_aliases + current_aliases,去重保序 + 4. 写回 PgSQL Args: - entity_nodes: 实体节点列表 + entity_nodes: 去重后的实体节点列表(内存中) dialog_data_list: 对话数据列表 """ try: @@ -1361,23 +1366,28 @@ class ExtractionOrchestrator: logger.warning("end_user_id 为空,跳过用户别名同步") return - # 1. 提取本次对话的用户别名(保持 LLM 提取的原始顺序,不排序) + # 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) current_aliases = self._extract_current_aliases(entity_nodes) - # 2. 从 Neo4j 获取完整 aliases(权威数据源) - neo4j_aliases = await self._fetch_neo4j_user_aliases(end_user_id) + # 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 + # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) + neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id) + if neo4j_assistant_aliases: + before_count = len(current_aliases) + current_aliases = [ + a for a in current_aliases + if a.strip().lower() not in neo4j_assistant_aliases + ] + if len(current_aliases) < before_count: + logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名") - if not neo4j_aliases: - # Neo4j 中没有别名,使用本次对话提取的别名 - neo4j_aliases = current_aliases - if not neo4j_aliases: - logger.debug(f"aliases 为空,跳过同步: end_user_id={end_user_id}") - return + if not current_aliases: + logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}") + return - logger.info(f"本次对话提取的 aliases: {current_aliases}") - logger.info(f"Neo4j 中的完整 aliases: {neo4j_aliases}") + logger.info(f"本轮对话提取的 aliases: {current_aliases}") - # 3. 同步到数据库 + # 2. 同步到数据库 end_user_uuid = uuid.UUID(end_user_id) with get_db_context() as db: # 更新 end_user 表 @@ -1386,7 +1396,32 @@ class ExtractionOrchestrator: logger.warning(f"未找到 end_user_id={end_user_id} 的用户记录") return - new_name = self._resolve_other_name(end_user.other_name, current_aliases, neo4j_aliases) + # 3. 从 PgSQL 读取已有 aliases 并与本轮合并 + info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) + db_aliases = (info.aliases if info and info.aliases else []) + # 过滤掉占位名称 + db_aliases = [a for a in db_aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES] + + # 合并:已有 + 本轮新增,去重保序 + merged_aliases = list(db_aliases) + seen_lower = {a.strip().lower() for a in merged_aliases} + for alias in current_aliases: + if alias.strip().lower() not in seen_lower: + merged_aliases.append(alias) + seen_lower.add(alias.strip().lower()) + + # 最终过滤:从合并结果中排除 AI 助手别名(清理历史脏数据) + if neo4j_assistant_aliases: + merged_aliases = [ + a for a in merged_aliases + if a.strip().lower() not in neo4j_assistant_aliases + ] + + logger.info(f"PgSQL 已有 aliases: {db_aliases}") + logger.info(f"合并后 aliases: {merged_aliases}") + + # 更新 end_user 表 other_name + new_name = self._resolve_other_name(end_user.other_name, current_aliases, merged_aliases) if new_name is not None: end_user.other_name = new_name logger.info(f"更新 end_user 表 other_name → {new_name}") @@ -1394,15 +1429,14 @@ class ExtractionOrchestrator: logger.debug(f"end_user 表 other_name 保持不变: {end_user.other_name}") # 更新或创建 end_user_info 记录 - info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) if info: - new_name_info = self._resolve_other_name(info.other_name, current_aliases, neo4j_aliases) + new_name_info = self._resolve_other_name(info.other_name, current_aliases, merged_aliases) if new_name_info is not None: info.other_name = new_name_info logger.info(f"更新 end_user_info 表 other_name → {new_name_info}") - if info.aliases != neo4j_aliases: - info.aliases = neo4j_aliases - logger.info(f"同步 Neo4j aliases 到 end_user_info: {neo4j_aliases}") + if info.aliases != merged_aliases: + info.aliases = merged_aliases + logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}") else: first_alias = current_aliases[0].strip() if current_aliases else "" # 确保 first_alias 不是占位名称 @@ -1410,10 +1444,10 @@ class ExtractionOrchestrator: db.add(EndUserInfo( end_user_id=end_user_uuid, other_name=first_alias, - aliases=neo4j_aliases, + aliases=merged_aliases, meta_data={} )) - logger.info(f"创建 end_user_info 记录,other_name={first_alias}, aliases={neo4j_aliases}") + logger.info(f"创建 end_user_info 记录,other_name={first_alias}, aliases={merged_aliases}") db.commit() @@ -1428,21 +1462,41 @@ class ExtractionOrchestrator: def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: """从实体节点提取用户别名(保持 LLM 提取的原始顺序,不进行任何排序) - 这个方法直接返回 LLM 提取的别名列表,并过滤掉占位名称("用户"、"我"、"User"、"I")。 + 这个方法直接返回 LLM 提取的别名列表,并过滤掉: + 1. 占位名称("用户"、"我"、"User"、"I") + 2. AI 助手实体的别名(防止 AI 的名字被错误归入用户别名) + 第一个别名将被用作 other_name。 Args: entity_nodes: 实体节点列表 Returns: - 别名列表(保持 LLM 提取的原始顺序,已过滤占位名称) + 别名列表(保持 LLM 提取的原始顺序,已过滤占位名称和 AI 别名) """ + # 先收集 AI 助手实体的所有别名(用于排除) + assistant_names = set() + ASSISTANT_PLACEHOLDER_NAMES = {"AI助手", "助手", "AI Assistant", "Assistant"} + for entity in entity_nodes: + ent_name = getattr(entity, 'name', '').strip() + if ent_name in ASSISTANT_PLACEHOLDER_NAMES: + for alias in (getattr(entity, 'aliases', []) or []): + assistant_names.add(alias.strip().lower()) + # AI 助手的 name 本身也加入排除集 + assistant_names.add(ent_name.lower()) + + # 提取用户实体的别名,排除占位名称和 AI 助手别名 for entity in entity_nodes: if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] - # 过滤掉占位名称,防止 "用户"/"我"/"User"/"I" 被存入 aliases 和 other_name - filtered = [a for a in aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES] - logger.debug(f"提取到用户别名(原始顺序,已过滤占位名称): {filtered}") + filtered = [ + a for a in aliases + if a.strip() not in self.USER_PLACEHOLDER_NAMES + and a.strip().lower() not in assistant_names + ] + logger.debug(f"提取到用户别名(已过滤占位名称和AI别名): {filtered}") + if assistant_names: + logger.debug(f"已排除的AI助手别名: {assistant_names}") return filtered return [] @@ -1467,6 +1521,26 @@ class ExtractionOrchestrator: filtered = [a for a in aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES] return filtered + async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set: + """从 Neo4j 查询 AI 助手实体的所有别名(用于从用户别名中排除)""" + cypher = """ + MATCH (e:ExtractedEntity) + WHERE e.end_user_id = $end_user_id AND e.name IN ['AI助手', '助手', 'AI Assistant', 'Assistant'] + RETURN e.aliases AS aliases + """ + try: + result = await Neo4jConnector().execute_query(cypher, end_user_id=end_user_id) + assistant_aliases = set() + for record in (result or []): + for alias in (record.get('aliases') or []): + assistant_aliases.add(alias.strip().lower()) + if assistant_aliases: + logger.debug(f"Neo4j 中 AI 助手别名: {assistant_aliases}") + return assistant_aliases + except Exception as e: + logger.warning(f"查询 Neo4j AI 助手别名失败: {e}") + return set() + def _resolve_other_name( self, current: Optional[str], diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py index 147ed777..7fb74b82 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py @@ -61,6 +61,7 @@ class TripletExtractor: predicate_instructions=PREDICATE_DEFINITIONS, language=self._get_language(), ontology_types=self.ontology_types, + speaker=getattr(statement, 'speaker', None), ) # Create messages for LLM diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py index 0cea98f2..8d964eaf 100644 --- a/api/app/core/memory/utils/prompt/prompt_utils.py +++ b/api/app/core/memory/utils/prompt/prompt_utils.py @@ -1,6 +1,6 @@ import os from jinja2 import Environment, FileSystemLoader - +from app.core.memory.models.ontology_extraction_models import OntologyTypeList from app.core.memory.utils.log.logging_utils import log_prompt_rendering, log_template_rendering # Setup Jinja2 environment @@ -205,6 +205,7 @@ async def render_triplet_extraction_prompt( predicate_instructions: dict = None, language: str = "zh", ontology_types: "OntologyTypeList | None" = None, + speaker: str = None, ) -> str: """ Renders the triplet extraction prompt using the extract_triplet.jinja2 template. @@ -216,6 +217,7 @@ async def render_triplet_extraction_prompt( predicate_instructions: Optional predicate instructions language: The language to use for entity descriptions ("zh" for Chinese, "en" for English) ontology_types: Optional OntologyTypeList containing predefined ontology types for entity classification + speaker: Speaker role ("user" or "assistant") for the current statement Returns: Rendered prompt content as string @@ -223,7 +225,7 @@ async def render_triplet_extraction_prompt( template = prompt_env.get_template("extract_triplet.jinja2") # 准备本体类型数据 - ontology_type_section = "" + ontology_types: OntologyTypeList | None = None, ontology_type_names = [] type_hierarchy_hints = [] if ontology_types and ontology_types.types: @@ -240,6 +242,7 @@ async def render_triplet_extraction_prompt( ontology_types=ontology_type_section, ontology_type_names=ontology_type_names, type_hierarchy_hints=type_hierarchy_hints, + speaker=speaker, ) # 记录渲染结果到提示日志(与示例日志结构一致) log_prompt_rendering('triplet extraction', rendered_prompt) diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 6605532d..e7daf0bd 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -23,6 +23,16 @@ Extract entities and knowledge triplets from the given statement. ===Inputs=== **Chunk Content:** "{{ chunk_content }}" **Statement:** "{{ statement }}" +{% if speaker %} +**Speaker:** {{ speaker }} +{% if speaker == "assistant" %} +{% if language == "zh" %} +⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中提到的名字(如称呼用户为"远哥""VV"等)是**用户的别名**,不是 AI 助手的别名。请注意区分说话人的视角。 +{% else %} +⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user (e.g., calling the user "buddy", "VV") are **user's aliases**, NOT the AI assistant's aliases. Pay attention to the speaker's perspective. +{% endif %} +{% endif %} +{% endif %} {% if ontology_types %} ===Ontology Type Guidance=== @@ -88,6 +98,15 @@ Extract entities and knowledge triplets from the given statement. * "大家叫我小李,我全名叫李明" → aliases=["小李", "李明"](小李先出现,将成为 other_name) - 空值:如果没有别名,使用 `[]` - 重要:只提取本次对话中明确提到的别名,不要推测或添加未提及的名字 + - **🚨 归属区分:必须严格区分名称的归属对象。默认情况下,用户提到的名字归属用户实体。只有出现明确的第二人称命名表达(如"叫你""给你取名")时,才将名字归属 AI/助手实体。** + - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字(如"远哥""VV""思远大人")是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。** + * "我叫陈思远,我给AI取名为远仔" → 用户 aliases=["陈思远"],AI助手 aliases=["远仔"] + * "我叫vv" → 用户 aliases=["vv"](没有给AI取名的表达,名字归用户) + * [speaker=assistant] "好的,远哥/VV/思远大人" → 用户 aliases=["远哥", "VV", "思远大人"](AI 在称呼用户,这些是用户的别名) + * [speaker=assistant] "我叫陈仔" → AI助手 aliases=["陈仔"](AI 在自我介绍,这是 AI 的别名) + * ❌ 错误:将"远仔"放入用户的 aliases("远仔"是给AI取的名字,不是用户的名字) + * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases + * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases {% else %} - Include: nicknames, full names, abbreviations, alternative names - Order: **The FIRST alias will be used as the user's primary display name (other_name). Put the most important/frequently used name FIRST** @@ -97,6 +116,15 @@ Extract entities and knowledge triplets from the given statement. * "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name) - Empty: If no aliases, use `[]` - Important: Only extract aliases explicitly mentioned in current conversation, do not infer or add unmentioned names + - **🚨 Ownership distinction: By default, all names mentioned by the user belong to the user entity. Only assign a name to the AI/assistant entity when an explicit second-person naming expression (e.g., "I'll call you", "your name is") is present.** + - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user (e.g., "buddy", "VV", "boss") are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases.** + * "I'm Alex, I'll call you Buddy" → User aliases=["Alex"], AI assistant aliases=["Buddy"] + * "I'm vv" → User aliases=["vv"] (no AI-naming expression, name belongs to user) + * [speaker=assistant] "Sure thing, buddy/VV" → User aliases=["buddy", "VV"] (AI addressing the user, these are user's aliases) + * [speaker=assistant] "I'm Jarvis" → AI assistant aliases=["Jarvis"] (AI self-introduction, this is AI's alias) + * ❌ Wrong: putting "Buddy" in user's aliases ("Buddy" is a name for the AI, not the user) + * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases + * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases {% endif %} @@ -122,7 +150,58 @@ Extract entities and knowledge triplets from the given statement. -4. **ALIASES ORDER:** +4. **AI/ASSISTANT ENTITY SPECIAL HANDLING:** +{% if language == "zh" %} + - **🚨 默认规则:如果对话中没有出现明确指向 AI/助手的命名表达,则所有名字都归属于用户实体。不要猜测或推断某个名字是给 AI 取的。** + - 只有当用户**明确**对 AI/助手进行命名时,才创建 AI/助手实体并将对应名字放入其 aliases + - AI/助手实体的 name 字段:使用 "AI助手" + - 用户给 AI 取的名字:放入 AI/助手实体的 aliases + - **🚨 禁止将用户给 AI 取的名字放入用户实体的 aliases 中** + - **必须出现以下明确的命名表达才能判定为给 AI 取名:**「给你取名」「叫你」「称呼你为」「给AI取名」「你的名字是」「以后叫你」「你就叫」「你不叫X了」「你现在叫」等**第二人称(你)或明确指向 AI 的命名句式** + - **🚨 "你不叫X了"/"你不叫X,你叫Y" 句式:X 和 Y 都是 AI 的名字(旧名和新名),绝对不是用户的名字。因为句子主语是"你"(AI)。** + - **以下情况名字归属用户,不是给 AI 取名:**「我叫」「我的名字是」「叫我」「我是」「大家叫我」「我的英文名是」「我的昵称是」等**第一人称(我)的自我介绍句式** + - **🚨 speaker=assistant 时的特殊规则:** + * AI 用来称呼用户的名字(如"远哥""VV""思远大人")→ 归入**用户**实体的 aliases + * AI 自称的名字(如"我叫陈仔""我是你的助手")→ 归入**AI助手**实体的 aliases + * 判断依据:AI 说"你叫X"或用 X 称呼用户 → X 是用户别名;AI 说"我叫X"或"我是X" → X 是 AI 别名 + - 示例: + * "我叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户) + * "我的英文名叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户) + * "我叫陈思远,我给AI取名为远仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["远仔"] + * "叫你小助,我自己叫老王" → 用户实体: name="用户", aliases=["老王"];AI实体: name="AI助手", aliases=["小助"] + * "你不叫远仔了,你现在叫陈仔" → AI实体: name="AI助手", aliases=["陈仔"]("远仔"是AI旧名,"陈仔"是AI新名,都归AI。不要把"远仔"或"陈仔"放入用户的aliases) + * [speaker=assistant] "好的远哥/VV/思远大人,今天想干点啥?" → 用户实体: name="用户", aliases=["远哥", "VV", "思远大人"](AI 在称呼用户) + * [speaker=assistant] "你叫陈思远(或VV),我叫陈仔" → 用户实体: name="用户", aliases=["陈思远", "VV"];AI实体: name="AI助手", aliases=["陈仔"] + * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases(没有任何给 AI 取名的表达) + * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases + * ❌ 错误:aliases=["陈思远", "远仔"]("远仔"是给AI取的名字,不是用户的名字) +{% else %} + - **🚨 Default rule: If there is NO explicit AI/assistant naming expression in the conversation, ALL names belong to the user entity. Do NOT guess or infer that a name is for the AI.** + - Only create an AI/assistant entity when the user **explicitly** names the AI/assistant + - AI/assistant entity name field: use "AI Assistant" + - Names the user gives to the AI: put in the AI/assistant entity's aliases + - **🚨 NEVER put names given to the AI into the user entity's aliases** + - **An AI-naming expression MUST be present to assign a name to the AI:** "I'll call you", "your name is", "I name you", "let me call you", "you'll be called", "you're not called X anymore", "your new name is", etc. — **second-person ("you") or explicit AI-directed naming patterns** + - **🚨 "You're not called X anymore" / "You're not X, you're Y" pattern: BOTH X and Y are AI's names (old and new). They are NOT user's names. The subject is "you" (the AI).** + - **These patterns mean the name belongs to the USER, NOT the AI:** "I'm", "my name is", "call me", "I am", "people call me", "my English name is", "my nickname is", etc. — **first-person ("I"/"me") self-introduction patterns** + - **🚨 Special rules when speaker=assistant:** + * Names the AI uses to address the user (e.g., "buddy", "VV", "boss") → belong to the **user** entity's aliases + * Names the AI uses for itself (e.g., "I'm Jarvis", "I am your assistant") → belong to the **AI assistant** entity's aliases + * Rule: AI says "you are X" or calls user X → X is user's alias; AI says "I'm X" or "I am X" → X is AI's alias + - Examples: + * "I'm vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user) + * "My English name is vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user) + * "I'm Alex, I'll call you Buddy" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Buddy"] + * "Call yourself Jarvis, my name is Tony" → User entity: name="User", aliases=["Tony"]; AI entity: name="AI Assistant", aliases=["Jarvis"] + * "You're not called Jarvis anymore, your new name is Friday" → AI entity: name="AI Assistant", aliases=["Friday"] (both "Jarvis" and "Friday" are AI names, NOT user names) + * [speaker=assistant] "Sure thing, buddy/VV!" → User entity: name="User", aliases=["buddy", "VV"] (AI addressing the user) + * [speaker=assistant] "You're Alex, and I'm Jarvis" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Jarvis"] + * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases (no AI-naming expression exists) + * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases + * ❌ Wrong: aliases=["Alex", "Buddy"] ("Buddy" is a name for the AI, not the user) +{% endif %} + +5. **ALIASES ORDER:** {% if language == "zh" %} - 顺序优先级:按出现顺序,先出现的在前 {% else %} @@ -202,8 +281,19 @@ Output: {"entity_idx": 0, "name": "Tripod", "type": "Equipment", "description": "Photography equipment accessory", "example": "", "aliases": ["Camera Tripod"], "is_explicit_memory": false} ] } + +**Example 4 (User vs AI alias distinction - English output):** "I'm Alex, and I'll call you Buddy" +Output: +{ + "triplets": [ + {"subject_name": "User", "subject_id": 0, "predicate": "NAMED", "object_name": "AI Assistant", "object_id": 1, "value": "Buddy"} + ], + "entities": [ + {"entity_idx": 0, "name": "User", "type": "Person", "description": "The user", "example": "", "aliases": ["Alex"], "is_explicit_memory": false}, + {"entity_idx": 1, "name": "AI Assistant", "type": "Person", "description": "The user's AI assistant", "example": "", "aliases": ["Buddy"], "is_explicit_memory": false} + ] +} {% else %} -**Example 1 (English input → Chinese output):** "I plan to travel to Paris next week and visit the Louvre." Output: { "triplets": [ @@ -258,6 +348,39 @@ Output: ] } +**Example 6 (用户与AI别名区分 - Chinese):** "我称呼自己为陈思远,我给AI取名为远仔" +Output: +{ + "triplets": [ + {"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "远仔"} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["陈思远"], "is_explicit_memory": false}, + {"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["远仔"], "is_explicit_memory": false} + ] +} + +**Example 7 (纯用户自我介绍,无AI命名 - Chinese):** "我叫vv" +Output: +{ + "triplets": [], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["vv"], "is_explicit_memory": false} + ] +} + +**Example 8 (给AI改名 - Chinese):** "你不叫远仔了,你现在叫陈仔" +Output: +{ + "triplets": [ + {"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "陈仔"} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": [], "is_explicit_memory": false}, + {"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["陈仔"], "is_explicit_memory": false} + ] +} + {% endif %} ===End of Examples=== From 9cc19047b428fa732be0a18a5f0e23198d9a3482 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 3 Apr 2026 12:34:04 +0800 Subject: [PATCH 2/4] fix(memory): prevent cross-role alias contamination in entity dedup - Extract user aliases from raw dialog statements instead of post-dedup entities to bypass merge pollution - Add alias cross-cleaning step in _normalize_special_entity_names to strip AI assistant aliases from user entities before dedup - Call clean_cross_role_aliases after second-layer dedup to handle historical dirty data merged from Neo4j - Fix syntax error in prompt_utils.py (ontology_types variable assignment) --- .../deduplication/deduped_and_disamb.py | 16 +++++ .../deduplication/two_stage_dedup.py | 5 ++ .../extraction_orchestrator.py | 67 +++++++++++-------- .../core/memory/utils/prompt/prompt_utils.py | 2 +- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index ae906aa8..1401fe5c 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -269,6 +269,22 @@ def _normalize_special_entity_names( ent.name = _CANONICAL_ASSISTANT_NAME ent.entity_type = _CANONICAL_ASSISTANT_TYPE + # 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除 + # 防止 LLM 把 AI 的名字错误放入用户实体的 aliases + assistant_alias_set = set() + for ent in entity_nodes: + if _is_assistant_entity(ent): + for alias in (getattr(ent, "aliases", []) or []): + assistant_alias_set.add(alias.strip().lower()) + + if assistant_alias_set: + for ent in entity_nodes: + if _is_user_entity(ent): + original_aliases = getattr(ent, "aliases", []) or [] + cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set] + if len(cleaned) < len(original_aliases): + ent.aliases = cleaned + def clean_cross_role_aliases( entity_nodes: List[ExtractedEntityNode], diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py index 4b9c5718..13534b3d 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py @@ -15,6 +15,7 @@ from app.core.memory.models.message_models import DialogData from app.core.memory.models.variate_config import ExtractionPipelineConfig from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( deduplicate_entities_and_edges, + clean_cross_role_aliases, ) from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import ( second_layer_dedup_and_merge_with_neo4j, @@ -100,6 +101,10 @@ async def dedup_layers_and_merge_and_return( except Exception as e: print(f"Second-layer dedup failed: {e}") + # 第二层去重后,清洗用户/AI助手之间的别名交叉污染 + # 第二层从 Neo4j 合并了旧实体,可能带入历史脏数据 + clean_cross_role_aliases(fused_entity_nodes) + return ( dialogue_nodes, chunk_nodes, diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index 225852d6..bbf06e37 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -1367,7 +1367,7 @@ class ExtractionOrchestrator: return # 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) - current_aliases = self._extract_current_aliases(entity_nodes) + current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list) # 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) @@ -1459,45 +1459,58 @@ class ExtractionOrchestrator: # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 USER_PLACEHOLDER_NAMES = {'用户', '我', 'User', 'I'} - def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: - """从实体节点提取用户别名(保持 LLM 提取的原始顺序,不进行任何排序) + def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: + """从用户发言的原始实体中提取别名(绕过去重污染) - 这个方法直接返回 LLM 提取的别名列表,并过滤掉: - 1. 占位名称("用户"、"我"、"User"、"I") - 2. AI 助手实体的别名(防止 AI 的名字被错误归入用户别名) - - 第一个别名将被用作 other_name。 + 策略: + 1. 从 dialog_data_list 中找到 speaker="user" 的 statement + 2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases + 3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响 Args: - entity_nodes: 实体节点列表 + entity_nodes: 去重后的实体节点列表(备用) + dialog_data_list: 对话数据列表(优先使用) Returns: - 别名列表(保持 LLM 提取的原始顺序,已过滤占位名称和 AI 别名) + 别名列表(保持原始顺序,已过滤) """ - # 先收集 AI 助手实体的所有别名(用于排除) - assistant_names = set() - ASSISTANT_PLACEHOLDER_NAMES = {"AI助手", "助手", "AI Assistant", "Assistant"} - for entity in entity_nodes: - ent_name = getattr(entity, 'name', '').strip() - if ent_name in ASSISTANT_PLACEHOLDER_NAMES: - for alias in (getattr(entity, 'aliases', []) or []): - assistant_names.add(alias.strip().lower()) - # AI 助手的 name 本身也加入排除集 - assistant_names.add(ent_name.lower()) - - # 提取用户实体的别名,排除占位名称和 AI 助手别名 + # 优先从原始 dialog_data_list 中提取(绕过去重污染) + if dialog_data_list: + all_user_aliases = [] + seen_lower = set() + for dialog in dialog_data_list: + for chunk in dialog.chunks: + speaker = getattr(chunk, 'speaker', None) + for statement in chunk.statements: + stmt_speaker = getattr(statement, 'speaker', None) or speaker + if stmt_speaker != "user": + continue + triplet_info = getattr(statement, 'triplet_extraction_info', None) + if not triplet_info: + continue + for entity in (triplet_info.entities or []): + ent_name = getattr(entity, 'name', '').strip() + if ent_name in self.USER_PLACEHOLDER_NAMES: + for alias in (getattr(entity, 'aliases', []) or []): + a = alias.strip() + if a and a not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: + all_user_aliases.append(a) + seen_lower.add(a.lower()) + if all_user_aliases: + logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") + return all_user_aliases + + # 兜底:从去重后的 entity_nodes 提取(旧逻辑) for entity in entity_nodes: if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] filtered = [ a for a in aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES - and a.strip().lower() not in assistant_names ] - logger.debug(f"提取到用户别名(已过滤占位名称和AI别名): {filtered}") - if assistant_names: - logger.debug(f"已排除的AI助手别名: {assistant_names}") - return filtered + if filtered: + logger.debug(f"从去重后实体提取到别名(兜底): {filtered}") + return filtered return [] diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py index 8d964eaf..a1ad885e 100644 --- a/api/app/core/memory/utils/prompt/prompt_utils.py +++ b/api/app/core/memory/utils/prompt/prompt_utils.py @@ -225,7 +225,7 @@ async def render_triplet_extraction_prompt( template = prompt_env.get_template("extract_triplet.jinja2") # 准备本体类型数据 - ontology_types: OntologyTypeList | None = None, + ontology_type_section = None ontology_type_names = [] type_hierarchy_hints = [] if ontology_types and ontology_types.types: From 15b3ce3dd51ac57181a0436f96430f40d3485e69 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 3 Apr 2026 13:15:57 +0800 Subject: [PATCH 3/4] refactor(memory): deduplicate assistant alias query and fix case-sensitive placeholder matching - Extract fetch_neo4j_assistant_aliases() into deduped_and_disamb.py as single source of truth, replacing inline Cypher in write_tools and extraction_orchestrator - Normalize USER_PLACEHOLDER_NAMES to lowercase and apply .lower() on all comparisons to prevent case-variant names leaking into aliases --- .../core/memory/agent/utils/write_tools.py | 11 +---- .../deduplication/deduped_and_disamb.py | 45 +++++++++++++++++++ .../extraction_orchestrator.py | 43 +++++++----------- 3 files changed, 62 insertions(+), 37 deletions(-) diff --git a/api/app/core/memory/agent/utils/write_tools.py b/api/app/core/memory/agent/utils/write_tools.py index 22b8138d..bae4643e 100644 --- a/api/app/core/memory/agent/utils/write_tools.py +++ b/api/app/core/memory/agent/utils/write_tools.py @@ -158,20 +158,13 @@ async def write( try: from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( clean_cross_role_aliases, + fetch_neo4j_assistant_aliases, ) neo4j_assistant_aliases = set() if all_entity_nodes: _eu_id = all_entity_nodes[0].end_user_id if _eu_id: - _cypher = """ - MATCH (e:ExtractedEntity) - WHERE e.end_user_id = $end_user_id AND e.name IN ['AI助手', '助手', 'AI Assistant', 'Assistant'] - RETURN e.aliases AS aliases - """ - _result = await neo4j_connector.execute_query(_cypher, end_user_id=_eu_id) - for _record in (_result or []): - for _alias in (_record.get('aliases') or []): - neo4j_assistant_aliases.add(_alias.strip().lower()) + neo4j_assistant_aliases = await fetch_neo4j_assistant_aliases(neo4j_connector, _eu_id) clean_cross_role_aliases(all_entity_nodes, external_assistant_aliases=neo4j_assistant_aliases) logger.info(f"Neo4j 写入前别名清洗完成,AI助手别名排除集大小: {len(neo4j_assistant_aliases)}") except Exception as e: diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index 1401fe5c..f2ad2ae9 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -286,6 +286,51 @@ def _normalize_special_entity_names( ent.aliases = cleaned +async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> set: + """从 Neo4j 查询 AI 助手实体的所有别名(小写归一化)。 + + 这是助手别名查询的唯一入口,供 write_tools 和 extraction_orchestrator 共用, + 避免多处维护相同的 Cypher 和名称列表。 + + Args: + neo4j_connector: Neo4j 连接器实例(需提供 execute_query 方法) + end_user_id: 终端用户 ID + + Returns: + 小写归一化后的助手别名集合 + """ + import logging + logger = logging.getLogger(__name__) + + # 使用模块级 _ASSISTANT_PLACEHOLDER_NAMES 的标题化形式构建查询名称列表, + # 保持与 _normalize_special_entity_names 标准化后的名称一致 + query_names = [_CANONICAL_ASSISTANT_NAME] # "AI助手" + # 补充英文常见变体 + query_names.extend(["助手", "AI Assistant", "Assistant"]) + # 去重 + query_names = list(dict.fromkeys(query_names)) + + cypher = """ + MATCH (e:ExtractedEntity) + WHERE e.end_user_id = $end_user_id AND e.name IN $names + RETURN e.aliases AS aliases + """ + try: + result = await neo4j_connector.execute_query( + cypher, end_user_id=end_user_id, names=query_names + ) + assistant_aliases: set = set() + for record in (result or []): + for alias in (record.get("aliases") or []): + assistant_aliases.add(alias.strip().lower()) + if assistant_aliases: + logger.debug(f"Neo4j 中 AI 助手别名: {assistant_aliases}") + return assistant_aliases + except Exception as e: + logger.warning(f"查询 Neo4j AI 助手别名失败: {e}") + return set() + + def clean_cross_role_aliases( entity_nodes: List[ExtractedEntityNode], external_assistant_aliases: set = None, diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index bbf06e37..a31099ab 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -1400,7 +1400,7 @@ class ExtractionOrchestrator: info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) db_aliases = (info.aliases if info and info.aliases else []) # 过滤掉占位名称 - db_aliases = [a for a in db_aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES] + db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] # 合并:已有 + 本轮新增,去重保序 merged_aliases = list(db_aliases) @@ -1440,7 +1440,7 @@ class ExtractionOrchestrator: else: first_alias = current_aliases[0].strip() if current_aliases else "" # 确保 first_alias 不是占位名称 - if first_alias and first_alias not in self.USER_PLACEHOLDER_NAMES: + if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES: db.add(EndUserInfo( end_user_id=end_user_uuid, other_name=first_alias, @@ -1457,7 +1457,7 @@ class ExtractionOrchestrator: # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 - USER_PLACEHOLDER_NAMES = {'用户', '我', 'User', 'I'} + USER_PLACEHOLDER_NAMES = {'用户', '我', 'user', 'i'} def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: """从用户发言的原始实体中提取别名(绕过去重污染) @@ -1490,10 +1490,10 @@ class ExtractionOrchestrator: continue for entity in (triplet_info.entities or []): ent_name = getattr(entity, 'name', '').strip() - if ent_name in self.USER_PLACEHOLDER_NAMES: + if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: for alias in (getattr(entity, 'aliases', []) or []): a = alias.strip() - if a and a not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: + if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: all_user_aliases.append(a) seen_lower.add(a.lower()) if all_user_aliases: @@ -1502,11 +1502,11 @@ class ExtractionOrchestrator: # 兜底:从去重后的 entity_nodes 提取(旧逻辑) for entity in entity_nodes: - if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES: + if getattr(entity, 'name', '').strip().lower() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] filtered = [ a for a in aliases - if a.strip() not in self.USER_PLACEHOLDER_NAMES + if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES ] if filtered: logger.debug(f"从去重后实体提取到别名(兜底): {filtered}") @@ -1531,28 +1531,15 @@ class ExtractionOrchestrator: logger.debug(f"Neo4j 用户实体 aliases 为空: end_user_id={end_user_id}") return [] # 过滤掉占位名称,防止历史脏数据传播 - filtered = [a for a in aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES] + filtered = [a for a in aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] return filtered async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set: """从 Neo4j 查询 AI 助手实体的所有别名(用于从用户别名中排除)""" - cypher = """ - MATCH (e:ExtractedEntity) - WHERE e.end_user_id = $end_user_id AND e.name IN ['AI助手', '助手', 'AI Assistant', 'Assistant'] - RETURN e.aliases AS aliases - """ - try: - result = await Neo4jConnector().execute_query(cypher, end_user_id=end_user_id) - assistant_aliases = set() - for record in (result or []): - for alias in (record.get('aliases') or []): - assistant_aliases.add(alias.strip().lower()) - if assistant_aliases: - logger.debug(f"Neo4j 中 AI 助手别名: {assistant_aliases}") - return assistant_aliases - except Exception as e: - logger.warning(f"查询 Neo4j AI 助手别名失败: {e}") - return set() + from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( + fetch_neo4j_assistant_aliases, + ) + return await fetch_neo4j_assistant_aliases(Neo4jConnector(), end_user_id) def _resolve_other_name( self, @@ -1571,16 +1558,16 @@ class ExtractionOrchestrator: 注意:返回值不允许是占位名称("用户"、"我"、"User"、"I") """ # 当前值为空或为占位名称时,需要更新 - if not current or not current.strip() or current.strip() in self.USER_PLACEHOLDER_NAMES: + if not current or not current.strip() or current.strip().lower() in self.USER_PLACEHOLDER_NAMES: candidate = current_aliases[0].strip() if current_aliases else None # 确保候选值不是占位名称 - if candidate and candidate in self.USER_PLACEHOLDER_NAMES: + if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES: return None return candidate if current not in neo4j_aliases: candidate = neo4j_aliases[0].strip() if neo4j_aliases else None # 确保候选值不是占位名称 - if candidate and candidate in self.USER_PLACEHOLDER_NAMES: + if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES: return None return candidate From c4ff1a325bb048b076fa86306542c8606569b0a2 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 3 Apr 2026 14:38:55 +0800 Subject: [PATCH 4/4] refactor(memory): harden alias extraction and sync PgSQL with Neo4j deduped aliases - Strengthen anti-hallucination rules in extract_triplet prompt to enforce verbatim-only alias extraction, removing suggestive examples - Add _extract_deduped_entity_aliases to sync historical aliases from Neo4j two-stage dedup into PgSQL end_user_info - Remove unused _fetch_neo4j_user_aliases; reuse injected connector instead of instantiating new Neo4jConnector - Simplify _would_merge_cross_role and reuse clean_cross_role_aliases in _normalize_special_entity_names - Reuse _USER_PLACEHOLDER_NAMES from dedup module to avoid duplication --- .../deduplication/deduped_and_disamb.py | 45 ++---- .../extraction_orchestrator.py | 151 ++++++++++-------- .../prompt/prompts/extract_triplet.jinja2 | 38 +++-- 3 files changed, 117 insertions(+), 117 deletions(-) diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index f2ad2ae9..7e0976fe 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -4,6 +4,7 @@ import asyncio import difflib # 提供字符串相似度计算工具 import importlib +import logging import os import re from datetime import datetime @@ -16,6 +17,8 @@ from app.core.memory.models.graph_models import ( ) from app.core.memory.models.variate_config import DedupConfig +logger = logging.getLogger(__name__) + # 模块级类型统一工具函数 def _unify_entity_type(canonical: ExtractedEntityNode, losing: ExtractedEntityNode, suggested_type: str = None) -> None: @@ -233,15 +236,10 @@ def _would_merge_cross_role(a: ExtractedEntityNode, b: ExtractedEntityNode) -> b 用户实体和AI助手实体永远不应该被合并在一起。 如果一方是用户实体、另一方是AI助手实体,返回 True(阻止合并)。 """ - a_is_user = _is_user_entity(a) - a_is_assistant = _is_assistant_entity(a) - b_is_user = _is_user_entity(b) - b_is_assistant = _is_assistant_entity(b) - - # 用户 + AI助手 → 阻止 - if (a_is_user and b_is_assistant) or (a_is_assistant and b_is_user): - return True - return False + return ( + (_is_user_entity(a) and _is_assistant_entity(b)) + or (_is_assistant_entity(a) and _is_user_entity(b)) + ) def _normalize_special_entity_names( @@ -269,21 +267,8 @@ def _normalize_special_entity_names( ent.name = _CANONICAL_ASSISTANT_NAME ent.entity_type = _CANONICAL_ASSISTANT_TYPE - # 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除 - # 防止 LLM 把 AI 的名字错误放入用户实体的 aliases - assistant_alias_set = set() - for ent in entity_nodes: - if _is_assistant_entity(ent): - for alias in (getattr(ent, "aliases", []) or []): - assistant_alias_set.add(alias.strip().lower()) - - if assistant_alias_set: - for ent in entity_nodes: - if _is_user_entity(ent): - original_aliases = getattr(ent, "aliases", []) or [] - cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set] - if len(cleaned) < len(original_aliases): - ent.aliases = cleaned + # 第二步:清洗用户/AI助手之间的别名交叉污染(复用 clean_cross_role_aliases) + clean_cross_role_aliases(entity_nodes) async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> set: @@ -299,15 +284,9 @@ async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> se Returns: 小写归一化后的助手别名集合 """ - import logging - logger = logging.getLogger(__name__) - - # 使用模块级 _ASSISTANT_PLACEHOLDER_NAMES 的标题化形式构建查询名称列表, - # 保持与 _normalize_special_entity_names 标准化后的名称一致 - query_names = [_CANONICAL_ASSISTANT_NAME] # "AI助手" - # 补充英文常见变体 - query_names.extend(["助手", "AI Assistant", "Assistant"]) - # 去重 + # 查询名称列表:规范名称 + 常见变体(与 _normalize_special_entity_names 标准化后一致) + query_names = [_CANONICAL_ASSISTANT_NAME, *_ASSISTANT_PLACEHOLDER_NAMES] + # 去重保序 query_names = list(dict.fromkeys(query_names)) cypher = """ diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index a31099ab..3229674d 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -44,6 +44,10 @@ from app.core.memory.models.variate_config import ( from app.core.memory.storage_services.extraction_engine.deduplication.two_stage_dedup import ( dedup_layers_and_merge_and_return, ) +from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( + _USER_PLACEHOLDER_NAMES, + fetch_neo4j_assistant_aliases, +) from app.core.memory.storage_services.extraction_engine.knowledge_extraction.embedding_generation import ( embedding_generation, generate_entity_embeddings_from_triplets, @@ -1348,12 +1352,13 @@ class ExtractionOrchestrator: 策略: 1. 从内存中的 entity_nodes 提取本轮用户别名(current_aliases) - 2. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) - 3. 合并 db_aliases + current_aliases,去重保序 - 4. 写回 PgSQL + 2. 从去重后的 entity_nodes 中提取完整别名(含 Neo4j 二层去重合并的历史别名) + 3. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) + 4. 合并 db_aliases + deduped_aliases + current_aliases,去重保序 + 5. 写回 PgSQL Args: - entity_nodes: 去重后的实体节点列表(内存中) + entity_nodes: 去重后的实体节点列表(内存中,含二层去重合并结果) dialog_data_list: 对话数据列表 """ try: @@ -1369,7 +1374,12 @@ class ExtractionOrchestrator: # 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list) - # 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 + # 1.5 从去重后的 entity_nodes 中提取完整别名 + # 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 中, + # 这里提取出来确保 PgSQL 与 Neo4j 的别名保持同步 + deduped_aliases = self._extract_deduped_entity_aliases(entity_nodes) + + # 1.6 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id) if neo4j_assistant_aliases: @@ -1380,12 +1390,19 @@ class ExtractionOrchestrator: ] if len(current_aliases) < before_count: logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名") + # 同样过滤 deduped_aliases + deduped_aliases = [ + a for a in deduped_aliases + if a.strip().lower() not in neo4j_assistant_aliases + ] - if not current_aliases: + if not current_aliases and not deduped_aliases: logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}") return logger.info(f"本轮对话提取的 aliases: {current_aliases}") + if deduped_aliases: + logger.info(f"去重后实体的完整 aliases(含历史): {deduped_aliases}") # 2. 同步到数据库 end_user_uuid = uuid.UUID(end_user_id) @@ -1402,9 +1419,15 @@ class ExtractionOrchestrator: # 过滤掉占位名称 db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] - # 合并:已有 + 本轮新增,去重保序 + # 合并:已有 + 去重后完整别名 + 本轮新增,去重保序 merged_aliases = list(db_aliases) seen_lower = {a.strip().lower() for a in merged_aliases} + # 先合并去重后实体的完整别名(含 Neo4j 历史别名) + for alias in deduped_aliases: + if alias.strip().lower() not in seen_lower: + merged_aliases.append(alias) + seen_lower.add(alias.strip().lower()) + # 再合并本轮新提取的别名 for alias in current_aliases: if alias.strip().lower() not in seen_lower: merged_aliases.append(alias) @@ -1438,7 +1461,9 @@ class ExtractionOrchestrator: info.aliases = merged_aliases logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}") else: - first_alias = current_aliases[0].strip() if current_aliases else "" + first_alias = current_aliases[0].strip() if current_aliases else ( + deduped_aliases[0].strip() if deduped_aliases else "" + ) # 确保 first_alias 不是占位名称 if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES: db.add(EndUserInfo( @@ -1457,50 +1482,67 @@ class ExtractionOrchestrator: # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 - USER_PLACEHOLDER_NAMES = {'用户', '我', 'user', 'i'} + # 复用 deduped_and_disamb 模块级常量,避免重复维护 + USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: - """从用户发言的原始实体中提取别名(绕过去重污染) + """从用户发言的原始实体中提取本轮新增别名(绕过去重污染) 策略: - 1. 从 dialog_data_list 中找到 speaker="user" 的 statement - 2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases - 3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响 + 仅从 dialog_data_list 中找到 speaker="user" 的 statement, + 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases。 + 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响。 + + 注意:不再使用去重后 entity_nodes 作为兜底,因为二层去重会将 Neo4j 历史别名 + 合并进来,导致历史别名被误认为"本轮提取"。历史别名的同步由 + _extract_deduped_entity_aliases 负责。 Args: - entity_nodes: 去重后的实体节点列表(备用) - dialog_data_list: 对话数据列表(优先使用) + entity_nodes: 去重后的实体节点列表(未使用,保留参数兼容性) + dialog_data_list: 对话数据列表 Returns: 别名列表(保持原始顺序,已过滤) """ - # 优先从原始 dialog_data_list 中提取(绕过去重污染) - if dialog_data_list: - all_user_aliases = [] - seen_lower = set() - for dialog in dialog_data_list: - for chunk in dialog.chunks: - speaker = getattr(chunk, 'speaker', None) - for statement in chunk.statements: - stmt_speaker = getattr(statement, 'speaker', None) or speaker - if stmt_speaker != "user": - continue - triplet_info = getattr(statement, 'triplet_extraction_info', None) - if not triplet_info: - continue - for entity in (triplet_info.entities or []): - ent_name = getattr(entity, 'name', '').strip() - if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: - for alias in (getattr(entity, 'aliases', []) or []): - a = alias.strip() - if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: - all_user_aliases.append(a) - seen_lower.add(a.lower()) - if all_user_aliases: - logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") - return all_user_aliases + if not dialog_data_list: + return [] - # 兜底:从去重后的 entity_nodes 提取(旧逻辑) + all_user_aliases = [] + seen_lower = set() + for dialog in dialog_data_list: + for chunk in dialog.chunks: + speaker = getattr(chunk, 'speaker', None) + for statement in chunk.statements: + stmt_speaker = getattr(statement, 'speaker', None) or speaker + if stmt_speaker != "user": + continue + triplet_info = getattr(statement, 'triplet_extraction_info', None) + if not triplet_info: + continue + for entity in (triplet_info.entities or []): + ent_name = getattr(entity, 'name', '').strip() + if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: + for alias in (getattr(entity, 'aliases', []) or []): + a = alias.strip() + if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: + all_user_aliases.append(a) + seen_lower.add(a.lower()) + if all_user_aliases: + logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") + return all_user_aliases + + def _extract_deduped_entity_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: + """从去重后的用户实体中提取完整别名列表。 + + 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 的用户实体中, + 因此这里提取到的别名包含了历史积累的所有别名,可用于同步到 PgSQL。 + + Args: + entity_nodes: 去重后的实体节点列表(含二层去重合并结果) + + Returns: + 别名列表(已过滤占位名称,去重保序) + """ for entity in entity_nodes: if getattr(entity, 'name', '').strip().lower() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] @@ -1509,37 +1551,12 @@ class ExtractionOrchestrator: if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES ] if filtered: - logger.debug(f"从去重后实体提取到别名(兜底): {filtered}") return filtered return [] - - async def _fetch_neo4j_user_aliases(self, end_user_id: str) -> List[str]: - """从 Neo4j 查询用户实体的完整 aliases 列表(已过滤占位名称)""" - cypher = """ - MATCH (e:ExtractedEntity) - WHERE e.end_user_id = $end_user_id AND e.name IN ['用户', '我', 'User', 'I'] - RETURN e.aliases AS aliases - LIMIT 1 - """ - result = await Neo4jConnector().execute_query(cypher, end_user_id=end_user_id) - if not result: - logger.debug(f"Neo4j 中未找到用户实体: end_user_id={end_user_id}") - return [] - aliases = result[0].get('aliases') or [] - if not aliases: - logger.debug(f"Neo4j 用户实体 aliases 为空: end_user_id={end_user_id}") - return [] - # 过滤掉占位名称,防止历史脏数据传播 - filtered = [a for a in aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] - return filtered - async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set: """从 Neo4j 查询 AI 助手实体的所有别名(用于从用户别名中排除)""" - from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( - fetch_neo4j_assistant_aliases, - ) - return await fetch_neo4j_assistant_aliases(Neo4jConnector(), end_user_id) + return await fetch_neo4j_assistant_aliases(self.connector, end_user_id) def _resolve_other_name( self, diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index e7daf0bd..7ded48a4 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -27,9 +27,9 @@ Extract entities and knowledge triplets from the given statement. **Speaker:** {{ speaker }} {% if speaker == "assistant" %} {% if language == "zh" %} -⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中提到的名字(如称呼用户为"远哥""VV"等)是**用户的别名**,不是 AI 助手的别名。请注意区分说话人的视角。 +⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中用来称呼用户的名字是**用户的别名**,不是 AI 助手的别名。但只能提取原文中逐字出现的名字,严禁推测或创造原文中不存在的别名变体。 {% else %} -⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user (e.g., calling the user "buddy", "VV") are **user's aliases**, NOT the AI assistant's aliases. Pay attention to the speaker's perspective. +⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user are **user's aliases**, NOT the AI assistant's aliases. But only extract names that appear VERBATIM in the text — never infer or fabricate alias variants. {% endif %} {% endif %} {% endif %} @@ -97,16 +97,17 @@ Extract entities and knowledge triplets from the given statement. * "我叫张三,大家叫我小张" → aliases=["张三", "小张"](张三是第一个,将成为 other_name) * "大家叫我小李,我全名叫李明" → aliases=["小李", "李明"](小李先出现,将成为 other_name) - 空值:如果没有别名,使用 `[]` - - 重要:只提取本次对话中明确提到的别名,不要推测或添加未提及的名字 + - **🚨🚨🚨 严禁幻觉:只提取对话原文中逐字出现的别名,绝对不能推测、衍生或创造任何未在原文中出现的名字。例如,看到"陈思远"不能自行添加"思远大人""远哥""小远"等变体。如果原文没有这些字,就不能出现在 aliases 中。** - **🚨 归属区分:必须严格区分名称的归属对象。默认情况下,用户提到的名字归属用户实体。只有出现明确的第二人称命名表达(如"叫你""给你取名")时,才将名字归属 AI/助手实体。** - - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字(如"远哥""VV""思远大人")是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。** + - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。但同样只能提取原文中逐字出现的称呼,不能推测。** * "我叫陈思远,我给AI取名为远仔" → 用户 aliases=["陈思远"],AI助手 aliases=["远仔"] * "我叫vv" → 用户 aliases=["vv"](没有给AI取名的表达,名字归用户) - * [speaker=assistant] "好的,远哥/VV/思远大人" → 用户 aliases=["远哥", "VV", "思远大人"](AI 在称呼用户,这些是用户的别名) + * [speaker=assistant] "好的,VV" → 用户 aliases=["VV"](AI 在称呼用户,原文中出现了"VV") * [speaker=assistant] "我叫陈仔" → AI助手 aliases=["陈仔"](AI 在自我介绍,这是 AI 的别名) * ❌ 错误:将"远仔"放入用户的 aliases("远仔"是给AI取的名字,不是用户的名字) * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases - * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases + * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases + * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) {% else %} - Include: nicknames, full names, abbreviations, alternative names - Order: **The FIRST alias will be used as the user's primary display name (other_name). Put the most important/frequently used name FIRST** @@ -115,16 +116,17 @@ Extract entities and knowledge triplets from the given statement. * "I'm John, people call me Johnny" → aliases=["John", "Johnny"] (John is first, will become other_name) * "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name) - Empty: If no aliases, use `[]` - - Important: Only extract aliases explicitly mentioned in current conversation, do not infer or add unmentioned names + - **🚨🚨🚨 NO HALLUCINATION: Only extract aliases that appear VERBATIM in the original text. NEVER infer, derive, or fabricate names not present in the text. For example, seeing "John Smith" does NOT allow adding "Johnny", "Smithy", "Mr. Smith" unless those exact strings appear in the conversation.** - **🚨 Ownership distinction: By default, all names mentioned by the user belong to the user entity. Only assign a name to the AI/assistant entity when an explicit second-person naming expression (e.g., "I'll call you", "your name is") is present.** - - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user (e.g., "buddy", "VV", "boss") are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases.** + - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases. But only extract names that appear verbatim in the text, never infer.** * "I'm Alex, I'll call you Buddy" → User aliases=["Alex"], AI assistant aliases=["Buddy"] * "I'm vv" → User aliases=["vv"] (no AI-naming expression, name belongs to user) - * [speaker=assistant] "Sure thing, buddy/VV" → User aliases=["buddy", "VV"] (AI addressing the user, these are user's aliases) + * [speaker=assistant] "Sure thing, VV" → User aliases=["VV"] (AI addressing the user, "VV" appears in text) * [speaker=assistant] "I'm Jarvis" → AI assistant aliases=["Jarvis"] (AI self-introduction, this is AI's alias) * ❌ Wrong: putting "Buddy" in user's aliases ("Buddy" is a name for the AI, not the user) * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases - * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases + * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases + * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) {% endif %} @@ -161,7 +163,7 @@ Extract entities and knowledge triplets from the given statement. - **🚨 "你不叫X了"/"你不叫X,你叫Y" 句式:X 和 Y 都是 AI 的名字(旧名和新名),绝对不是用户的名字。因为句子主语是"你"(AI)。** - **以下情况名字归属用户,不是给 AI 取名:**「我叫」「我的名字是」「叫我」「我是」「大家叫我」「我的英文名是」「我的昵称是」等**第一人称(我)的自我介绍句式** - **🚨 speaker=assistant 时的特殊规则:** - * AI 用来称呼用户的名字(如"远哥""VV""思远大人")→ 归入**用户**实体的 aliases + * AI 用来称呼用户的名字 → 归入**用户**实体的 aliases(但必须是原文中逐字出现的称呼,不能推测) * AI 自称的名字(如"我叫陈仔""我是你的助手")→ 归入**AI助手**实体的 aliases * 判断依据:AI 说"你叫X"或用 X 称呼用户 → X 是用户别名;AI 说"我叫X"或"我是X" → X 是 AI 别名 - 示例: @@ -170,11 +172,12 @@ Extract entities and knowledge triplets from the given statement. * "我叫陈思远,我给AI取名为远仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["远仔"] * "叫你小助,我自己叫老王" → 用户实体: name="用户", aliases=["老王"];AI实体: name="AI助手", aliases=["小助"] * "你不叫远仔了,你现在叫陈仔" → AI实体: name="AI助手", aliases=["陈仔"]("远仔"是AI旧名,"陈仔"是AI新名,都归AI。不要把"远仔"或"陈仔"放入用户的aliases) - * [speaker=assistant] "好的远哥/VV/思远大人,今天想干点啥?" → 用户实体: name="用户", aliases=["远哥", "VV", "思远大人"](AI 在称呼用户) - * [speaker=assistant] "你叫陈思远(或VV),我叫陈仔" → 用户实体: name="用户", aliases=["陈思远", "VV"];AI实体: name="AI助手", aliases=["陈仔"] + * [speaker=assistant] "好的VV,今天想干点啥?" → 用户实体: name="用户", aliases=["VV"](AI 在称呼用户,原文中出现了"VV") + * [speaker=assistant] "你叫陈思远,我叫陈仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["陈仔"] * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases(没有任何给 AI 取名的表达) - * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases + * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases * ❌ 错误:aliases=["陈思远", "远仔"]("远仔"是给AI取的名字,不是用户的名字) + * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) {% else %} - **🚨 Default rule: If there is NO explicit AI/assistant naming expression in the conversation, ALL names belong to the user entity. Do NOT guess or infer that a name is for the AI.** - Only create an AI/assistant entity when the user **explicitly** names the AI/assistant @@ -185,7 +188,7 @@ Extract entities and knowledge triplets from the given statement. - **🚨 "You're not called X anymore" / "You're not X, you're Y" pattern: BOTH X and Y are AI's names (old and new). They are NOT user's names. The subject is "you" (the AI).** - **These patterns mean the name belongs to the USER, NOT the AI:** "I'm", "my name is", "call me", "I am", "people call me", "my English name is", "my nickname is", etc. — **first-person ("I"/"me") self-introduction patterns** - **🚨 Special rules when speaker=assistant:** - * Names the AI uses to address the user (e.g., "buddy", "VV", "boss") → belong to the **user** entity's aliases + * Names the AI uses to address the user → belong to the **user** entity's aliases (but only extract names that appear verbatim in the text, never infer) * Names the AI uses for itself (e.g., "I'm Jarvis", "I am your assistant") → belong to the **AI assistant** entity's aliases * Rule: AI says "you are X" or calls user X → X is user's alias; AI says "I'm X" or "I am X" → X is AI's alias - Examples: @@ -194,11 +197,12 @@ Extract entities and knowledge triplets from the given statement. * "I'm Alex, I'll call you Buddy" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Buddy"] * "Call yourself Jarvis, my name is Tony" → User entity: name="User", aliases=["Tony"]; AI entity: name="AI Assistant", aliases=["Jarvis"] * "You're not called Jarvis anymore, your new name is Friday" → AI entity: name="AI Assistant", aliases=["Friday"] (both "Jarvis" and "Friday" are AI names, NOT user names) - * [speaker=assistant] "Sure thing, buddy/VV!" → User entity: name="User", aliases=["buddy", "VV"] (AI addressing the user) + * [speaker=assistant] "Sure thing, VV" → User entity: name="User", aliases=["VV"] (AI addressing the user, "VV" appears in text) * [speaker=assistant] "You're Alex, and I'm Jarvis" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Jarvis"] * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases (no AI-naming expression exists) - * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases + * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases * ❌ Wrong: aliases=["Alex", "Buddy"] ("Buddy" is a name for the AI, not the user) + * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) {% endif %} 5. **ALIASES ORDER:**