From c4ff1a325bb048b076fa86306542c8606569b0a2 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 3 Apr 2026 14:38:55 +0800 Subject: [PATCH] refactor(memory): harden alias extraction and sync PgSQL with Neo4j deduped aliases - Strengthen anti-hallucination rules in extract_triplet prompt to enforce verbatim-only alias extraction, removing suggestive examples - Add _extract_deduped_entity_aliases to sync historical aliases from Neo4j two-stage dedup into PgSQL end_user_info - Remove unused _fetch_neo4j_user_aliases; reuse injected connector instead of instantiating new Neo4jConnector - Simplify _would_merge_cross_role and reuse clean_cross_role_aliases in _normalize_special_entity_names - Reuse _USER_PLACEHOLDER_NAMES from dedup module to avoid duplication --- .../deduplication/deduped_and_disamb.py | 45 ++---- .../extraction_orchestrator.py | 151 ++++++++++-------- .../prompt/prompts/extract_triplet.jinja2 | 38 +++-- 3 files changed, 117 insertions(+), 117 deletions(-) diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index f2ad2ae9..7e0976fe 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -4,6 +4,7 @@ import asyncio import difflib # 提供字符串相似度计算工具 import importlib +import logging import os import re from datetime import datetime @@ -16,6 +17,8 @@ from app.core.memory.models.graph_models import ( ) from app.core.memory.models.variate_config import DedupConfig +logger = logging.getLogger(__name__) + # 模块级类型统一工具函数 def _unify_entity_type(canonical: ExtractedEntityNode, losing: ExtractedEntityNode, suggested_type: str = None) -> None: @@ -233,15 +236,10 @@ def _would_merge_cross_role(a: ExtractedEntityNode, b: ExtractedEntityNode) -> b 用户实体和AI助手实体永远不应该被合并在一起。 如果一方是用户实体、另一方是AI助手实体,返回 True(阻止合并)。 """ - a_is_user = _is_user_entity(a) - a_is_assistant = _is_assistant_entity(a) - b_is_user = _is_user_entity(b) - b_is_assistant = _is_assistant_entity(b) - - # 用户 + AI助手 → 阻止 - if (a_is_user and b_is_assistant) or (a_is_assistant and b_is_user): - return True - return False + return ( + (_is_user_entity(a) and _is_assistant_entity(b)) + or (_is_assistant_entity(a) and _is_user_entity(b)) + ) def _normalize_special_entity_names( @@ -269,21 +267,8 @@ def _normalize_special_entity_names( ent.name = _CANONICAL_ASSISTANT_NAME ent.entity_type = _CANONICAL_ASSISTANT_TYPE - # 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除 - # 防止 LLM 把 AI 的名字错误放入用户实体的 aliases - assistant_alias_set = set() - for ent in entity_nodes: - if _is_assistant_entity(ent): - for alias in (getattr(ent, "aliases", []) or []): - assistant_alias_set.add(alias.strip().lower()) - - if assistant_alias_set: - for ent in entity_nodes: - if _is_user_entity(ent): - original_aliases = getattr(ent, "aliases", []) or [] - cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set] - if len(cleaned) < len(original_aliases): - ent.aliases = cleaned + # 第二步:清洗用户/AI助手之间的别名交叉污染(复用 clean_cross_role_aliases) + clean_cross_role_aliases(entity_nodes) async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> set: @@ -299,15 +284,9 @@ async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> se Returns: 小写归一化后的助手别名集合 """ - import logging - logger = logging.getLogger(__name__) - - # 使用模块级 _ASSISTANT_PLACEHOLDER_NAMES 的标题化形式构建查询名称列表, - # 保持与 _normalize_special_entity_names 标准化后的名称一致 - query_names = [_CANONICAL_ASSISTANT_NAME] # "AI助手" - # 补充英文常见变体 - query_names.extend(["助手", "AI Assistant", "Assistant"]) - # 去重 + # 查询名称列表:规范名称 + 常见变体(与 _normalize_special_entity_names 标准化后一致) + query_names = [_CANONICAL_ASSISTANT_NAME, *_ASSISTANT_PLACEHOLDER_NAMES] + # 去重保序 query_names = list(dict.fromkeys(query_names)) cypher = """ diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index a31099ab..3229674d 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -44,6 +44,10 @@ from app.core.memory.models.variate_config import ( from app.core.memory.storage_services.extraction_engine.deduplication.two_stage_dedup import ( dedup_layers_and_merge_and_return, ) +from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( + _USER_PLACEHOLDER_NAMES, + fetch_neo4j_assistant_aliases, +) from app.core.memory.storage_services.extraction_engine.knowledge_extraction.embedding_generation import ( embedding_generation, generate_entity_embeddings_from_triplets, @@ -1348,12 +1352,13 @@ class ExtractionOrchestrator: 策略: 1. 从内存中的 entity_nodes 提取本轮用户别名(current_aliases) - 2. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) - 3. 合并 db_aliases + current_aliases,去重保序 - 4. 写回 PgSQL + 2. 从去重后的 entity_nodes 中提取完整别名(含 Neo4j 二层去重合并的历史别名) + 3. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) + 4. 合并 db_aliases + deduped_aliases + current_aliases,去重保序 + 5. 写回 PgSQL Args: - entity_nodes: 去重后的实体节点列表(内存中) + entity_nodes: 去重后的实体节点列表(内存中,含二层去重合并结果) dialog_data_list: 对话数据列表 """ try: @@ -1369,7 +1374,12 @@ class ExtractionOrchestrator: # 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list) - # 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 + # 1.5 从去重后的 entity_nodes 中提取完整别名 + # 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 中, + # 这里提取出来确保 PgSQL 与 Neo4j 的别名保持同步 + deduped_aliases = self._extract_deduped_entity_aliases(entity_nodes) + + # 1.6 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id) if neo4j_assistant_aliases: @@ -1380,12 +1390,19 @@ class ExtractionOrchestrator: ] if len(current_aliases) < before_count: logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名") + # 同样过滤 deduped_aliases + deduped_aliases = [ + a for a in deduped_aliases + if a.strip().lower() not in neo4j_assistant_aliases + ] - if not current_aliases: + if not current_aliases and not deduped_aliases: logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}") return logger.info(f"本轮对话提取的 aliases: {current_aliases}") + if deduped_aliases: + logger.info(f"去重后实体的完整 aliases(含历史): {deduped_aliases}") # 2. 同步到数据库 end_user_uuid = uuid.UUID(end_user_id) @@ -1402,9 +1419,15 @@ class ExtractionOrchestrator: # 过滤掉占位名称 db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] - # 合并:已有 + 本轮新增,去重保序 + # 合并:已有 + 去重后完整别名 + 本轮新增,去重保序 merged_aliases = list(db_aliases) seen_lower = {a.strip().lower() for a in merged_aliases} + # 先合并去重后实体的完整别名(含 Neo4j 历史别名) + for alias in deduped_aliases: + if alias.strip().lower() not in seen_lower: + merged_aliases.append(alias) + seen_lower.add(alias.strip().lower()) + # 再合并本轮新提取的别名 for alias in current_aliases: if alias.strip().lower() not in seen_lower: merged_aliases.append(alias) @@ -1438,7 +1461,9 @@ class ExtractionOrchestrator: info.aliases = merged_aliases logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}") else: - first_alias = current_aliases[0].strip() if current_aliases else "" + first_alias = current_aliases[0].strip() if current_aliases else ( + deduped_aliases[0].strip() if deduped_aliases else "" + ) # 确保 first_alias 不是占位名称 if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES: db.add(EndUserInfo( @@ -1457,50 +1482,67 @@ class ExtractionOrchestrator: # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 - USER_PLACEHOLDER_NAMES = {'用户', '我', 'user', 'i'} + # 复用 deduped_and_disamb 模块级常量,避免重复维护 + USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: - """从用户发言的原始实体中提取别名(绕过去重污染) + """从用户发言的原始实体中提取本轮新增别名(绕过去重污染) 策略: - 1. 从 dialog_data_list 中找到 speaker="user" 的 statement - 2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases - 3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响 + 仅从 dialog_data_list 中找到 speaker="user" 的 statement, + 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases。 + 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响。 + + 注意:不再使用去重后 entity_nodes 作为兜底,因为二层去重会将 Neo4j 历史别名 + 合并进来,导致历史别名被误认为"本轮提取"。历史别名的同步由 + _extract_deduped_entity_aliases 负责。 Args: - entity_nodes: 去重后的实体节点列表(备用) - dialog_data_list: 对话数据列表(优先使用) + entity_nodes: 去重后的实体节点列表(未使用,保留参数兼容性) + dialog_data_list: 对话数据列表 Returns: 别名列表(保持原始顺序,已过滤) """ - # 优先从原始 dialog_data_list 中提取(绕过去重污染) - if dialog_data_list: - all_user_aliases = [] - seen_lower = set() - for dialog in dialog_data_list: - for chunk in dialog.chunks: - speaker = getattr(chunk, 'speaker', None) - for statement in chunk.statements: - stmt_speaker = getattr(statement, 'speaker', None) or speaker - if stmt_speaker != "user": - continue - triplet_info = getattr(statement, 'triplet_extraction_info', None) - if not triplet_info: - continue - for entity in (triplet_info.entities or []): - ent_name = getattr(entity, 'name', '').strip() - if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: - for alias in (getattr(entity, 'aliases', []) or []): - a = alias.strip() - if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: - all_user_aliases.append(a) - seen_lower.add(a.lower()) - if all_user_aliases: - logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") - return all_user_aliases + if not dialog_data_list: + return [] - # 兜底:从去重后的 entity_nodes 提取(旧逻辑) + all_user_aliases = [] + seen_lower = set() + for dialog in dialog_data_list: + for chunk in dialog.chunks: + speaker = getattr(chunk, 'speaker', None) + for statement in chunk.statements: + stmt_speaker = getattr(statement, 'speaker', None) or speaker + if stmt_speaker != "user": + continue + triplet_info = getattr(statement, 'triplet_extraction_info', None) + if not triplet_info: + continue + for entity in (triplet_info.entities or []): + ent_name = getattr(entity, 'name', '').strip() + if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: + for alias in (getattr(entity, 'aliases', []) or []): + a = alias.strip() + if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: + all_user_aliases.append(a) + seen_lower.add(a.lower()) + if all_user_aliases: + logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") + return all_user_aliases + + def _extract_deduped_entity_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: + """从去重后的用户实体中提取完整别名列表。 + + 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 的用户实体中, + 因此这里提取到的别名包含了历史积累的所有别名,可用于同步到 PgSQL。 + + Args: + entity_nodes: 去重后的实体节点列表(含二层去重合并结果) + + Returns: + 别名列表(已过滤占位名称,去重保序) + """ for entity in entity_nodes: if getattr(entity, 'name', '').strip().lower() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] @@ -1509,37 +1551,12 @@ class ExtractionOrchestrator: if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES ] if filtered: - logger.debug(f"从去重后实体提取到别名(兜底): {filtered}") return filtered return [] - - async def _fetch_neo4j_user_aliases(self, end_user_id: str) -> List[str]: - """从 Neo4j 查询用户实体的完整 aliases 列表(已过滤占位名称)""" - cypher = """ - MATCH (e:ExtractedEntity) - WHERE e.end_user_id = $end_user_id AND e.name IN ['用户', '我', 'User', 'I'] - RETURN e.aliases AS aliases - LIMIT 1 - """ - result = await Neo4jConnector().execute_query(cypher, end_user_id=end_user_id) - if not result: - logger.debug(f"Neo4j 中未找到用户实体: end_user_id={end_user_id}") - return [] - aliases = result[0].get('aliases') or [] - if not aliases: - logger.debug(f"Neo4j 用户实体 aliases 为空: end_user_id={end_user_id}") - return [] - # 过滤掉占位名称,防止历史脏数据传播 - filtered = [a for a in aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] - return filtered - async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set: """从 Neo4j 查询 AI 助手实体的所有别名(用于从用户别名中排除)""" - from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( - fetch_neo4j_assistant_aliases, - ) - return await fetch_neo4j_assistant_aliases(Neo4jConnector(), end_user_id) + return await fetch_neo4j_assistant_aliases(self.connector, end_user_id) def _resolve_other_name( self, diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index e7daf0bd..7ded48a4 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -27,9 +27,9 @@ Extract entities and knowledge triplets from the given statement. **Speaker:** {{ speaker }} {% if speaker == "assistant" %} {% if language == "zh" %} -⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中提到的名字(如称呼用户为"远哥""VV"等)是**用户的别名**,不是 AI 助手的别名。请注意区分说话人的视角。 +⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中用来称呼用户的名字是**用户的别名**,不是 AI 助手的别名。但只能提取原文中逐字出现的名字,严禁推测或创造原文中不存在的别名变体。 {% else %} -⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user (e.g., calling the user "buddy", "VV") are **user's aliases**, NOT the AI assistant's aliases. Pay attention to the speaker's perspective. +⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user are **user's aliases**, NOT the AI assistant's aliases. But only extract names that appear VERBATIM in the text — never infer or fabricate alias variants. {% endif %} {% endif %} {% endif %} @@ -97,16 +97,17 @@ Extract entities and knowledge triplets from the given statement. * "我叫张三,大家叫我小张" → aliases=["张三", "小张"](张三是第一个,将成为 other_name) * "大家叫我小李,我全名叫李明" → aliases=["小李", "李明"](小李先出现,将成为 other_name) - 空值:如果没有别名,使用 `[]` - - 重要:只提取本次对话中明确提到的别名,不要推测或添加未提及的名字 + - **🚨🚨🚨 严禁幻觉:只提取对话原文中逐字出现的别名,绝对不能推测、衍生或创造任何未在原文中出现的名字。例如,看到"陈思远"不能自行添加"思远大人""远哥""小远"等变体。如果原文没有这些字,就不能出现在 aliases 中。** - **🚨 归属区分:必须严格区分名称的归属对象。默认情况下,用户提到的名字归属用户实体。只有出现明确的第二人称命名表达(如"叫你""给你取名")时,才将名字归属 AI/助手实体。** - - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字(如"远哥""VV""思远大人")是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。** + - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。但同样只能提取原文中逐字出现的称呼,不能推测。** * "我叫陈思远,我给AI取名为远仔" → 用户 aliases=["陈思远"],AI助手 aliases=["远仔"] * "我叫vv" → 用户 aliases=["vv"](没有给AI取名的表达,名字归用户) - * [speaker=assistant] "好的,远哥/VV/思远大人" → 用户 aliases=["远哥", "VV", "思远大人"](AI 在称呼用户,这些是用户的别名) + * [speaker=assistant] "好的,VV" → 用户 aliases=["VV"](AI 在称呼用户,原文中出现了"VV") * [speaker=assistant] "我叫陈仔" → AI助手 aliases=["陈仔"](AI 在自我介绍,这是 AI 的别名) * ❌ 错误:将"远仔"放入用户的 aliases("远仔"是给AI取的名字,不是用户的名字) * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases - * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases + * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases + * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) {% else %} - Include: nicknames, full names, abbreviations, alternative names - Order: **The FIRST alias will be used as the user's primary display name (other_name). Put the most important/frequently used name FIRST** @@ -115,16 +116,17 @@ Extract entities and knowledge triplets from the given statement. * "I'm John, people call me Johnny" → aliases=["John", "Johnny"] (John is first, will become other_name) * "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name) - Empty: If no aliases, use `[]` - - Important: Only extract aliases explicitly mentioned in current conversation, do not infer or add unmentioned names + - **🚨🚨🚨 NO HALLUCINATION: Only extract aliases that appear VERBATIM in the original text. NEVER infer, derive, or fabricate names not present in the text. For example, seeing "John Smith" does NOT allow adding "Johnny", "Smithy", "Mr. Smith" unless those exact strings appear in the conversation.** - **🚨 Ownership distinction: By default, all names mentioned by the user belong to the user entity. Only assign a name to the AI/assistant entity when an explicit second-person naming expression (e.g., "I'll call you", "your name is") is present.** - - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user (e.g., "buddy", "VV", "boss") are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases.** + - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases. But only extract names that appear verbatim in the text, never infer.** * "I'm Alex, I'll call you Buddy" → User aliases=["Alex"], AI assistant aliases=["Buddy"] * "I'm vv" → User aliases=["vv"] (no AI-naming expression, name belongs to user) - * [speaker=assistant] "Sure thing, buddy/VV" → User aliases=["buddy", "VV"] (AI addressing the user, these are user's aliases) + * [speaker=assistant] "Sure thing, VV" → User aliases=["VV"] (AI addressing the user, "VV" appears in text) * [speaker=assistant] "I'm Jarvis" → AI assistant aliases=["Jarvis"] (AI self-introduction, this is AI's alias) * ❌ Wrong: putting "Buddy" in user's aliases ("Buddy" is a name for the AI, not the user) * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases - * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases + * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases + * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) {% endif %} @@ -161,7 +163,7 @@ Extract entities and knowledge triplets from the given statement. - **🚨 "你不叫X了"/"你不叫X,你叫Y" 句式:X 和 Y 都是 AI 的名字(旧名和新名),绝对不是用户的名字。因为句子主语是"你"(AI)。** - **以下情况名字归属用户,不是给 AI 取名:**「我叫」「我的名字是」「叫我」「我是」「大家叫我」「我的英文名是」「我的昵称是」等**第一人称(我)的自我介绍句式** - **🚨 speaker=assistant 时的特殊规则:** - * AI 用来称呼用户的名字(如"远哥""VV""思远大人")→ 归入**用户**实体的 aliases + * AI 用来称呼用户的名字 → 归入**用户**实体的 aliases(但必须是原文中逐字出现的称呼,不能推测) * AI 自称的名字(如"我叫陈仔""我是你的助手")→ 归入**AI助手**实体的 aliases * 判断依据:AI 说"你叫X"或用 X 称呼用户 → X 是用户别名;AI 说"我叫X"或"我是X" → X 是 AI 别名 - 示例: @@ -170,11 +172,12 @@ Extract entities and knowledge triplets from the given statement. * "我叫陈思远,我给AI取名为远仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["远仔"] * "叫你小助,我自己叫老王" → 用户实体: name="用户", aliases=["老王"];AI实体: name="AI助手", aliases=["小助"] * "你不叫远仔了,你现在叫陈仔" → AI实体: name="AI助手", aliases=["陈仔"]("远仔"是AI旧名,"陈仔"是AI新名,都归AI。不要把"远仔"或"陈仔"放入用户的aliases) - * [speaker=assistant] "好的远哥/VV/思远大人,今天想干点啥?" → 用户实体: name="用户", aliases=["远哥", "VV", "思远大人"](AI 在称呼用户) - * [speaker=assistant] "你叫陈思远(或VV),我叫陈仔" → 用户实体: name="用户", aliases=["陈思远", "VV"];AI实体: name="AI助手", aliases=["陈仔"] + * [speaker=assistant] "好的VV,今天想干点啥?" → 用户实体: name="用户", aliases=["VV"](AI 在称呼用户,原文中出现了"VV") + * [speaker=assistant] "你叫陈思远,我叫陈仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["陈仔"] * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases(没有任何给 AI 取名的表达) - * ❌ 错误:AI 称呼用户为"远哥",却把"远哥"放入 AI 助手的 aliases + * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases * ❌ 错误:aliases=["陈思远", "远仔"]("远仔"是给AI取的名字,不是用户的名字) + * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) {% else %} - **🚨 Default rule: If there is NO explicit AI/assistant naming expression in the conversation, ALL names belong to the user entity. Do NOT guess or infer that a name is for the AI.** - Only create an AI/assistant entity when the user **explicitly** names the AI/assistant @@ -185,7 +188,7 @@ Extract entities and knowledge triplets from the given statement. - **🚨 "You're not called X anymore" / "You're not X, you're Y" pattern: BOTH X and Y are AI's names (old and new). They are NOT user's names. The subject is "you" (the AI).** - **These patterns mean the name belongs to the USER, NOT the AI:** "I'm", "my name is", "call me", "I am", "people call me", "my English name is", "my nickname is", etc. — **first-person ("I"/"me") self-introduction patterns** - **🚨 Special rules when speaker=assistant:** - * Names the AI uses to address the user (e.g., "buddy", "VV", "boss") → belong to the **user** entity's aliases + * Names the AI uses to address the user → belong to the **user** entity's aliases (but only extract names that appear verbatim in the text, never infer) * Names the AI uses for itself (e.g., "I'm Jarvis", "I am your assistant") → belong to the **AI assistant** entity's aliases * Rule: AI says "you are X" or calls user X → X is user's alias; AI says "I'm X" or "I am X" → X is AI's alias - Examples: @@ -194,11 +197,12 @@ Extract entities and knowledge triplets from the given statement. * "I'm Alex, I'll call you Buddy" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Buddy"] * "Call yourself Jarvis, my name is Tony" → User entity: name="User", aliases=["Tony"]; AI entity: name="AI Assistant", aliases=["Jarvis"] * "You're not called Jarvis anymore, your new name is Friday" → AI entity: name="AI Assistant", aliases=["Friday"] (both "Jarvis" and "Friday" are AI names, NOT user names) - * [speaker=assistant] "Sure thing, buddy/VV!" → User entity: name="User", aliases=["buddy", "VV"] (AI addressing the user) + * [speaker=assistant] "Sure thing, VV" → User entity: name="User", aliases=["VV"] (AI addressing the user, "VV" appears in text) * [speaker=assistant] "You're Alex, and I'm Jarvis" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Jarvis"] * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases (no AI-naming expression exists) - * ❌ Wrong: AI calls user "buddy" but "buddy" is put in AI assistant's aliases + * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases * ❌ Wrong: aliases=["Alex", "Buddy"] ("Buddy" is a name for the AI, not the user) + * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) {% endif %} 5. **ALIASES ORDER:**