fix(memory): prevent cross-role alias contamination in entity dedup

- Extract user aliases from raw dialog statements instead of post-dedup
  entities to bypass merge pollution
- Add alias cross-cleaning step in _normalize_special_entity_names to
  strip AI assistant aliases from user entities before dedup
- Call clean_cross_role_aliases after second-layer dedup to handle
  historical dirty data merged from Neo4j
- Fix syntax error in prompt_utils.py (ontology_types variable assignment)
This commit is contained in:
lanceyq
2026-04-03 12:34:04 +08:00
parent 7890970a39
commit 9cc19047b4
4 changed files with 62 additions and 28 deletions

View File

@@ -269,6 +269,22 @@ def _normalize_special_entity_names(
ent.name = _CANONICAL_ASSISTANT_NAME
ent.entity_type = _CANONICAL_ASSISTANT_TYPE
# 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除
# 防止 LLM 把 AI 的名字错误放入用户实体的 aliases
assistant_alias_set = set()
for ent in entity_nodes:
if _is_assistant_entity(ent):
for alias in (getattr(ent, "aliases", []) or []):
assistant_alias_set.add(alias.strip().lower())
if assistant_alias_set:
for ent in entity_nodes:
if _is_user_entity(ent):
original_aliases = getattr(ent, "aliases", []) or []
cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set]
if len(cleaned) < len(original_aliases):
ent.aliases = cleaned
def clean_cross_role_aliases(
entity_nodes: List[ExtractedEntityNode],

View File

@@ -15,6 +15,7 @@ from app.core.memory.models.message_models import DialogData
from app.core.memory.models.variate_config import ExtractionPipelineConfig
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
deduplicate_entities_and_edges,
clean_cross_role_aliases,
)
from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import (
second_layer_dedup_and_merge_with_neo4j,
@@ -100,6 +101,10 @@ async def dedup_layers_and_merge_and_return(
except Exception as e:
print(f"Second-layer dedup failed: {e}")
# 第二层去重后,清洗用户/AI助手之间的别名交叉污染
# 第二层从 Neo4j 合并了旧实体,可能带入历史脏数据
clean_cross_role_aliases(fused_entity_nodes)
return (
dialogue_nodes,
chunk_nodes,

View File

@@ -1367,7 +1367,7 @@ class ExtractionOrchestrator:
return
# 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序)
current_aliases = self._extract_current_aliases(entity_nodes)
current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list)
# 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源
# (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中)
@@ -1459,45 +1459,58 @@ class ExtractionOrchestrator:
# 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中
USER_PLACEHOLDER_NAMES = {'用户', '', 'User', 'I'}
def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]:
"""从实体节点提取用户别名(保持 LLM 提取的原始顺序,不进行任何排序)
def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]:
"""从用户发言的原始实体中提取别名(绕过去重污染)
这个方法直接返回 LLM 提取的别名列表,并过滤掉:
1. 占位名称("用户"、"我"、"User"、"I")
2. AI 助手实体的别名(防止 AI 的名字被错误归入用户别名)
第一个别名将被用作 other_name。
策略
1. 从 dialog_data_list 中找到 speaker="user" 的 statement
2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases
3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响
Args:
entity_nodes: 实体节点列表
entity_nodes: 去重后的实体节点列表(备用)
dialog_data_list: 对话数据列表(优先使用)
Returns:
别名列表(保持 LLM 提取的原始顺序,已过滤占位名称和 AI 别名)
别名列表(保持原始顺序,已过滤)
"""
# 先收集 AI 助手实体的所有别名(用于排除)
assistant_names = set()
ASSISTANT_PLACEHOLDER_NAMES = {"AI助手", "助手", "AI Assistant", "Assistant"}
for entity in entity_nodes:
ent_name = getattr(entity, 'name', '').strip()
if ent_name in ASSISTANT_PLACEHOLDER_NAMES:
for alias in (getattr(entity, 'aliases', []) or []):
assistant_names.add(alias.strip().lower())
# AI 助手的 name 本身也加入排除集
assistant_names.add(ent_name.lower())
# 提取用户实体的别名,排除占位名称和 AI 助手别名
# 优先从原始 dialog_data_list 中提取(绕过去重污染)
if dialog_data_list:
all_user_aliases = []
seen_lower = set()
for dialog in dialog_data_list:
for chunk in dialog.chunks:
speaker = getattr(chunk, 'speaker', None)
for statement in chunk.statements:
stmt_speaker = getattr(statement, 'speaker', None) or speaker
if stmt_speaker != "user":
continue
triplet_info = getattr(statement, 'triplet_extraction_info', None)
if not triplet_info:
continue
for entity in (triplet_info.entities or []):
ent_name = getattr(entity, 'name', '').strip()
if ent_name in self.USER_PLACEHOLDER_NAMES:
for alias in (getattr(entity, 'aliases', []) or []):
a = alias.strip()
if a and a not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower:
all_user_aliases.append(a)
seen_lower.add(a.lower())
if all_user_aliases:
logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}")
return all_user_aliases
# 兜底:从去重后的 entity_nodes 提取(旧逻辑)
for entity in entity_nodes:
if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES:
aliases = getattr(entity, 'aliases', []) or []
filtered = [
a for a in aliases
if a.strip() not in self.USER_PLACEHOLDER_NAMES
and a.strip().lower() not in assistant_names
]
logger.debug(f"提取到用户别名已过滤占位名称和AI别名: {filtered}")
if assistant_names:
logger.debug(f"已排除的AI助手别名: {assistant_names}")
return filtered
if filtered:
logger.debug(f"从去重后实体提取到别名(兜底): {filtered}")
return filtered
return []

View File

@@ -225,7 +225,7 @@ async def render_triplet_extraction_prompt(
template = prompt_env.get_template("extract_triplet.jinja2")
# 准备本体类型数据
ontology_types: OntologyTypeList | None = None,
ontology_type_section = None
ontology_type_names = []
type_hierarchy_hints = []
if ontology_types and ontology_types.types: