fix(memory): prevent cross-role alias contamination in entity dedup
- Extract user aliases from raw dialog statements instead of post-dedup entities to bypass merge pollution
- Add alias cross-cleaning step in _normalize_special_entity_names to strip AI assistant aliases from user entities before dedup
- Call clean_cross_role_aliases after second-layer dedup to handle historical dirty data merged from Neo4j
- Fix syntax error in prompt_utils.py (ontology_types variable assignment)
This commit is contained in:
@@ -269,6 +269,22 @@ def _normalize_special_entity_names(
|
|||||||
ent.name = _CANONICAL_ASSISTANT_NAME
|
ent.name = _CANONICAL_ASSISTANT_NAME
|
||||||
ent.entity_type = _CANONICAL_ASSISTANT_TYPE
|
ent.entity_type = _CANONICAL_ASSISTANT_TYPE
|
||||||
|
|
||||||
|
# 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除
|
||||||
|
# 防止 LLM 把 AI 的名字错误放入用户实体的 aliases
|
||||||
|
assistant_alias_set = set()
|
||||||
|
for ent in entity_nodes:
|
||||||
|
if _is_assistant_entity(ent):
|
||||||
|
for alias in (getattr(ent, "aliases", []) or []):
|
||||||
|
assistant_alias_set.add(alias.strip().lower())
|
||||||
|
|
||||||
|
if assistant_alias_set:
|
||||||
|
for ent in entity_nodes:
|
||||||
|
if _is_user_entity(ent):
|
||||||
|
original_aliases = getattr(ent, "aliases", []) or []
|
||||||
|
cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set]
|
||||||
|
if len(cleaned) < len(original_aliases):
|
||||||
|
ent.aliases = cleaned
|
||||||
|
|
||||||
|
|
||||||
def clean_cross_role_aliases(
|
def clean_cross_role_aliases(
|
||||||
entity_nodes: List[ExtractedEntityNode],
|
entity_nodes: List[ExtractedEntityNode],
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from app.core.memory.models.message_models import DialogData
|
|||||||
from app.core.memory.models.variate_config import ExtractionPipelineConfig
|
from app.core.memory.models.variate_config import ExtractionPipelineConfig
|
||||||
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
|
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
|
||||||
deduplicate_entities_and_edges,
|
deduplicate_entities_and_edges,
|
||||||
|
clean_cross_role_aliases,
|
||||||
)
|
)
|
||||||
from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import (
|
from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import (
|
||||||
second_layer_dedup_and_merge_with_neo4j,
|
second_layer_dedup_and_merge_with_neo4j,
|
||||||
@@ -100,6 +101,10 @@ async def dedup_layers_and_merge_and_return(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Second-layer dedup failed: {e}")
|
print(f"Second-layer dedup failed: {e}")
|
||||||
|
|
||||||
|
# 第二层去重后,清洗用户/AI助手之间的别名交叉污染
|
||||||
|
# 第二层从 Neo4j 合并了旧实体,可能带入历史脏数据
|
||||||
|
clean_cross_role_aliases(fused_entity_nodes)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
dialogue_nodes,
|
dialogue_nodes,
|
||||||
chunk_nodes,
|
chunk_nodes,
|
||||||
|
|||||||
@@ -1367,7 +1367,7 @@ class ExtractionOrchestrator:
|
|||||||
return
|
return
|
||||||
|
|
||||||
# 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序)
|
# 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序)
|
||||||
current_aliases = self._extract_current_aliases(entity_nodes)
|
current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list)
|
||||||
|
|
||||||
# 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源
|
# 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源
|
||||||
# (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中)
|
# (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中)
|
||||||
@@ -1459,45 +1459,58 @@ class ExtractionOrchestrator:
|
|||||||
# 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中
|
# 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中
|
||||||
USER_PLACEHOLDER_NAMES = {'用户', '我', 'User', 'I'}
|
USER_PLACEHOLDER_NAMES = {'用户', '我', 'User', 'I'}
|
||||||
|
|
||||||
def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]:
|
def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]:
|
||||||
"""从实体节点提取用户别名(保持 LLM 提取的原始顺序,不进行任何排序)
|
"""从用户发言的原始实体中提取别名(绕过去重污染)
|
||||||
|
|
||||||
这个方法直接返回 LLM 提取的别名列表,并过滤掉:
|
策略:
|
||||||
1. 占位名称("用户"、"我"、"User"、"I")
|
1. 从 dialog_data_list 中找到 speaker="user" 的 statement
|
||||||
2. AI 助手实体的别名(防止 AI 的名字被错误归入用户别名)
|
2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases
|
||||||
|
3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响
|
||||||
第一个别名将被用作 other_name。
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
entity_nodes: 实体节点列表
|
entity_nodes: 去重后的实体节点列表(备用)
|
||||||
|
dialog_data_list: 对话数据列表(优先使用)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
别名列表(保持 LLM 提取的原始顺序,已过滤占位名称和 AI 别名)
|
别名列表(保持原始顺序,已过滤)
|
||||||
"""
|
"""
|
||||||
# 先收集 AI 助手实体的所有别名(用于排除)
|
# 优先从原始 dialog_data_list 中提取(绕过去重污染)
|
||||||
assistant_names = set()
|
if dialog_data_list:
|
||||||
ASSISTANT_PLACEHOLDER_NAMES = {"AI助手", "助手", "AI Assistant", "Assistant"}
|
all_user_aliases = []
|
||||||
for entity in entity_nodes:
|
seen_lower = set()
|
||||||
ent_name = getattr(entity, 'name', '').strip()
|
for dialog in dialog_data_list:
|
||||||
if ent_name in ASSISTANT_PLACEHOLDER_NAMES:
|
for chunk in dialog.chunks:
|
||||||
for alias in (getattr(entity, 'aliases', []) or []):
|
speaker = getattr(chunk, 'speaker', None)
|
||||||
assistant_names.add(alias.strip().lower())
|
for statement in chunk.statements:
|
||||||
# AI 助手的 name 本身也加入排除集
|
stmt_speaker = getattr(statement, 'speaker', None) or speaker
|
||||||
assistant_names.add(ent_name.lower())
|
if stmt_speaker != "user":
|
||||||
|
continue
|
||||||
# 提取用户实体的别名,排除占位名称和 AI 助手别名
|
triplet_info = getattr(statement, 'triplet_extraction_info', None)
|
||||||
|
if not triplet_info:
|
||||||
|
continue
|
||||||
|
for entity in (triplet_info.entities or []):
|
||||||
|
ent_name = getattr(entity, 'name', '').strip()
|
||||||
|
if ent_name in self.USER_PLACEHOLDER_NAMES:
|
||||||
|
for alias in (getattr(entity, 'aliases', []) or []):
|
||||||
|
a = alias.strip()
|
||||||
|
if a and a not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower:
|
||||||
|
all_user_aliases.append(a)
|
||||||
|
seen_lower.add(a.lower())
|
||||||
|
if all_user_aliases:
|
||||||
|
logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}")
|
||||||
|
return all_user_aliases
|
||||||
|
|
||||||
|
# 兜底:从去重后的 entity_nodes 提取(旧逻辑)
|
||||||
for entity in entity_nodes:
|
for entity in entity_nodes:
|
||||||
if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES:
|
if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES:
|
||||||
aliases = getattr(entity, 'aliases', []) or []
|
aliases = getattr(entity, 'aliases', []) or []
|
||||||
filtered = [
|
filtered = [
|
||||||
a for a in aliases
|
a for a in aliases
|
||||||
if a.strip() not in self.USER_PLACEHOLDER_NAMES
|
if a.strip() not in self.USER_PLACEHOLDER_NAMES
|
||||||
and a.strip().lower() not in assistant_names
|
|
||||||
]
|
]
|
||||||
logger.debug(f"提取到用户别名(已过滤占位名称和AI别名): {filtered}")
|
if filtered:
|
||||||
if assistant_names:
|
logger.debug(f"从去重后实体提取到别名(兜底): {filtered}")
|
||||||
logger.debug(f"已排除的AI助手别名: {assistant_names}")
|
return filtered
|
||||||
return filtered
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -225,7 +225,7 @@ async def render_triplet_extraction_prompt(
|
|||||||
template = prompt_env.get_template("extract_triplet.jinja2")
|
template = prompt_env.get_template("extract_triplet.jinja2")
|
||||||
|
|
||||||
# 准备本体类型数据
|
# 准备本体类型数据
|
||||||
ontology_types: OntologyTypeList | None = None,
|
ontology_type_section = None
|
||||||
ontology_type_names = []
|
ontology_type_names = []
|
||||||
type_hierarchy_hints = []
|
type_hierarchy_hints = []
|
||||||
if ontology_types and ontology_types.types:
|
if ontology_types and ontology_types.types:
|
||||||
|
|||||||
Reference in New Issue
Block a user