diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index ae906aa8..1401fe5c 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -269,6 +269,22 @@ def _normalize_special_entity_names( ent.name = _CANONICAL_ASSISTANT_NAME ent.entity_type = _CANONICAL_ASSISTANT_TYPE + # 第二步:收集 AI 助手实体的所有别名,从用户实体的 aliases 中排除 + # 防止 LLM 把 AI 的名字错误放入用户实体的 aliases + assistant_alias_set = set() + for ent in entity_nodes: + if _is_assistant_entity(ent): + for alias in (getattr(ent, "aliases", []) or []): + assistant_alias_set.add(alias.strip().lower()) + + if assistant_alias_set: + for ent in entity_nodes: + if _is_user_entity(ent): + original_aliases = getattr(ent, "aliases", []) or [] + cleaned = [a for a in original_aliases if a.strip().lower() not in assistant_alias_set] + if len(cleaned) < len(original_aliases): + ent.aliases = cleaned + def clean_cross_role_aliases( entity_nodes: List[ExtractedEntityNode], diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py index 4b9c5718..13534b3d 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/two_stage_dedup.py @@ -15,6 +15,7 @@ from app.core.memory.models.message_models import DialogData from app.core.memory.models.variate_config import ExtractionPipelineConfig from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( deduplicate_entities_and_edges, + clean_cross_role_aliases, ) from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import ( second_layer_dedup_and_merge_with_neo4j, @@ -100,6 +101,10 @@ async def dedup_layers_and_merge_and_return( except Exception as e: print(f"Second-layer dedup failed: {e}") + # 第二层去重后,清洗用户/AI助手之间的别名交叉污染 + # 第二层从 Neo4j 合并了旧实体,可能带入历史脏数据 + clean_cross_role_aliases(fused_entity_nodes) + return ( dialogue_nodes, chunk_nodes, diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py index 225852d6..bbf06e37 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py @@ -1367,7 +1367,7 @@ class ExtractionOrchestrator: return # 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) - current_aliases = self._extract_current_aliases(entity_nodes) + current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list) # 1.5 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) @@ -1459,45 +1459,58 @@ class ExtractionOrchestrator: # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 USER_PLACEHOLDER_NAMES = {'用户', '我', 'User', 'I'} - def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: - """从实体节点提取用户别名(保持 LLM 提取的原始顺序,不进行任何排序) + def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: + """从用户发言的原始实体中提取别名(绕过去重污染) - 这个方法直接返回 LLM 提取的别名列表,并过滤掉: - 1. 占位名称("用户"、"我"、"User"、"I") - 2. AI 助手实体的别名(防止 AI 的名字被错误归入用户别名) - - 第一个别名将被用作 other_name。 + 策略: + 1. 从 dialog_data_list 中找到 speaker="user" 的 statement + 2. 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases + 3. 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响 Args: - entity_nodes: 实体节点列表 + entity_nodes: 去重后的实体节点列表(备用) + dialog_data_list: 对话数据列表(优先使用) Returns: - 别名列表(保持 LLM 提取的原始顺序,已过滤占位名称和 AI 别名) + 别名列表(保持原始顺序,已过滤) """ - # 先收集 AI 助手实体的所有别名(用于排除) - assistant_names = set() - ASSISTANT_PLACEHOLDER_NAMES = {"AI助手", "助手", "AI Assistant", "Assistant"} - for entity in entity_nodes: - ent_name = getattr(entity, 'name', '').strip() - if ent_name in ASSISTANT_PLACEHOLDER_NAMES: - for alias in (getattr(entity, 'aliases', []) or []): - assistant_names.add(alias.strip().lower()) - # AI 助手的 name 本身也加入排除集 - assistant_names.add(ent_name.lower()) - - # 提取用户实体的别名,排除占位名称和 AI 助手别名 + # 优先从原始 dialog_data_list 中提取(绕过去重污染) + if dialog_data_list: + all_user_aliases = [] + seen_lower = set() + for dialog in dialog_data_list: + for chunk in dialog.chunks: + speaker = getattr(chunk, 'speaker', None) + for statement in chunk.statements: + stmt_speaker = getattr(statement, 'speaker', None) or speaker + if stmt_speaker != "user": + continue + triplet_info = getattr(statement, 'triplet_extraction_info', None) + if not triplet_info: + continue + for entity in (triplet_info.entities or []): + ent_name = getattr(entity, 'name', '').strip() + if ent_name in self.USER_PLACEHOLDER_NAMES: + for alias in (getattr(entity, 'aliases', []) or []): + a = alias.strip() + if a and a not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: + all_user_aliases.append(a) + seen_lower.add(a.lower()) + if all_user_aliases: + logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") + return all_user_aliases + + # 兜底:从去重后的 entity_nodes 提取(旧逻辑) for entity in entity_nodes: if getattr(entity, 'name', '').strip() in self.USER_PLACEHOLDER_NAMES: aliases = getattr(entity, 'aliases', []) or [] filtered = [ a for a in aliases if a.strip() not in self.USER_PLACEHOLDER_NAMES - and a.strip().lower() not in assistant_names ] - logger.debug(f"提取到用户别名(已过滤占位名称和AI别名): {filtered}") - if assistant_names: - logger.debug(f"已排除的AI助手别名: {assistant_names}") - return filtered + if filtered: + logger.debug(f"从去重后实体提取到别名(兜底): {filtered}") + return filtered return [] diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py index 8d964eaf..a1ad885e 100644 --- a/api/app/core/memory/utils/prompt/prompt_utils.py +++ b/api/app/core/memory/utils/prompt/prompt_utils.py @@ -225,7 +225,7 @@ async def render_triplet_extraction_prompt( template = prompt_env.get_template("extract_triplet.jinja2") # 准备本体类型数据 - ontology_types: OntologyTypeList | None = None, + ontology_type_section = None ontology_type_names = [] type_hierarchy_hints = [] if ontology_types and ontology_types.types: