Merge pull request #784 from SuanmoSuanyangTechnology/fix/aliases-extract

feat(memory): prevent cross-role alias contamination between user and…
This commit is contained in:
Ke Sun
2026-04-03 15:26:31 +08:00
committed by GitHub
7 changed files with 490 additions and 69 deletions

View File

@@ -152,6 +152,24 @@ async def write(
# Step 3: Save all data to Neo4j database
step_start = time.time()
# Neo4j 写入前:清洗用户/AI助手实体之间的别名交叉污染
# 从 Neo4j 查询已有的 AI 助手别名,与本轮实体中的 AI 助手别名合并,
# 确保用户实体的 aliases 不包含 AI 助手的名字
try:
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
clean_cross_role_aliases,
fetch_neo4j_assistant_aliases,
)
neo4j_assistant_aliases = set()
if all_entity_nodes:
_eu_id = all_entity_nodes[0].end_user_id
if _eu_id:
neo4j_assistant_aliases = await fetch_neo4j_assistant_aliases(neo4j_connector, _eu_id)
clean_cross_role_aliases(all_entity_nodes, external_assistant_aliases=neo4j_assistant_aliases)
logger.info(f"Neo4j 写入前别名清洗完成AI助手别名排除集大小: {len(neo4j_assistant_aliases)}")
except Exception as e:
logger.warning(f"Neo4j 写入前别名清洗失败(不影响主流程): {e}")
# 添加死锁重试机制
max_retries = 3
retry_delay = 1 # 秒

View File

@@ -4,6 +4,7 @@
import asyncio
import difflib # 提供字符串相似度计算工具
import importlib
import logging
import os
import re
from datetime import datetime
@@ -16,6 +17,8 @@ from app.core.memory.models.graph_models import (
)
from app.core.memory.models.variate_config import DedupConfig
logger = logging.getLogger(__name__)
# 模块级类型统一工具函数
def _unify_entity_type(canonical: ExtractedEntityNode, losing: ExtractedEntityNode, suggested_type: str = None) -> None:
@@ -198,6 +201,161 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):
except Exception:
pass
# Placeholder names the extractor may emit for the end user and for the AI
# assistant. Entries are lower-case: every lookup against these sets must
# strip and lower-case the candidate first.
# The user set must not contain the empty string, otherwise every entity with
# a missing name would be classified as the user.
_USER_PLACEHOLDER_NAMES = {"用户", "我", "user", "i"}
_ASSISTANT_PLACEHOLDER_NAMES = {"ai助手", "助手", "人工智能助手", "智能助手", "智能体", "ai assistant", "assistant"}
# Canonical name and entity type written onto an entity after normalization.
_CANONICAL_USER_NAME = "用户"
_CANONICAL_USER_TYPE = "用户"
_CANONICAL_ASSISTANT_NAME = "AI助手"
_CANONICAL_ASSISTANT_TYPE = "Agent"
# All names that mark an entity as one of the two special roles. These are
# aliases of the placeholder sets (same objects), not copies.
_ALL_USER_NAMES = _USER_PLACEHOLDER_NAMES
_ALL_ASSISTANT_NAMES = _ASSISTANT_PLACEHOLDER_NAMES
def _is_user_entity(ent: ExtractedEntityNode) -> bool:
    """Return True when ``ent`` represents the end user.

    An entity counts as the user when its stripped, lower-cased ``name`` is
    one of ``_ALL_USER_NAMES`` or its stripped ``entity_type`` equals
    ``_CANONICAL_USER_TYPE``. Missing or ``None`` attributes are treated as
    empty strings.

    Args:
        ent: Entity node to classify.

    Returns:
        True if the entity is a user entity, otherwise False.
    """
    name = (getattr(ent, "name", "") or "").strip().lower()
    etype = (getattr(ent, "entity_type", "") or "").strip()
    # A blank name carries no role information; never let it match the
    # placeholder set, so unnamed entities are not classified as the user.
    if name and name in _ALL_USER_NAMES:
        return True
    return etype == _CANONICAL_USER_TYPE
def _is_assistant_entity(ent: ExtractedEntityNode) -> bool:
    """Return True when ``ent`` represents the AI assistant.

    The match is made on either the stripped, lower-cased ``name`` (looked up
    in ``_ALL_ASSISTANT_NAMES``) or the stripped ``entity_type`` (compared
    with ``_CANONICAL_ASSISTANT_TYPE``). Missing or ``None`` attributes are
    treated as empty strings.

    Args:
        ent: Entity node to classify.

    Returns:
        True if the entity is an assistant entity, otherwise False.
    """
    normalized_name = (getattr(ent, "name", "") or "").strip().lower()
    normalized_type = (getattr(ent, "entity_type", "") or "").strip()
    if normalized_name in _ALL_ASSISTANT_NAMES:
        return True
    return normalized_type == _CANONICAL_ASSISTANT_TYPE
def _would_merge_cross_role(a: ExtractedEntityNode, b: ExtractedEntityNode) -> bool:
    """Tell whether merging ``a`` and ``b`` would cross the user/assistant boundary.

    A user entity and an AI-assistant entity must never be merged into one
    node. Callers use a True result to skip the merge.

    Args:
        a: First merge candidate.
        b: Second merge candidate.

    Returns:
        True when one entity is a user entity and the other is an assistant
        entity (in either order), otherwise False.
    """
    if _is_user_entity(a) and _is_assistant_entity(b):
        return True
    return _is_assistant_entity(a) and _is_user_entity(b)
def _normalize_special_entity_names(
    entity_nodes: List[ExtractedEntityNode],
) -> None:
    """Normalize the name and type of user and AI-assistant entities in place.

    Across turns the extractor may label the same role with different
    variants (for example "用户" / "我" / "User", or "AI助手" / "助手" /
    "Assistant"), which prevents exact-match deduplication from merging them.
    This function rewrites every variant to its canonical name and forces the
    matching ``entity_type``:

    - a user placeholder name becomes ``_CANONICAL_USER_NAME`` with type
      ``_CANONICAL_USER_TYPE``;
    - an assistant placeholder name becomes ``_CANONICAL_ASSISTANT_NAME`` with
      type ``_CANONICAL_ASSISTANT_TYPE``.

    Afterwards ``clean_cross_role_aliases`` is run on the same list to remove
    aliases shared between the two roles.

    Args:
        entity_nodes: Entity nodes to normalize (mutated in place).
    """
    # Step 1: rewrite placeholder names to their canonical form.
    for ent in entity_nodes:
        name_lower = (getattr(ent, "name", "") or "").strip().lower()
        if not name_lower:
            # An unnamed entity has no role; leave it untouched instead of
            # letting an empty key match a placeholder set.
            continue
        if name_lower in _USER_PLACEHOLDER_NAMES:
            ent.name = _CANONICAL_USER_NAME
            ent.entity_type = _CANONICAL_USER_TYPE
        elif name_lower in _ASSISTANT_PLACEHOLDER_NAMES:
            ent.name = _CANONICAL_ASSISTANT_NAME
            ent.entity_type = _CANONICAL_ASSISTANT_TYPE
    # Step 2: remove alias cross-contamination between the two roles.
    clean_cross_role_aliases(entity_nodes)
async def fetch_neo4j_assistant_aliases(neo4j_connector, end_user_id: str) -> set:
    """Fetch every alias stored on the AI-assistant entities of one end user.

    This is the shared entry point for the assistant-alias lookup, so the
    Cypher text and the list of assistant names live in one place.

    The lookup is best-effort: any error raised while querying or reading the
    records is logged as a warning and an empty set is returned.

    Args:
        neo4j_connector: Connector object exposing an awaitable
            ``execute_query(cypher, **params)``.
        end_user_id: Identifier of the end user whose graph is queried.

    Returns:
        Set of aliases, each stripped and lower-cased. Aliases that are not
        strings (for example ``None``) are skipped.
    """
    # Canonical name first, then the known variants; dict.fromkeys removes
    # duplicates while keeping that order.
    query_names = list(dict.fromkeys([_CANONICAL_ASSISTANT_NAME, *_ASSISTANT_PLACEHOLDER_NAMES]))
    # NOTE(review): `IN` compares names case-sensitively while the placeholder
    # set is lower-case, so a stored name such as "AI Assistant" would not
    # match — confirm that stored names are always normalized first.
    cypher = """
    MATCH (e:ExtractedEntity)
    WHERE e.end_user_id = $end_user_id AND e.name IN $names
    RETURN e.aliases AS aliases
    """
    try:
        result = await neo4j_connector.execute_query(
            cypher, end_user_id=end_user_id, names=query_names
        )
        assistant_aliases: set = set()
        for record in (result or []):
            for alias in (record.get("aliases") or []):
                # Skip malformed entries instead of raising: one bad alias
                # must not discard every alias collected so far.
                if not isinstance(alias, str):
                    continue
                assistant_aliases.add(alias.strip().lower())
        if assistant_aliases:
            logger.debug(f"Neo4j 中 AI 助手别名: {assistant_aliases}")
        return assistant_aliases
    except Exception as e:
        logger.warning(f"查询 Neo4j AI 助手别名失败: {e}")
        return set()
def clean_cross_role_aliases(
    entity_nodes: List[ExtractedEntityNode],
    external_assistant_aliases: set = None,
) -> None:
    """Remove aliases that leaked between user and AI-assistant entities.

    After the call:

    - no user entity keeps an alias that belongs to an assistant entity (or
      to ``external_assistant_aliases``);
    - no assistant entity keeps an alias that belongs to a user entity.

    Aliases are compared stripped and lower-cased. Both exclusion sets are
    collected before any entity is cleaned, so an alias present on both a
    user entity and an assistant entity is removed from both.
    NOTE(review): confirm that dropping such a shared alias from both sides
    is the intended resolution.

    Args:
        entity_nodes: Entity nodes to clean (mutated in place; ``aliases`` is
            reassigned only when something was removed).
        external_assistant_aliases: Extra assistant aliases from outside this
            batch (for example read from Neo4j); merged with the assistant
            aliases found in ``entity_nodes``. ``None`` means no extra aliases.
    """

    def _alias_key(alias):
        """Return the comparison key of an alias, or None for a non-string."""
        return alias.strip().lower() if isinstance(alias, str) else None

    def _collect(ent, bucket):
        """Add the comparison keys of every string alias of ``ent`` to ``bucket``."""
        for alias in (getattr(ent, "aliases", []) or []):
            key = _alias_key(alias)
            if key is not None:
                bucket.add(key)

    def _strip_banned(ent, banned):
        """Drop aliases of ``ent`` whose key is in ``banned``; keep everything else."""
        original = getattr(ent, "aliases", []) or []
        # Non-string entries have key None, which is never in `banned`, so
        # they are left in place rather than raising.
        cleaned = [a for a in original if _alias_key(a) not in banned]
        if len(cleaned) < len(original):
            ent.aliases = cleaned

    # Normalize the external set as well so callers need not pre-lower-case it.
    assistant_aliases = {
        key for key in map(_alias_key, external_assistant_aliases or ()) if key is not None
    }
    user_aliases = set()
    for ent in entity_nodes:
        if _is_assistant_entity(ent):
            _collect(ent, assistant_aliases)
        elif _is_user_entity(ent):
            _collect(ent, user_aliases)

    # Remove assistant aliases from user entities.
    if assistant_aliases:
        for ent in entity_nodes:
            if _is_user_entity(ent):
                _strip_banned(ent, assistant_aliases)

    # Remove user aliases from assistant entities.
    if user_aliases:
        for ent in entity_nodes:
            if _is_assistant_entity(ent):
                _strip_banned(ent, user_aliases)
def accurate_match(
entity_nodes: List[ExtractedEntityNode]
) -> Tuple[List[ExtractedEntityNode], Dict[str, str], Dict[str, Dict]]:
@@ -261,6 +419,10 @@ def accurate_match(
canonical = alias_index.get((ent_uid, ent_name))
# 确保不是自身
if canonical is not None and canonical.id != ent.id:
# 保护禁止跨角色合并用户实体和AI助手实体不能互相合并
if _would_merge_cross_role(canonical, ent):
i += 1
continue
_merge_attribute(canonical, ent)
id_redirect[ent.id] = canonical.id
for k, v in list(id_redirect.items()):
@@ -704,6 +866,11 @@ def fuzzy_match(
# 条件A快速通道alias_match_merge = True
# 条件B标准通道s_name ≥ tn AND s_type ≥ type_threshold AND overall ≥ tover
if alias_match_merge or (s_name >= tn and s_type >= type_threshold and overall >= tover):
# 保护禁止跨角色合并用户实体和AI助手实体不能互相合并
if _would_merge_cross_role(a, b):
j += 1
continue
# ========== 第六步:执行实体合并 ==========
# 6.1 合并别名
@@ -813,6 +980,12 @@ async def LLM_decision( # 决策中包含去重和消歧的功能
b = entity_by_id.get(losing_id)
if not a or not b: # 若不存在 a 或 b可能已在精确或模糊阶段合并在之前阶段合并之后不会再处理但是处于审计的目的会记录
continue
# 保护禁止跨角色合并用户实体和AI助手实体不能互相合并
if _would_merge_cross_role(a, b):
llm_records.append(
f"[LLM阻断] 跨角色合并被阻止: {a.id} ({a.name}) 与 {b.id} ({b.name})"
)
continue
_merge_attribute(a, b)
# ID 重定向
try:
@@ -934,6 +1107,9 @@ async def deduplicate_entities_and_edges(
返回:去重后的实体、语句→实体边、实体↔实体边。
"""
local_llm_records: List[str] = [] # 作为“审计日志”的本地收集器 初始化保留为了之后对于LLM决策追溯
# 0) 标准化用户和AI助手实体名称确保多轮对话中的变体名称统一
_normalize_special_entity_names(entity_nodes)
# 1) 精确匹配
deduped_entities, id_redirect, exact_merge_map = accurate_match(entity_nodes)

View File

@@ -15,6 +15,7 @@ from app.core.memory.models.message_models import DialogData
from app.core.memory.models.variate_config import ExtractionPipelineConfig
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
deduplicate_entities_and_edges,
clean_cross_role_aliases,
)
from app.core.memory.storage_services.extraction_engine.deduplication.second_layer_dedup import (
second_layer_dedup_and_merge_with_neo4j,
@@ -100,6 +101,10 @@ async def dedup_layers_and_merge_and_return(
except Exception as e:
print(f"Second-layer dedup failed: {e}")
# 第二层去重后,清洗用户/AI助手之间的别名交叉污染
# 第二层从 Neo4j 合并了旧实体,可能带入历史脏数据
clean_cross_role_aliases(fused_entity_nodes)
return (
dialogue_nodes,
chunk_nodes,

View File

@@ -44,6 +44,10 @@ from app.core.memory.models.variate_config import (
from app.core.memory.storage_services.extraction_engine.deduplication.two_stage_dedup import (
dedup_layers_and_merge_and_return,
)
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
_USER_PLACEHOLDER_NAMES,
fetch_neo4j_assistant_aliases,
)
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.embedding_generation import (
embedding_generation,
generate_entity_embeddings_from_triplets,
@@ -1341,14 +1345,20 @@ class ExtractionOrchestrator:
dialog_data_list: List[DialogData]
) -> None:
"""
从 Neo4j 读取用户实体的最终 aliases同步到 end_user 和 end_user_info 表
将本轮提取的用户别名同步到 end_user 和 end_user_info 表
注意:
1. other_name 使用本次对话提取的第一个别名(保持时间顺序)
2. aliases 从 Neo4j 读取(保持完整性)
注意:此方法在 Neo4j 写入之前调用,因此不能依赖 Neo4j 作为别名的权威数据源。
改为直接使用内存中去重后的 entity_nodes 的 aliases与 PgSQL 已有的 aliases 合并。
策略:
1. 从内存中的 entity_nodes 提取本轮用户别名current_aliases
2. 从去重后的 entity_nodes 中提取完整别名(含 Neo4j 二层去重合并的历史别名)
3. 从 PgSQL end_user_info 读取已有的 aliasesdb_aliases
4. 合并 db_aliases + deduped_aliases + current_aliases去重保序
5. 写回 PgSQL
Args:
entity_nodes: 实体节点列表
entity_nodes: 去重后的实体节点列表(内存中,含二层去重合并结果)
dialog_data_list: 对话数据列表
"""
try:
@@ -1361,23 +1371,40 @@ class ExtractionOrchestrator:
logger.warning("end_user_id 为空,跳过用户别名同步")
return
# 1. 提取本对话的用户别名(保持 LLM 提取的原始顺序,不排序)
current_aliases = self._extract_current_aliases(entity_nodes)
# 1. 提取本对话的用户别名(保持 LLM 提取的原始顺序,不排序)
current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list)
# 2. 从 Neo4j 获取完整 aliases权威数据源
neo4j_aliases = await self._fetch_neo4j_user_aliases(end_user_id)
# 1.5 从去重后的 entity_nodes 中提取完整别名
# 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 中,
# 这里提取出来确保 PgSQL 与 Neo4j 的别名保持同步
deduped_aliases = self._extract_deduped_entity_aliases(entity_nodes)
if not neo4j_aliases:
# Neo4j 中没有别名,使用本次对话提取的别名
neo4j_aliases = current_aliases
if not neo4j_aliases:
logger.debug(f"aliases 为空,跳过同步: end_user_id={end_user_id}")
return
# 1.6 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源
# (防止 LLM 未提取出 AI 助手实体时AI 别名泄漏到用户别名中)
neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id)
if neo4j_assistant_aliases:
before_count = len(current_aliases)
current_aliases = [
a for a in current_aliases
if a.strip().lower() not in neo4j_assistant_aliases
]
if len(current_aliases) < before_count:
logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名")
# 同样过滤 deduped_aliases
deduped_aliases = [
a for a in deduped_aliases
if a.strip().lower() not in neo4j_assistant_aliases
]
logger.info(f"本次对话提取的 aliases: {current_aliases}")
logger.info(f"Neo4j 中的完整 aliases: {neo4j_aliases}")
if not current_aliases and not deduped_aliases:
logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}")
return
# 3. 同步到数据库
logger.info(f"本轮对话提取的 aliases: {current_aliases}")
if deduped_aliases:
logger.info(f"去重后实体的完整 aliases含历史: {deduped_aliases}")
# 2. 同步到数据库
end_user_uuid = uuid.UUID(end_user_id)
with get_db_context() as db:
# 更新 end_user 表
@@ -1386,7 +1413,38 @@ class ExtractionOrchestrator:
logger.warning(f"未找到 end_user_id={end_user_id} 的用户记录")
return
new_name = self._resolve_other_name(end_user.other_name, current_aliases, neo4j_aliases)
# 3. 从 PgSQL 读取已有 aliases 并与本轮合并
info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
db_aliases = (info.aliases if info and info.aliases else [])
# 过滤掉占位名称
db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES]
# 合并:已有 + 去重后完整别名 + 本轮新增,去重保序
merged_aliases = list(db_aliases)
seen_lower = {a.strip().lower() for a in merged_aliases}
# 先合并去重后实体的完整别名(含 Neo4j 历史别名)
for alias in deduped_aliases:
if alias.strip().lower() not in seen_lower:
merged_aliases.append(alias)
seen_lower.add(alias.strip().lower())
# 再合并本轮新提取的别名
for alias in current_aliases:
if alias.strip().lower() not in seen_lower:
merged_aliases.append(alias)
seen_lower.add(alias.strip().lower())
# 最终过滤:从合并结果中排除 AI 助手别名(清理历史脏数据)
if neo4j_assistant_aliases:
merged_aliases = [
a for a in merged_aliases
if a.strip().lower() not in neo4j_assistant_aliases
]
logger.info(f"PgSQL 已有 aliases: {db_aliases}")
logger.info(f"合并后 aliases: {merged_aliases}")
# 更新 end_user 表 other_name
new_name = self._resolve_other_name(end_user.other_name, current_aliases, merged_aliases)
if new_name is not None:
end_user.other_name = new_name
logger.info(f"更新 end_user 表 other_name → {new_name}")
@@ -1394,26 +1452,27 @@ class ExtractionOrchestrator:
logger.debug(f"end_user 表 other_name 保持不变: {end_user.other_name}")
# 更新或创建 end_user_info 记录
info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
if info:
new_name_info = self._resolve_other_name(info.other_name, current_aliases, neo4j_aliases)
new_name_info = self._resolve_other_name(info.other_name, current_aliases, merged_aliases)
if new_name_info is not None:
info.other_name = new_name_info
logger.info(f"更新 end_user_info 表 other_name → {new_name_info}")
if info.aliases != neo4j_aliases:
info.aliases = neo4j_aliases
logger.info(f"同步 Neo4j aliases 到 end_user_info: {neo4j_aliases}")
if info.aliases != merged_aliases:
info.aliases = merged_aliases
logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}")
else:
first_alias = current_aliases[0].strip() if current_aliases else ""
first_alias = current_aliases[0].strip() if current_aliases else (
deduped_aliases[0].strip() if deduped_aliases else ""
)
# 确保 first_alias 不是占位名称
if first_alias and first_alias not in self.USER_PLACEHOLDER_NAMES:
if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES:
db.add(EndUserInfo(
end_user_id=end_user_uuid,
other_name=first_alias,
aliases=neo4j_aliases,
aliases=merged_aliases,
meta_data={}
))
logger.info(f"创建 end_user_info 记录other_name={first_alias}, aliases={neo4j_aliases}")
logger.info(f"创建 end_user_info 记录other_name={first_alias}, aliases={merged_aliases}")
db.commit()
@@ -1423,49 +1482,81 @@ class ExtractionOrchestrator:
# Placeholder names of the user entity; they must never be used as other_name
# or stored in aliases. Bound to the module-level constant imported from
# deduped_and_disamb so the list is maintained in one place. Callers strip and
# lower-case a candidate before testing membership.
USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES
def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]:
    """Collect the user aliases extracted from this round's user statements.

    Only statements whose speaker is "user" are inspected (the statement's own
    ``speaker`` wins; the chunk's ``speaker`` is the fallback). From each such
    statement's ``triplet_extraction_info`` the aliases of entities whose name
    is a user placeholder are gathered. Because the raw per-statement
    extraction is read, the result is not affected by deduplication merges.

    The deduplicated ``entity_nodes`` are deliberately not used as a fallback;
    historical aliases are handled by ``_extract_deduped_entity_aliases``.

    Args:
        entity_nodes: Deduplicated entity nodes. Unused; kept so existing
            callers keep working.
        dialog_data_list: Dialog data to scan. ``None`` or empty yields ``[]``.

    Returns:
        Stripped aliases in first-seen order, without placeholder names, blank
        entries, non-string entries or case-insensitive duplicates.
    """
    if not dialog_data_list:
        return []

    placeholders = self.USER_PLACEHOLDER_NAMES
    all_user_aliases = []
    seen_lower = set()
    for dialog in dialog_data_list:
        for chunk in (getattr(dialog, 'chunks', None) or []):
            chunk_speaker = getattr(chunk, 'speaker', None)
            for statement in (getattr(chunk, 'statements', None) or []):
                stmt_speaker = getattr(statement, 'speaker', None) or chunk_speaker
                if stmt_speaker != "user":
                    continue
                triplet_info = getattr(statement, 'triplet_extraction_info', None)
                if not triplet_info:
                    continue
                for entity in (getattr(triplet_info, 'entities', None) or []):
                    # `or ''` guards against an explicit None name.
                    ent_name = (getattr(entity, 'name', '') or '').strip()
                    if not ent_name or ent_name.lower() not in placeholders:
                        continue
                    for alias in (getattr(entity, 'aliases', []) or []):
                        if not isinstance(alias, str):
                            continue
                        a = alias.strip()
                        key = a.lower()
                        if a and key not in placeholders and key not in seen_lower:
                            all_user_aliases.append(a)
                            seen_lower.add(key)
    if all_user_aliases:
        logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}")
    return all_user_aliases
def _extract_deduped_entity_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]:
    """Return the alias list of the deduplicated user entity.

    The first entity whose stripped, lower-cased name is a user placeholder
    and which still has at least one usable alias supplies the result.

    Args:
        entity_nodes: Deduplicated entity nodes.

    Returns:
        Aliases in their stored order, without placeholder names, blank
        entries, non-string entries or case-insensitive duplicates; ``[]``
        when no user entity has a usable alias.
    """
    placeholders = self.USER_PLACEHOLDER_NAMES
    for entity in entity_nodes:
        name = (getattr(entity, 'name', '') or '').strip().lower()
        if not name or name not in placeholders:
            continue
        filtered = []
        seen_lower = set()
        for alias in (getattr(entity, 'aliases', []) or []):
            if not isinstance(alias, str):
                continue
            key = alias.strip().lower()
            # Drop blanks, placeholder names and repeats; keep first spelling.
            if not key or key in placeholders or key in seen_lower:
                continue
            seen_lower.add(key)
            filtered.append(alias)
        if filtered:
            return filtered
    return []
async def _fetch_neo4j_user_aliases(self, end_user_id: str) -> List[str]:
    """Read the alias list of the user entity from Neo4j.

    Args:
        end_user_id: Identifier of the end user whose graph is queried.

    Returns:
        Aliases of the first matching user entity with placeholder names and
        non-string entries removed; ``[]`` when no entity or no alias exists.
    """
    cypher = """
    MATCH (e:ExtractedEntity)
    WHERE e.end_user_id = $end_user_id AND e.name IN ['用户', '我', 'User', 'I']
    RETURN e.aliases AS aliases
    LIMIT 1
    """
    # NOTE(review): a new Neo4jConnector is created on every call and is not
    # closed here — confirm whether a shared connector should be used.
    result = await Neo4jConnector().execute_query(cypher, end_user_id=end_user_id)
    if not result:
        logger.debug(f"Neo4j 中未找到用户实体: end_user_id={end_user_id}")
        return []

    aliases = result[0].get('aliases') or []
    if not aliases:
        logger.debug(f"Neo4j 用户实体 aliases 为空: end_user_id={end_user_id}")
        return []

    # Filter placeholder names so historical dirty data does not propagate.
    # The comparison is lower-cased to match the other USER_PLACEHOLDER_NAMES
    # membership tests in this class.
    return [
        a for a in aliases
        if isinstance(a, str) and a.strip().lower() not in self.USER_PLACEHOLDER_NAMES
    ]
async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set:
    """Look up the AI-assistant aliases stored in Neo4j for ``end_user_id``.

    Delegates to the module-level ``fetch_neo4j_assistant_aliases`` using this
    instance's ``connector``. The result is used to exclude assistant names
    from the user's aliases.
    """
    assistant_aliases = await fetch_neo4j_assistant_aliases(self.connector, end_user_id)
    return assistant_aliases
def _resolve_other_name(
self,
@@ -1484,16 +1575,16 @@ class ExtractionOrchestrator:
注意:返回值不允许是占位名称("用户""""User""I"
"""
# 当前值为空或为占位名称时,需要更新
if not current or not current.strip() or current.strip() in self.USER_PLACEHOLDER_NAMES:
if not current or not current.strip() or current.strip().lower() in self.USER_PLACEHOLDER_NAMES:
candidate = current_aliases[0].strip() if current_aliases else None
# 确保候选值不是占位名称
if candidate and candidate in self.USER_PLACEHOLDER_NAMES:
if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES:
return None
return candidate
if current not in neo4j_aliases:
candidate = neo4j_aliases[0].strip() if neo4j_aliases else None
# 确保候选值不是占位名称
if candidate and candidate in self.USER_PLACEHOLDER_NAMES:
if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES:
return None
return candidate

View File

@@ -61,6 +61,7 @@ class TripletExtractor:
predicate_instructions=PREDICATE_DEFINITIONS,
language=self._get_language(),
ontology_types=self.ontology_types,
speaker=getattr(statement, 'speaker', None),
)
# Create messages for LLM

View File

@@ -1,6 +1,6 @@
import os
from jinja2 import Environment, FileSystemLoader
from app.core.memory.models.ontology_extraction_models import OntologyTypeList
from app.core.memory.utils.log.logging_utils import log_prompt_rendering, log_template_rendering
# Setup Jinja2 environment
@@ -205,6 +205,7 @@ async def render_triplet_extraction_prompt(
predicate_instructions: dict = None,
language: str = "zh",
ontology_types: "OntologyTypeList | None" = None,
speaker: str = None,
) -> str:
"""
Renders the triplet extraction prompt using the extract_triplet.jinja2 template.
@@ -216,6 +217,7 @@ async def render_triplet_extraction_prompt(
predicate_instructions: Optional predicate instructions
language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)
ontology_types: Optional OntologyTypeList containing predefined ontology types for entity classification
speaker: Speaker role ("user" or "assistant") for the current statement
Returns:
Rendered prompt content as string
@@ -223,7 +225,7 @@ async def render_triplet_extraction_prompt(
template = prompt_env.get_template("extract_triplet.jinja2")
# 准备本体类型数据
ontology_type_section = ""
ontology_type_section = None
ontology_type_names = []
type_hierarchy_hints = []
if ontology_types and ontology_types.types:
@@ -240,6 +242,7 @@ async def render_triplet_extraction_prompt(
ontology_types=ontology_type_section,
ontology_type_names=ontology_type_names,
type_hierarchy_hints=type_hierarchy_hints,
speaker=speaker,
)
# 记录渲染结果到提示日志(与示例日志结构一致)
log_prompt_rendering('triplet extraction', rendered_prompt)

View File

@@ -23,6 +23,16 @@ Extract entities and knowledge triplets from the given statement.
===Inputs===
**Chunk Content:** "{{ chunk_content }}"
**Statement:** "{{ statement }}"
{% if speaker %}
**Speaker:** {{ speaker }}
{% if speaker == "assistant" %}
{% if language == "zh" %}
⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中用来称呼用户的名字是**用户的别名**,不是 AI 助手的别名。但只能提取原文中逐字出现的名字,严禁推测或创造原文中不存在的别名变体。
{% else %}
⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user are **user's aliases**, NOT the AI assistant's aliases. But only extract names that appear VERBATIM in the text — never infer or fabricate alias variants.
{% endif %}
{% endif %}
{% endif %}
{% if ontology_types %}
===Ontology Type Guidance===
@@ -87,7 +97,17 @@ Extract entities and knowledge triplets from the given statement.
* "我叫张三,大家叫我小张" → aliases=["张三", "小张"](张三是第一个,将成为 other_name
* "大家叫我小李,我全名叫李明" → aliases=["小李", "李明"](小李先出现,将成为 other_name
- 空值:如果没有别名,使用 `[]`
- 重要:只提取本次对话中明确提到的别名,不要推测或添加未提及的名字
- **🚨🚨🚨 严禁幻觉:只提取对话原文中逐字出现的别名,绝对不能推测、衍生或创造任何未在原文中出现的名字。例如,看到"陈思远"不能自行添加"思远大人""远哥""小远"等变体。如果原文没有这些字,就不能出现在 aliases 中。**
- **🚨 归属区分:必须严格区分名称的归属对象。默认情况下,用户提到的名字归属用户实体。只有出现明确的第二人称命名表达(如"叫你""给你取名")时,才将名字归属 AI/助手实体。**
- **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。但同样只能提取原文中逐字出现的称呼,不能推测。**
* "我叫陈思远,我给AI取名为远仔" → 用户 aliases=["陈思远"],AI助手 aliases=["远仔"]
* "我叫vv" → 用户 aliases=["vv"]没有给AI取名的表达名字归用户
* [speaker=assistant] "好的VV" → 用户 aliases=["VV"]AI 在称呼用户,原文中出现了"VV"
* [speaker=assistant] "我叫陈仔" → AI助手 aliases=["陈仔"]AI 在自我介绍,这是 AI 的别名)
* ❌ 错误:将"远仔"放入用户的 aliases"远仔"是给AI取的名字不是用户的名字
* ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases
* ❌ 错误AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases
* ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉)
{% else %}
- Include: nicknames, full names, abbreviations, alternative names
- Order: **The FIRST alias will be used as the user's primary display name (other_name). Put the most important/frequently used name FIRST**
@@ -96,7 +116,17 @@ Extract entities and knowledge triplets from the given statement.
* "I'm John, people call me Johnny" → aliases=["John", "Johnny"] (John is first, will become other_name)
* "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name)
- Empty: If no aliases, use `[]`
- Important: Only extract aliases explicitly mentioned in current conversation, do not infer or add unmentioned names
- **🚨🚨🚨 NO HALLUCINATION: Only extract aliases that appear VERBATIM in the original text. NEVER infer, derive, or fabricate names not present in the text. For example, seeing "John Smith" does NOT allow adding "Johnny", "Smithy", "Mr. Smith" unless those exact strings appear in the conversation.**
- **🚨 Ownership distinction: By default, all names mentioned by the user belong to the user entity. Only assign a name to the AI/assistant entity when an explicit second-person naming expression (e.g., "I'll call you", "your name is") is present.**
- **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases. But only extract names that appear verbatim in the text, never infer.**
* "I'm Alex, I'll call you Buddy" → User aliases=["Alex"], AI assistant aliases=["Buddy"]
* "I'm vv" → User aliases=["vv"] (no AI-naming expression, name belongs to user)
* [speaker=assistant] "Sure thing, VV" → User aliases=["VV"] (AI addressing the user, "VV" appears in text)
* [speaker=assistant] "I'm Jarvis" → AI assistant aliases=["Jarvis"] (AI self-introduction, this is AI's alias)
* ❌ Wrong: putting "Buddy" in user's aliases ("Buddy" is a name for the AI, not the user)
* ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases
* ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases
* ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants)
{% endif %}
@@ -122,7 +152,60 @@ Extract entities and knowledge triplets from the given statement.
4. **ALIASES ORDER:**
4. **AI/ASSISTANT ENTITY SPECIAL HANDLING:**
{% if language == "zh" %}
- **🚨 默认规则:如果对话中没有出现明确指向 AI/助手的命名表达,则所有名字都归属于用户实体。不要猜测或推断某个名字是给 AI 取的。**
- 只有当用户**明确**对 AI/助手进行命名时,才创建 AI/助手实体并将对应名字放入其 aliases
- AI/助手实体的 name 字段:使用 "AI助手"
- 用户给 AI 取的名字:放入 AI/助手实体的 aliases
- **🚨 禁止将用户给 AI 取的名字放入用户实体的 aliases 中**
- **必须出现以下明确的命名表达才能判定为给 AI 取名:**「给你取名」「叫你」「称呼你为」「给AI取名」「你的名字是」「以后叫你」「你就叫」「你不叫X了」「你现在叫」等**第二人称(你)或明确指向 AI 的命名句式**
- **🚨 "你不叫X了"/"你不叫X你叫Y" 句式X 和 Y 都是 AI 的名字(旧名和新名),绝对不是用户的名字。因为句子主语是"你"AI。**
- **以下情况名字归属用户,不是给 AI 取名:**「我叫」「我的名字是」「叫我」「我是」「大家叫我」「我的英文名是」「我的昵称是」等**第一人称(我)的自我介绍句式**
- **🚨 speaker=assistant 时的特殊规则:**
* AI 用来称呼用户的名字 → 归入**用户**实体的 aliases但必须是原文中逐字出现的称呼不能推测
* AI 自称的名字(如"我叫陈仔""我是你的助手")→ 归入**AI助手**实体的 aliases
* 判断依据AI 说"你叫X"或用 X 称呼用户 → X 是用户别名AI 说"我叫X"或"我是X" → X 是 AI 别名
- 示例:
* "我叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户)
* "我的英文名叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户)
* "我叫陈思远,我给AI取名为远仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["远仔"]
* "叫你小助,我自己叫老王" → 用户实体: name="用户", aliases=["老王"]AI实体: name="AI助手", aliases=["小助"]
* "你不叫远仔了,你现在叫陈仔" → AI实体: name="AI助手", aliases=["陈仔"]"远仔"是AI旧名"陈仔"是AI新名都归AI。不要把"远仔"或"陈仔"放入用户的aliases
* [speaker=assistant] "好的VV今天想干点啥" → 用户实体: name="用户", aliases=["VV"]AI 在称呼用户,原文中出现了"VV"
* [speaker=assistant] "你叫陈思远,我叫陈仔" → 用户实体: name="用户", aliases=["陈思远"]AI实体: name="AI助手", aliases=["陈仔"]
* ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases没有任何给 AI 取名的表达)
* ❌ 错误AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases
* ❌ 错误aliases=["陈思远", "远仔"]"远仔"是给AI取的名字不是用户的名字
* ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉)
{% else %}
- **🚨 Default rule: If there is NO explicit AI/assistant naming expression in the conversation, ALL names belong to the user entity. Do NOT guess or infer that a name is for the AI.**
- Only create an AI/assistant entity when the user **explicitly** names the AI/assistant
- AI/assistant entity name field: use "AI Assistant"
- Names the user gives to the AI: put in the AI/assistant entity's aliases
- **🚨 NEVER put names given to the AI into the user entity's aliases**
- **An AI-naming expression MUST be present to assign a name to the AI:** "I'll call you", "your name is", "I name you", "let me call you", "you'll be called", "you're not called X anymore", "your new name is", etc. — **second-person ("you") or explicit AI-directed naming patterns**
- **🚨 "You're not called X anymore" / "You're not X, you're Y" pattern: BOTH X and Y are AI's names (old and new). They are NOT user's names. The subject is "you" (the AI).**
- **These patterns mean the name belongs to the USER, NOT the AI:** "I'm", "my name is", "call me", "I am", "people call me", "my English name is", "my nickname is", etc. — **first-person ("I"/"me") self-introduction patterns**
- **🚨 Special rules when speaker=assistant:**
* Names the AI uses to address the user → belong to the **user** entity's aliases (but only extract names that appear verbatim in the text, never infer)
* Names the AI uses for itself (e.g., "I'm Jarvis", "I am your assistant") → belong to the **AI assistant** entity's aliases
* Rule: AI says "you are X" or calls user X → X is user's alias; AI says "I'm X" or "I am X" → X is AI's alias
- Examples:
* "I'm vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user)
* "My English name is vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user)
* "I'm Alex, I'll call you Buddy" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Buddy"]
* "Call yourself Jarvis, my name is Tony" → User entity: name="User", aliases=["Tony"]; AI entity: name="AI Assistant", aliases=["Jarvis"]
* "You're not called Jarvis anymore, your new name is Friday" → AI entity: name="AI Assistant", aliases=["Friday"] (both "Jarvis" and "Friday" are AI names, NOT user names)
* [speaker=assistant] "Sure thing, VV" → User entity: name="User", aliases=["VV"] (AI addressing the user, "VV" appears in text)
* [speaker=assistant] "You're Alex, and I'm Jarvis" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Jarvis"]
* ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases (no AI-naming expression exists)
* ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases
* ❌ Wrong: aliases=["Alex", "Buddy"] ("Buddy" is a name for the AI, not the user)
* ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants)
{% endif %}
5. **ALIASES ORDER:**
{% if language == "zh" %}
- 顺序优先级:按出现顺序,先出现的在前
{% else %}
@@ -202,8 +285,19 @@ Output:
{"entity_idx": 0, "name": "Tripod", "type": "Equipment", "description": "Photography equipment accessory", "example": "", "aliases": ["Camera Tripod"], "is_explicit_memory": false}
]
}
**Example 4 (User vs AI alias distinction - English output):** "I'm Alex, and I'll call you Buddy"
Output:
{
"triplets": [
{"subject_name": "User", "subject_id": 0, "predicate": "NAMED", "object_name": "AI Assistant", "object_id": 1, "value": "Buddy"}
],
"entities": [
{"entity_idx": 0, "name": "User", "type": "Person", "description": "The user", "example": "", "aliases": ["Alex"], "is_explicit_memory": false},
{"entity_idx": 1, "name": "AI Assistant", "type": "Person", "description": "The user's AI assistant", "example": "", "aliases": ["Buddy"], "is_explicit_memory": false}
]
}
{% else %}
**Example 1 (English input → Chinese output):** "I plan to travel to Paris next week and visit the Louvre."
Output:
{
"triplets": [
@@ -258,6 +352,39 @@ Output:
]
}
**Example 6 (用户与AI别名区分 - Chinese):** "我称呼自己为陈思远,我给AI取名为远仔"
Output:
{
"triplets": [
{"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "远仔"}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["陈思远"], "is_explicit_memory": false},
{"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["远仔"], "is_explicit_memory": false}
]
}
**Example 7 (纯用户自我介绍无AI命名 - Chinese):** "我叫vv"
Output:
{
"triplets": [],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["vv"], "is_explicit_memory": false}
]
}
**Example 8 (给AI改名 - Chinese):** "你不叫远仔了,你现在叫陈仔"
Output:
{
"triplets": [
{"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "陈仔"}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": [], "is_explicit_memory": false},
{"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["陈仔"], "is_explicit_memory": false}
]
}
{% endif %}
===End of Examples===