feat(memory): unify alias extraction into metadata pipeline and deduplicate user entity nodes

- Merge alias add/remove into MetadataExtractionResponse and Celery metadata task,
  removing the separate sync step from extraction_orchestrator
- Replace first-person pronouns ("我") with "用户" in statement extraction to
  preserve identity semantics for downstream metadata/alias extraction
- Update extract_statement.jinja2 prompt to enforce "用户" as subject for user
  statements instead of resolving to real names
- Add alias change instructions (aliases_to_add/aliases_to_remove) to
  extract_user_metadata.jinja2 with incremental merge logic
- Deduplicate special entities ("用户", "AI助手") in graph_saver by reusing
  existing Neo4j node IDs per end_user_id
- Sync final aliases from PgSQL to Neo4j user entity nodes after metadata write
This commit is contained in:
lanceyq
2026-04-09 21:55:59 +08:00
parent e0546e01ef
commit 15a863b41a
8 changed files with 304 additions and 76 deletions

View File

@@ -38,3 +38,11 @@ class MetadataExtractionResponse(BaseModel):
"""元数据提取 LLM 响应结构"""
model_config = ConfigDict(extra='ignore')
user_metadata: UserMetadata = Field(default_factory=UserMetadata)
aliases_to_add: List[str] = Field(
default_factory=list,
description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)"
)
aliases_to_remove: List[str] = Field(
default_factory=list,
description="用户明确否认的别名(如'我不叫XX了'"
)

View File

@@ -311,9 +311,8 @@ class ExtractionOrchestrator:
dialog_data_list,
)
# 步骤 7: 同步用户别名到数据库表 + 触发异步元数据提取(仅正式模式)
# 步骤 7: 触发异步元数据和别名提取(仅正式模式)
if not is_pilot_run:
# 收集用户相关 statement 并触发异步元数据提取
try:
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor
metadata_extractor = MetadataExtractor(llm_client=self.llm_client, language=self.language)
@@ -322,7 +321,6 @@ class ExtractionOrchestrator:
statement_entity_edges
)
if user_statements:
# 获取 end_user_id 和 config_id
end_user_id = dialog_data_list[0].end_user_id if dialog_data_list else None
config_id = dialog_data_list[0].config_id if dialog_data_list and hasattr(dialog_data_list[0], 'config_id') else None
if end_user_id:
@@ -339,9 +337,7 @@ class ExtractionOrchestrator:
except Exception as e:
logger.error(f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True)
# 同步用户别名到数据库表
logger.info("步骤 7: 同步用户别名到 end_user 和 end_user_info 表")
await self._update_end_user_other_name(entity_nodes, dialog_data_list)
# 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行
logger.info(f"知识提取流水线运行完成({mode_str}")
return (

View File

@@ -14,9 +14,9 @@ from app.core.memory.models.graph_models import (
StatementNode,
)
from app.core.memory.models.metadata_models import (
MetadataExtractionResponse,
UserMetadata,
)
from app.core.memory.models.message_models import DialogData
logger = logging.getLogger(__name__)
@@ -50,6 +50,35 @@ class MetadataExtractor:
return "zh"
return "en"
# def collect_user_raw_messages(
# self,
# dialog_data_list: List[DialogData],
# ) -> List[str]:
# """
# 从原始对话数据中提取 speaker="user" 的消息原文。
# 直接使用用户的原始输入,不经过陈述句提取阶段的 LLM 改写,
# 避免第一人称被替换为第三人称导致元数据/别名提取错误。
# Returns:
# 用户原始消息文本列表
# """
# result = []
# for dialog in dialog_data_list:
# if not dialog.context or not dialog.context.msgs:
# continue
# for msg in dialog.context.msgs:
# if getattr(msg, 'role', '') == 'user':
# text = (getattr(msg, 'msg', '') or '').strip()
# if text:
# result.append(text)
# logger.info(f"收集到 {len(result)} 条用户原始消息")
# if result:
# for i, text in enumerate(result):
# logger.info(f" [user message {i+1}] {text[:200]}")
# return result
def collect_user_related_statements(
self,
entity_nodes: List[ExtractedEntityNode],
@@ -104,6 +133,9 @@ class MetadataExtractor:
f"(直接关联: {total_associated}, speaker=user: {len(result)}, "
f"跳过非user: {skipped_non_user})"
)
if result:
for i, text in enumerate(result):
logger.info(f" [user statement {i+1}] {text}")
if total_associated > 0 and len(result) == 0:
logger.warning(
f"{total_associated} 条直接关联 statement 但全部被 speaker 过滤,"
@@ -111,18 +143,22 @@ class MetadataExtractor:
)
return result
async def extract_metadata(self, statements: List[str], existing_metadata: Optional[dict] = None) -> Optional[UserMetadata]:
async def extract_metadata(
self,
statements: List[str],
existing_metadata: Optional[dict] = None,
existing_aliases: Optional[List[str]] = None,
) -> Optional[tuple]:
"""
对筛选后的 statement 列表调用 LLM 提取元数据。
语言根据 statement 内容自动检测,不依赖系统界面语言。
传入已有元数据作为上下文,让 LLM 能判断 replace/remove 操作。
对筛选后的 statement 列表调用 LLM 提取元数据和用户别名
Args:
statements: 用户发言的 statement 文本列表
existing_metadata: 数据库已有的元数据(可选),用于 LLM 对比判断变更
existing_metadata: 数据库已有的元数据(可选)
existing_aliases: 数据库已有的用户别名列表(可选)
Returns:
UserMetadata on success, None on failure
(UserMetadata, List[str], List[str]) tuple: (metadata, aliases_to_add, aliases_to_remove) on success, None on failure
"""
if not statements:
return None
@@ -130,7 +166,6 @@ class MetadataExtractor:
try:
from app.core.memory.utils.prompt.prompt_utils import prompt_env
# 根据写入内容的语言自动检测,而非使用系统界面语言
detected_language = self.detect_language(statements)
logger.info(f"元数据提取语言检测结果: {detected_language}")
@@ -139,18 +174,23 @@ class MetadataExtractor:
statements=statements,
language=detected_language,
existing_metadata=existing_metadata,
existing_aliases=existing_aliases,
json_schema="",
)
from app.core.memory.models.metadata_models import MetadataExtractionResponse
response = await self.llm_client.response_structured(
messages=[{"role": "user", "content": prompt}],
response_model=MetadataExtractionResponse,
)
if response and response.user_metadata:
return response.user_metadata
if response:
metadata = response.user_metadata if response.user_metadata else None
to_add = response.aliases_to_add if response.aliases_to_add else []
to_remove = response.aliases_to_remove if response.aliases_to_remove else []
return metadata, to_add, to_remove
logger.warning("LLM 返回的元数据为空")
logger.warning("LLM 返回的响应为空")
return None
except Exception as e:

View File

@@ -82,6 +82,18 @@ class StatementExtractor:
logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty")
return None
@staticmethod
def _replace_first_person_with_user(text: str) -> str:
"""将用户消息中的第一人称代词""替换为"用户"
替换规则:
- 所有独立的""都替换为"用户"(包括"我的""用户的""叫我""叫用户"
- 不替换"我们"中的"""我们"是复数,不指代用户个人)
"""
import re
result = re.sub(r'我(?!们)', '用户', text)
return result
async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]:
"""Process a single chunk and return extracted statements
@@ -99,6 +111,13 @@ class StatementExtractor:
logger.warning(f"Chunk {chunk.id} content too short or empty, skipping")
return []
# 对 speaker="user" 的 chunk将第一人称"我"替换为"用户"
# 避免 LLM 在陈述句提取时将"我"替换为具体名字(如"齐齐"
# 导致下游元数据/别名提取无法识别第一人称语义。
chunk_speaker = self._get_speaker_from_chunk(chunk)
if chunk_speaker == "user":
chunk_content = self._replace_first_person_with_user(chunk_content)
prompt_content = await render_statement_extraction_prompt(
chunk_content=chunk_content,
definitions=LABEL_DEFINITIONS,
@@ -149,8 +168,6 @@ class StatementExtractor:
relevence_info = RelevenceInfo[relevence_str] if relevence_str in RelevenceInfo.__members__ else RelevenceInfo.RELEVANT
except (KeyError, ValueError):
relevence_info = RelevenceInfo.RELEVANT
chunk_speaker = self._get_speaker_from_chunk(chunk)
chunk_statement = Statement(
statement=extracted_stmt.statement,

View File

@@ -43,8 +43,9 @@ Each statement must be labeled as per the criteria mentioned below.
对话上下文和共指消解:
- 将每个陈述句归属于说出它的参与者。
- 如果参与者列表为说话者提供了名称(例如,"李雪(用户)"),请在提取的陈述句中使用具体名称("李雪"),而不是通用角色("用户"
- 将所有代词解析为对话上下文中的具体人物或实体
- **对于用户的发言:必须使用"用户"作为主语**,禁止将"用户"或"我"替换为用户的真实姓名或别名。例如,用户说"我叫张三"应提取为"用户叫张三",而不是"张三叫张三"
- 对于 AI 助手的发言:使用"助手"或"AI助手"作为主语
- 将所有代词解析为对话上下文中的具体人物或实体,但"我"必须解析为"用户"。
- 识别并将抽象引用解析为其具体名称(如果提到)。
- 将缩写和首字母缩略词扩展为其完整形式。
{% else %}
@@ -68,8 +69,9 @@ Context Resolution Requirements:
Conversational Context & Co-reference Resolution:
- Attribute every statement to the participant who uttered it.
- If the participant list provides a name for a speaker (e.g., "李雪 (用户)"), use the specific name ("李雪") in the extracted statement, not the generic role ("用户").
- Resolve all pronouns to the specific person or entity from the conversation's context.
- **For user's statements: always use "用户" (User) as the subject**. Do NOT replace "用户" or "I" with the user's real name or alias. For example, if the user says "I'm John", extract as "用户 is John", not "John is John".
- For AI assistant's statements: use "助手" or "AI助手" as the subject.
- Resolve all pronouns to the specific person or entity from the conversation's context, but "I"/"我" must always resolve to "用户".
- Identify and resolve abstract references to their specific names if mentioned.
- Expand abbreviations and acronyms to their full form.
{% endif %}
@@ -139,13 +141,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合
示例输出: {
"statements": [
{
"statement": "Sarah Chen 最近一直在尝试水彩画。",
"statement": "用户最近一直在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "Sarah Chen 画了一些花朵。",
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
@@ -157,13 +159,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合
"relevance": "IRRELEVANT"
},
{
"statement": "Sarah Chen 认为她的水彩画中的色彩组合可以改进。",
"statement": "用户认为她的水彩画中的色彩组合可以改进。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "Sarah Chen 真的很喜欢玫瑰和百合。",
"statement": "用户真的很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
@@ -186,13 +188,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
示例输出: {
"statements": [
{
"statement": "张曼婷最近在尝试水彩画。",
"statement": "用户最近在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "张曼婷画了一些花朵。",
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
@@ -204,13 +206,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
"relevance": "IRRELEVANT"
},
{
"statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。",
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "张曼婷很喜欢玫瑰和百合。",
"statement": "用户很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
@@ -233,13 +235,13 @@ User: "I think the color combinations could use some improvement, but I really l
Example Output: {
"statements": [
{
"statement": "Sarah Chen has been trying watercolor painting recently.",
"statement": "用户 has been trying watercolor painting recently.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "Sarah Chen painted some flowers.",
"statement": "用户 painted some flowers.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
@@ -251,13 +253,13 @@ Example Output: {
"relevance": "IRRELEVANT"
},
{
"statement": "Sarah Chen thinks the color combinations in her watercolor paintings could use some improvement.",
"statement": "用户 thinks the color combinations in her watercolor paintings could use some improvement.",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "Sarah Chen really likes roses and lilies.",
"statement": "用户 really likes roses and lilies.",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
@@ -280,13 +282,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
Example Output: {
"statements": [
{
"statement": "张曼婷最近在尝试水彩画。",
"statement": "用户最近在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "张曼婷画了一些花朵。",
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
@@ -298,13 +300,13 @@ Example Output: {
"relevance": "IRRELEVANT"
},
{
"statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。",
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "张曼婷很喜欢玫瑰和百合。",
"statement": "用户很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"

View File

@@ -32,6 +32,22 @@ Extract user metadata from the following conversation statements spoken by the u
- behavioral_hints.preferred_depth偏好深度概览/技术细节/深入探讨)
- behavioral_hints.tone_preference语气偏好轻松随意/专业简洁/学术严谨)
- knowledge_tags用户涉及的知识领域标签
**用户别名变更(增量模式):**
- **aliases_to_add**:本次新发现的用户别名,包括:
* 用户主动自我介绍:如"我叫张三"、"我的名字是XX"、"我的网名是XX"
* 他人对用户的称呼:如"同事叫我陈哥"、"大家叫我小张"、"领导叫我老陈"
* 只提取原文中逐字出现的名字,严禁推测或创造
* 禁止提取:用户给 AI 取的名字、第三方人物自身的名字、"用户"/"我" 等占位词
* 如果没有新别名,返回空数组 `[]`
- **aliases_to_remove**:用户明确否认的别名,包括:
* 用户说"我不叫XX了"、"别叫我XX"、"我改名了不叫XX" → 将 XX 放入此数组
* **严格限制**:只将用户原文中**逐字提到**的被否认名字放入,不要推断关联的其他别名
* 例如:用户说"我不叫陈小刀了" → 只移除"陈小刀",不要移除"陈哥"、"老陈"等未被提及的别名
* 如果没有要移除的别名,返回空数组 `[]`
{% if existing_aliases %}
- 已有别名:{{ existing_aliases | tojson }}(仅供参考,不需要在输出中重复)
{% endif %}
{% else %}
**"Three-Degree Principle" criteria:**
- Reusability: Will this information be used by multiple functional modules?
@@ -63,6 +79,22 @@ Existing user metadata from the database is provided below. Combine with the use
- behavioral_hints.preferred_depth: Preferred depth (overview/detailed/deep dive)
- behavioral_hints.tone_preference: Tone preference (casual/professional/academic)
- knowledge_tags: Knowledge domain tags related to the user
**User alias changes (incremental mode):**
- **aliases_to_add**: Newly discovered user aliases from this conversation, including:
* User self-introductions: e.g. "I'm John", "My name is XX", "My username is XX"
* How others address the user: e.g. "My colleagues call me Johnny", "People call me Mike"
* Only extract names that appear VERBATIM in the text — never infer or fabricate
* Do NOT extract: names the user gives to the AI, third-party people's own names, placeholder words like "User"/"I"
* If no new aliases, return empty array `[]`
- **aliases_to_remove**: Aliases the user explicitly denies, including:
* User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array
* **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases
* Example: User says "I'm not called John anymore" → only remove "John", do NOT remove "Johnny", "J" or other related aliases not mentioned
* If no aliases to remove, return empty array `[]`
{% if existing_aliases %}
- Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output)
{% endif %}
{% endif %}
===User Statements===
@@ -94,7 +126,9 @@ Return a JSON object with the following structure:
"tone_preference": ""
},
"knowledge_tags": []
}
},
"aliases_to_add": [],
"aliases_to_remove": []
}
```