refactor(memory): remove first-person pronoun replacement and inline metadata utils

- Remove _replace_first_person_with_user from StatementExtractor to preserve
  original user text for downstream metadata/alias extraction
- Delete metadata_utils.py module, inline clean_metadata into Celery task
- Remove unused imports and commented-out collect_user_raw_messages method
- Apply formatting cleanup across metadata models and extraction orchestrator
This commit is contained in:
lanceyq
2026-04-10 00:29:18 +08:00
parent 15a863b41a
commit e0b7e95af6
6 changed files with 69 additions and 127 deletions

View File

@@ -11,16 +11,22 @@ from pydantic import BaseModel, ConfigDict, Field
class UserMetadataProfile(BaseModel): class UserMetadataProfile(BaseModel):
"""用户画像信息""" """用户画像信息"""
model_config = ConfigDict(extra='ignore')
model_config = ConfigDict(extra="ignore")
role: str = Field(default="", description="用户职业或角色") role: str = Field(default="", description="用户职业或角色")
domain: str = Field(default="", description="用户所在领域") domain: str = Field(default="", description="用户所在领域")
expertise: List[str] = Field(default_factory=list, description="用户擅长的技能或工具") expertise: List[str] = Field(
interests: List[str] = Field(default_factory=list, description="用户关注的话题或领域标签") default_factory=list, description="用户擅长的技能或工具"
)
interests: List[str] = Field(
default_factory=list, description="用户关注的话题或领域标签"
)
class UserMetadataBehavioralHints(BaseModel): class UserMetadataBehavioralHints(BaseModel):
"""行为偏好""" """行为偏好"""
model_config = ConfigDict(extra='ignore')
model_config = ConfigDict(extra="ignore")
learning_stage: str = Field(default="", description="学习阶段") learning_stage: str = Field(default="", description="学习阶段")
preferred_depth: str = Field(default="", description="偏好深度") preferred_depth: str = Field(default="", description="偏好深度")
tone_preference: str = Field(default="", description="语气偏好") tone_preference: str = Field(default="", description="语气偏好")
@@ -28,21 +34,24 @@ class UserMetadataBehavioralHints(BaseModel):
class UserMetadata(BaseModel): class UserMetadata(BaseModel):
"""用户元数据顶层结构""" """用户元数据顶层结构"""
model_config = ConfigDict(extra='ignore')
model_config = ConfigDict(extra="ignore")
profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile) profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile)
behavioral_hints: UserMetadataBehavioralHints = Field(default_factory=UserMetadataBehavioralHints) behavioral_hints: UserMetadataBehavioralHints = Field(
default_factory=UserMetadataBehavioralHints
)
knowledge_tags: List[str] = Field(default_factory=list, description="知识标签") knowledge_tags: List[str] = Field(default_factory=list, description="知识标签")
class MetadataExtractionResponse(BaseModel): class MetadataExtractionResponse(BaseModel):
"""元数据提取 LLM 响应结构""" """元数据提取 LLM 响应结构"""
model_config = ConfigDict(extra='ignore')
model_config = ConfigDict(extra="ignore")
user_metadata: UserMetadata = Field(default_factory=UserMetadata) user_metadata: UserMetadata = Field(default_factory=UserMetadata)
aliases_to_add: List[str] = Field( aliases_to_add: List[str] = Field(
default_factory=list, default_factory=list,
description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)" description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)",
) )
aliases_to_remove: List[str] = Field( aliases_to_remove: List[str] = Field(
default_factory=list, default_factory=list, description="用户明确否认的别名(如'我不叫XX了'"
description="用户明确否认的别名(如'我不叫XX了'"
) )

View File

@@ -314,28 +314,48 @@ class ExtractionOrchestrator:
# 步骤 7: 触发异步元数据和别名提取(仅正式模式) # 步骤 7: 触发异步元数据和别名提取(仅正式模式)
if not is_pilot_run: if not is_pilot_run:
try: try:
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
metadata_extractor = MetadataExtractor(llm_client=self.llm_client, language=self.language) MetadataExtractor,
user_statements = metadata_extractor.collect_user_related_statements( )
entity_nodes, statement_nodes,
statement_entity_edges metadata_extractor = MetadataExtractor(
llm_client=self.llm_client, language=self.language
)
user_statements = (
metadata_extractor.collect_user_related_statements(
entity_nodes, statement_nodes, statement_entity_edges
)
) )
if user_statements: if user_statements:
end_user_id = dialog_data_list[0].end_user_id if dialog_data_list else None end_user_id = (
config_id = dialog_data_list[0].config_id if dialog_data_list and hasattr(dialog_data_list[0], 'config_id') else None dialog_data_list[0].end_user_id
if dialog_data_list
else None
)
config_id = (
dialog_data_list[0].config_id
if dialog_data_list
and hasattr(dialog_data_list[0], "config_id")
else None
)
if end_user_id: if end_user_id:
from app.tasks import extract_user_metadata_task from app.tasks import extract_user_metadata_task
extract_user_metadata_task.delay( extract_user_metadata_task.delay(
end_user_id=str(end_user_id), end_user_id=str(end_user_id),
statements=user_statements, statements=user_statements,
config_id=str(config_id) if config_id else None, config_id=str(config_id) if config_id else None,
language=self.language, language=self.language,
) )
logger.info(f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement") logger.info(
f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement"
)
else: else:
logger.info("未找到用户相关 statement跳过元数据提取") logger.info("未找到用户相关 statement跳过元数据提取")
except Exception as e: except Exception as e:
logger.error(f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True) logger.error(
f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True
)
# 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行 # 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行
@@ -1501,9 +1521,6 @@ class ExtractionOrchestrator:
except Exception as e: except Exception as e:
logger.error(f"更新 end_user other_name 失败: {e}", exc_info=True) logger.error(f"更新 end_user other_name 失败: {e}", exc_info=True)
# 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中
# 复用 deduped_and_disamb 模块级常量,避免重复维护 # 复用 deduped_and_disamb 模块级常量,避免重复维护
USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES
@@ -1610,7 +1627,6 @@ class ExtractionOrchestrator:
if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES: if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES:
return None return None
return candidate return candidate
return None return None
async def _run_dedup_and_write_summary( async def _run_dedup_and_write_summary(

View File

@@ -13,10 +13,6 @@ from app.core.memory.models.graph_models import (
StatementEntityEdge, StatementEntityEdge,
StatementNode, StatementNode,
) )
from app.core.memory.models.metadata_models import (
UserMetadata,
)
from app.core.memory.models.message_models import DialogData
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -45,40 +41,12 @@ class MetadataExtractor:
如果文本中包含中文字符则返回 "zh",否则返回 "en" 如果文本中包含中文字符则返回 "zh",否则返回 "en"
""" """
import re import re
combined = " ".join(statements) combined = " ".join(statements)
if re.search(r'[\u4e00-\u9fff]', combined): if re.search(r"[\u4e00-\u9fff]", combined):
return "zh" return "zh"
return "en" return "en"
# def collect_user_raw_messages(
# self,
# dialog_data_list: List[DialogData],
# ) -> List[str]:
# """
# 从原始对话数据中提取 speaker="user" 的消息原文。
# 直接使用用户的原始输入,不经过陈述句提取阶段的 LLM 改写,
# 避免第一人称被替换为第三人称导致元数据/别名提取错误。
# Returns:
# 用户原始消息文本列表
# """
# result = []
# for dialog in dialog_data_list:
# if not dialog.context or not dialog.context.msgs:
# continue
# for msg in dialog.context.msgs:
# if getattr(msg, 'role', '') == 'user':
# text = (getattr(msg, 'msg', '') or '').strip()
# if text:
# result.append(text)
# logger.info(f"收集到 {len(result)} 条用户原始消息")
# if result:
# for i, text in enumerate(result):
# logger.info(f" [user message {i+1}] {text[:200]}")
# return result
def collect_user_related_statements( def collect_user_related_statements(
self, self,
entity_nodes: List[ExtractedEntityNode], entity_nodes: List[ExtractedEntityNode],
@@ -119,7 +87,7 @@ class MetadataExtractor:
for stmt_node in statement_nodes: for stmt_node in statement_nodes:
if stmt_node.id in target_stmt_ids and stmt_node.id not in seen: if stmt_node.id in target_stmt_ids and stmt_node.id not in seen:
total_associated += 1 total_associated += 1
speaker = getattr(stmt_node, 'speaker', None) or 'unknown' speaker = getattr(stmt_node, "speaker", None) or "unknown"
if speaker == "user": if speaker == "user":
text = (stmt_node.statement or "").strip() text = (stmt_node.statement or "").strip()
if text: if text:
@@ -135,7 +103,7 @@ class MetadataExtractor:
) )
if result: if result:
for i, text in enumerate(result): for i, text in enumerate(result):
logger.info(f" [user statement {i+1}] {text}") logger.info(f" [user statement {i + 1}] {text}")
if total_associated > 0 and len(result) == 0: if total_associated > 0 and len(result) == 0:
logger.warning( logger.warning(
f"{total_associated} 条直接关联 statement 但全部被 speaker 过滤," f"{total_associated} 条直接关联 statement 但全部被 speaker 过滤,"
@@ -178,7 +146,10 @@ class MetadataExtractor:
json_schema="", json_schema="",
) )
from app.core.memory.models.metadata_models import MetadataExtractionResponse from app.core.memory.models.metadata_models import (
MetadataExtractionResponse,
)
response = await self.llm_client.response_structured( response = await self.llm_client.response_structured(
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
response_model=MetadataExtractionResponse, response_model=MetadataExtractionResponse,
@@ -187,7 +158,9 @@ class MetadataExtractor:
if response: if response:
metadata = response.user_metadata if response.user_metadata else None metadata = response.user_metadata if response.user_metadata else None
to_add = response.aliases_to_add if response.aliases_to_add else [] to_add = response.aliases_to_add if response.aliases_to_add else []
to_remove = response.aliases_to_remove if response.aliases_to_remove else [] to_remove = (
response.aliases_to_remove if response.aliases_to_remove else []
)
return metadata, to_add, to_remove return metadata, to_add, to_remove
logger.warning("LLM 返回的响应为空") logger.warning("LLM 返回的响应为空")

View File

@@ -1,6 +1,5 @@
import asyncio import asyncio
import logging import logging
import os
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -82,17 +81,6 @@ class StatementExtractor:
logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty") logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty")
return None return None
@staticmethod
def _replace_first_person_with_user(text: str) -> str:
"""将用户消息中的第一人称代词""替换为"用户"
替换规则:
- 所有独立的""都替换为"用户"(包括"我的""用户的""叫我""叫用户"
- 不替换"我们"中的"""我们"是复数,不指代用户个人)
"""
import re
result = re.sub(r'我(?!们)', '用户', text)
return result
async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]: async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]:
"""Process a single chunk and return extracted statements """Process a single chunk and return extracted statements
@@ -106,18 +94,12 @@ class StatementExtractor:
List of ExtractedStatement objects extracted from the chunk List of ExtractedStatement objects extracted from the chunk
""" """
chunk_content = chunk.content chunk_content = chunk.content
chunk_speaker = self._get_speaker_from_chunk(chunk)
if not chunk_content or len(chunk_content.strip()) < 5: if not chunk_content or len(chunk_content.strip()) < 5:
logger.warning(f"Chunk {chunk.id} content too short or empty, skipping") logger.warning(f"Chunk {chunk.id} content too short or empty, skipping")
return [] return []
# 对 speaker="user" 的 chunk将第一人称"我"替换为"用户"
# 避免 LLM 在陈述句提取时将"我"替换为具体名字(如"齐齐"
# 导致下游元数据/别名提取无法识别第一人称语义。
chunk_speaker = self._get_speaker_from_chunk(chunk)
if chunk_speaker == "user":
chunk_content = self._replace_first_person_with_user(chunk_content)
prompt_content = await render_statement_extraction_prompt( prompt_content = await render_statement_extraction_prompt(
chunk_content=chunk_content, chunk_content=chunk_content,
definitions=LABEL_DEFINITIONS, definitions=LABEL_DEFINITIONS,

View File

@@ -1,45 +0,0 @@
"""
Metadata utility functions for cleaning and validating user metadata.
"""
import logging
from typing import Optional
from app.core.memory.models.metadata_models import UserMetadata
logger = logging.getLogger(__name__)
def clean_metadata(raw: dict) -> dict:
"""
Clean metadata by removing empty string values and empty array fields recursively.
Only keeps fields with actual content. If a nested dict becomes empty after cleaning,
it is removed too.
"""
cleaned = {}
for key, value in raw.items():
if isinstance(value, dict):
nested = clean_metadata(value)
if nested:
cleaned[key] = nested
elif isinstance(value, list):
if len(value) > 0:
cleaned[key] = value
elif isinstance(value, str):
if value != "":
cleaned[key] = value
else:
cleaned[key] = value
return cleaned
def validate_metadata(raw: dict) -> Optional[UserMetadata]:
"""
Validate metadata structure using the Pydantic UserMetadata model.
Returns None and logs a WARNING on validation failure.
"""
try:
return UserMetadata.model_validate(raw)
except Exception as e:
logger.warning("Metadata validation failed: %s", e)
return None

View File

@@ -2969,7 +2969,6 @@ def extract_user_metadata_task(
async def _run() -> Dict[str, Any]: async def _run() -> Dict[str, Any]:
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor
from app.core.memory.utils.metadata_utils import clean_metadata, validate_metadata
from app.repositories.end_user_info_repository import EndUserInfoRepository from app.repositories.end_user_info_repository import EndUserInfoRepository
from app.repositories.end_user_repository import EndUserRepository from app.repositories.end_user_repository import EndUserRepository
from app.services.memory_config_service import MemoryConfigService from app.services.memory_config_service import MemoryConfigService
@@ -3029,6 +3028,14 @@ def extract_user_metadata_task(
logger.info(f"[CELERY METADATA] LLM 别名新增: {aliases_to_add}, 移除: {aliases_to_remove}") logger.info(f"[CELERY METADATA] LLM 别名新增: {aliases_to_add}, 移除: {aliases_to_remove}")
# 4. 清洗元数据、覆盖写入元数据和别名 # 4. 清洗元数据、覆盖写入元数据和别名
def clean_metadata(raw: dict) -> dict:
"""递归移除空字符串、空列表、空字典。"""
return {
k: (cleaned if isinstance(v, dict) and (cleaned := clean_metadata(v)) else v)
for k, v in raw.items()
if not (v == "" or v == [] or (isinstance(v, dict) and not clean_metadata(v)))
}
raw_dict = user_metadata.model_dump(exclude_none=True) if user_metadata else {} raw_dict = user_metadata.model_dump(exclude_none=True) if user_metadata else {}
logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}") logger.info(f"[CELERY METADATA] LLM 输出完整元数据: {json.dumps(raw_dict, ensure_ascii=False)}")