refactor(memory): consolidate write pipeline and rename statement extraction step

- Rename StatementExtractionStep → StatementTemporalExtractionStep and
  extract_statement.jinja2 → extract_statement_temporal.jinja2 to reflect
  merged temporal extraction logic
- Move extraction_pipeline_orchestrator.py out of steps/ to engine root
- Move dedup_step.py into steps/ directory
- Introduce WriteMemoryRequest schema to replace positional args in write_memory()
- Extract _resolve_and_load_config, _preprocess_files, _write_neo4j, and
  _invalidate_interest_cache as private helpers in MemoryAgentService
- Remove shadow pipeline and simplify NEW_PIPELINE_ENABLED branch
- Merge 类型归属/成员隶属/任职服务 relation types into single 归属身份关系 in triplet prompt
- Add alias merge logic (别名属于) in deduplication and MERGE_ALIAS_BELONGS_TO Cypher query
- Add StorageType, Language, MessageItem enums/models to memory_agent_schema
- Reduce AgentMemory_Long_Term.DEFAULT_SCOPE from 6 to 1
- Delete standalone extract_temporal.jinja2 (logic merged into statement step)
This commit is contained in:
lanceyq
2026-04-28 16:36:30 +08:00
parent 7747ed7ac1
commit 1f0c88a5f0
17 changed files with 390 additions and 326 deletions

View File

@@ -36,13 +36,14 @@ from app.core.memory.agent.utils.messages_tools import (
from app.core.memory.agent.utils.type_classifier import status_typle
from app.core.memory.agent.utils.write_tools import write as write_neo4j
from app.core.memory.analytics.hot_memory_tags import get_interest_distribution
from app.core.memory.memory_service import MemoryService
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.core.memory.utils.log.audit_logger import audit_logger
from app.db import get_db_context
from app.models.knowledge_model import Knowledge, KnowledgeType
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.schemas import FileInput
from app.schemas.memory_agent_schema import Write_UserInput
from app.schemas.memory_agent_schema import Language, MessageItem, StorageType, Write_UserInput, WriteMemoryRequest
from app.schemas.memory_config_schema import ConfigurationError
from app.services.memory_config_service import MemoryConfigService
from app.services.memory_konwledges_server import (
@@ -267,25 +268,15 @@ class MemoryAgentService:
async def write_memory(
self,
end_user_id: str,
messages: list[dict],
config_id: Optional[uuid.UUID] | int,
request: WriteMemoryRequest,
db: Session,
storage_type: str,
user_rag_memory_id: str,
language: str = "zh"
) -> str:
"""
Process write operation with config_id
长期记忆写入
Args:
end_user_id: Group identifier (also used as end_user_id)
messages: Message to write
config_id: Configuration ID from database
request: 写入请求参数end_user_id、messages、config_id、storage_type、language 等)
db: SQLAlchemy database session
storage_type: Storage type (neo4j or rag)
user_rag_memory_id: User RAG memory ID
language: 语言类型 ("zh" 中文, "en" 英文)
Returns:
Write operation result status
@@ -293,147 +284,50 @@ class MemoryAgentService:
Raises:
ValueError: If config loading fails or write operation fails
"""
# Resolve config_id and workspace_id
# Always get workspace_id from end_user for fallback, even if config_id is provided
workspace_id = None
try:
connected_config = get_end_user_connected_config(end_user_id, db)
workspace_id = connected_config.get("workspace_id")
if config_id is None:
config_id = connected_config.get("memory_config_id")
logger.info(f"Resolved config from end_user: config_id={config_id}, workspace_id={workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(f"No memory configuration found for end_user {end_user_id}. "
f"Please ensure the user has a connected memory configuration.")
except Exception as e:
if "No memory configuration found" in str(e):
raise # Re-raise our specific error
logger.error(f"Failed to get connected config for end_user {end_user_id}: {e}")
if config_id is None:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")
# If config_id was provided, continue without workspace_id fallback
import time
end_user_id = request.end_user_id
messages = request.messages
config_id = request.config_id
storage_type = request.storage_type
user_rag_memory_id = request.user_rag_memory_id
language = request.language
start_time = time.time()
# Load configuration from database with workspace fallback
# Use a separate database session to avoid transaction failures
# ── Step 1: 解析配置 ── 通过 end_user_id 查找关联的 config_id / workspace_id并从数据库加载完整 memory_config
memory_config = await self._resolve_and_load_config(
end_user_id, config_id, db, start_time
)
# ── Step 2: 文件预处理 ── 将消息中附带的文件转换为感知记忆对象,挂载到 message["file_content"]
messages = await self._preprocess_files(messages, end_user_id, memory_config, db)
message_text = "\n".join([
f"{(msg['role'] if isinstance(msg, dict) else msg.role)}: {(msg['content'] if isinstance(msg, dict) else msg.content)}"
for msg in messages
])
# ── Step 3: 写入存储 ── 根据 storage_type 分流到 RAG 或 Neo4j 流水线
try:
from app.db import get_db_context
with get_db_context() as config_db:
config_service = MemoryConfigService(config_db)
memory_config = config_service.load_memory_config(
config_id=config_id,
workspace_id=workspace_id,
service_name="MemoryAgentService"
)
logger.info(f"Configuration loaded successfully: {memory_config.config_name}")
except ConfigurationError as e:
error_msg = f"Failed to load configuration for config_id: {config_id}: {e}"
logger.error(error_msg)
# Log failed operation
duration = time.time() - start_time
audit_logger.log_operation(operation="WRITE", config_id=config_id, end_user_id=end_user_id,
success=False, duration=duration, error=error_msg)
raise ValueError(error_msg)
perceptual_serivce = MemoryPerceptualService(db)
for message in messages:
message["file_content"] = []
for file in (message.get("files") or []):
file_object = await perceptual_serivce.generate_perceptual_memory(
end_user_id=end_user_id,
memory_config=memory_config,
file=FileInput(**file)
)
if file_object is None:
continue
message["file_content"].append((file_object, file["type"]))
logger.info(messages)
message_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
try:
if storage_type == "rag":
# For RAG storage, convert messages to single string
if storage_type == StorageType.RAG:
await write_rag(end_user_id, message_text, user_rag_memory_id)
return "success"
else:
# TODO 乐力齐 重构流水线切换至生产环境后,更改如下代码
import os
use_new_pipeline = os.getenv("NEW_PIPELINE_ENABLED", "false").lower() == "true"
await self._write_neo4j(end_user_id, messages, memory_config, language)
if use_new_pipeline:
# ── 新流水线WritePipeline + NewExtractionOrchestrator ──
from app.core.memory.memory_service import MemoryService
service = MemoryService(
memory_config=memory_config,
end_user_id=end_user_id,
)
result = await service.write(
messages=messages,
language=language,
ref_id='',
is_pilot_run=False,
)
logger.info(
f"[NewPipeline] 完成: status={result.status}, "
f"elapsed={result.elapsed_seconds:.2f}s, "
f"extraction={result.extraction}"
)
else:
# ── 旧流水线write_tools.write() + ExtractionOrchestrator ──
await write_neo4j(
end_user_id=end_user_id,
messages=messages,
memory_config=memory_config,
ref_id='',
language=language
)
# ── 影子运行:新流水线静默执行,只记录日志不影响主流程 ──
if os.getenv("SHADOW_PIPELINE_ENABLED", "false").lower() == "true":
try:
from app.core.memory.memory_service import MemoryService
import copy
shadow_messages = copy.deepcopy(messages)
shadow_service = MemoryService(
memory_config=memory_config,
end_user_id=end_user_id,
)
shadow_result = await shadow_service.write(
messages=shadow_messages,
language=language,
ref_id='',
is_pilot_run=True,
)
logger.info(
f"[Shadow] 新流水线影子运行完成: status={shadow_result.status}, "
f"elapsed={shadow_result.elapsed_seconds:.2f}s, "
f"extraction={shadow_result.extraction}"
)
except Exception as shadow_err:
logger.warning(f"[Shadow] 新流水线影子运行失败(不影响主流程): {shadow_err}")
# ── 影子运行结束 ──
for lang in ["zh", "en"]:
deleted = await InterestMemoryCache.delete_interest_distribution(
end_user_id, lang
)
if deleted:
logger.info(
f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}")
# ── Step 4: 后处理 ── 失效缓存、序列化文件路径、记录审计日志并返回结果
await self._invalidate_interest_cache(end_user_id)
for message in messages:
message["file_content"] = [
perceptual[0].file_path for perceptual in message["file_content"]
]
if isinstance(message, dict):
message["file_content"] = [
perceptual[0].file_path for perceptual in (message["file_content"] or [])
]
else:
message.file_content = [
perceptual[0].file_path for perceptual in (message.file_content or [])
]
return self.writer_messages_deal(
"success",
start_time,
end_user_id,
config_id,
memory_config.config_id,
message_text,
{
"status": "success",
@@ -443,15 +337,139 @@ class MemoryAgentService:
}
)
except Exception as e:
# Ensure proper error handling and logging
error_msg = f"Write operation failed: {str(e)}"
logger.error(error_msg)
duration = time.time() - start_time
audit_logger.log_operation(operation="WRITE", config_id=config_id, end_user_id=end_user_id,
success=False, duration=duration, error=error_msg)
audit_logger.log_operation(
operation="WRITE",
config_id=memory_config.config_id,
end_user_id=end_user_id,
success=False,
duration=time.time() - start_time,
error=error_msg,
)
raise ValueError(error_msg)
async def _resolve_and_load_config(
self,
end_user_id: str,
config_id: Optional[uuid.UUID] | int,
db: Session,
start_time: float,
):
"""解析 end_user 关联配置并从数据库加载完整 memory_config。"""
workspace_id = None
try:
connected_config = get_end_user_connected_config(end_user_id, db)
workspace_id = connected_config.get("workspace_id")
if config_id is None:
config_id = connected_config.get("memory_config_id")
logger.info(f"Resolved config from end_user: config_id={config_id}, workspace_id={workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(
f"No memory configuration found for end_user {end_user_id}. "
f"Please ensure the user has a connected memory configuration."
)
except Exception as e:
if "No memory configuration found" in str(e):
raise
logger.error(f"Failed to get connected config for end_user {end_user_id}: {e}")
if config_id is None:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")
try:
with get_db_context() as config_db:
memory_config = MemoryConfigService(config_db).load_memory_config(
config_id=config_id,
workspace_id=workspace_id,
service_name="MemoryAgentService",
)
logger.info(f"Configuration loaded successfully: {memory_config.config_name}")
return memory_config
except ConfigurationError as e:
error_msg = f"Failed to load configuration for config_id: {config_id}: {e}"
logger.error(error_msg)
audit_logger.log_operation(
operation="WRITE",
config_id=config_id,
end_user_id=end_user_id,
success=False,
duration=time.time() - start_time,
error=error_msg,
)
raise ValueError(error_msg)
async def _preprocess_files(
self,
messages: list[MessageItem] | list[dict],
end_user_id: str,
memory_config,
db: Session,
) -> list[dict]:
"""处理消息中附带的文件,生成感知记忆对象并挂载到 message['file_content']。"""
perceptual_service = MemoryPerceptualService(db)
for message in messages:
if isinstance(message, dict):
message["file_content"] = []
files = message.get("files") or []
else:
message.file_content = []
files = message.files or []
for file in files:
file_object = await perceptual_service.generate_perceptual_memory(
end_user_id=end_user_id,
memory_config=memory_config,
file=FileInput(**file),
)
if file_object is None:
continue
if isinstance(message, dict):
message["file_content"].append((file_object, file["type"]))
else:
message.file_content.append((file_object, file["type"]))
logger.info(messages)
return messages
async def _write_neo4j(
self,
end_user_id: str,
messages: list[MessageItem] | list[dict],
memory_config,
language: Language | str,
) -> None:
"""根据 NEW_PIPELINE_ENABLED 选择新旧流水线写入 Neo4j。"""
# 统一转换为 dict下游流水线期望 list[dict]
messages_dict = [
msg if isinstance(msg, dict) else msg.model_dump(exclude_none=True)
for msg in messages
]
use_new_pipeline = os.getenv("NEW_PIPELINE_ENABLED", "false").lower() == "true"
if use_new_pipeline:
service = MemoryService(memory_config=memory_config, end_user_id=end_user_id)
result = await service.write(messages=messages_dict, language=language, ref_id='')
logger.info(
f"[NewPipeline] 完成: status={result.status}, "
f"elapsed={result.elapsed_seconds:.2f}s, "
f"extraction={result.extraction}"
)
else:
await write_neo4j(
end_user_id=end_user_id,
messages=messages_dict,
memory_config=memory_config,
ref_id='',
language=language,
)
async def _invalidate_interest_cache(self, end_user_id: str) -> None:
"""写入完成后失效兴趣分布缓存。"""
for lang in ["zh", "en"]:
deleted = await InterestMemoryCache.delete_interest_distribution(end_user_id, lang)
if deleted:
logger.info(
f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}"
)
async def read_memory(
self,
end_user_id: str,