Compare commits
16 commits
main...refactor/w
| SHA1 |
|---|
| d96b9fab20 |
| c27ca5a380 |
| aa9eb66668 |
| e3ab19dd4f |
| d255f33f1f |
| 6419dcd932 |
| 9dc9b7aee7 |
| cf389bb978 |
| d66d601e41 |
| 4af9b02815 |
| 1f0c88a5f0 |
| 7747ed7ac1 |
| 2355536b44 |
| b0ddd12cc6 |
| a98011fc8a |
| 41535c34e6 |
.gitignore (vendored): 3 changed lines
@@ -43,3 +43,6 @@ cl100k_base.tiktoken
 libssl*.deb

+sandbox/lib/seccomp_redbear/target

+# Qoder repowiki generated content
+.qoder/repowiki/zh/
@@ -114,6 +114,15 @@ celery_app.conf.update(
        # Metadata extraction → memory_tasks queue
        'app.tasks.extract_user_metadata': {'queue': 'memory_tasks'},

        # Async emotion extraction → memory_tasks queue (IO-bound LLM calls)
        'app.tasks.extract_emotion_batch': {'queue': 'memory_tasks'},

        # Post-store dedup + alias merge → memory_tasks queue
        'app.tasks.post_store_dedup_and_alias_merge': {'queue': 'memory_tasks'},

        # Async metadata extraction → memory_tasks queue
        'app.tasks.extract_metadata_batch': {'queue': 'memory_tasks'},

        # Document tasks → document_tasks queue (prefork worker)
        'app.core.rag.tasks.parse_document': {'queue': 'document_tasks'},
        'app.core.rag.tasks.sync_knowledge_for_kb': {'queue': 'document_tasks'},
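For orientation, a minimal sketch of the routing pattern this hunk extends. Celery's task_routes maps a task's dotted path to a queue, and a worker then subscribes to one queue. The queue names and task paths mirror the diff; the broker URL and the standalone app object are placeholders, not the project's actual wiring:

from celery import Celery

# Hypothetical minimal app; the real project configures app.celery_app elsewhere.
celery_app = Celery("app", broker="redis://localhost:6379/0")  # placeholder broker URL
celery_app.conf.update(
    task_routes={
        # IO-bound LLM tasks go to a dedicated queue so its worker pool can be sized separately
        'app.tasks.extract_metadata_batch': {'queue': 'memory_tasks'},
        'app.core.rag.tasks.parse_document': {'queue': 'document_tasks'},
    },
)
# A worker then consumes a single queue, e.g.:
#   celery -A app.celery_app worker -Q memory_tasks --loglevel=info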
@@ -2,7 +2,8 @@
 Celery Worker entry point
 Used to start the Celery worker: celery -A app.celery_worker worker --loglevel=info
 """
 from celery.signals import worker_process_init
 # The patch must be applied before importing any module that uses the DashScope SDK
 import app.plugins.dashscope_patch  # noqa: F401

 from app.celery_app import celery_app
 from app.core.logging_config import LoggingConfig, get_logger
@@ -21,7 +21,7 @@ from app.dependencies import cur_workspace_access_guard, get_current_user
 from app.models import ModelApiKey
 from app.models.user_model import User
 from app.repositories import knowledge_repository
-from app.schemas.memory_agent_schema import UserInput, Write_UserInput
+from app.schemas.memory_agent_schema import StorageType, UserInput, Write_UserInput, WriteMemoryRequest
 from app.schemas.response_schema import ApiResponse
 from app.services import task_service, workspace_service
 from app.services.memory_agent_service import MemoryAgentService
@@ -862,7 +862,7 @@ async def get_end_user_connected_config(
         Response containing memory_config_id and related information
     """

-    api_logger.info(f"Getting connected config for end_user: {end_user_id}")
+    api_logger.info(f"Getting connected config for end_user_id: {end_user_id}")

     try:
         result = get_config(end_user_id, db)
@@ -301,11 +301,11 @@ class Settings:
    # Maximum number of types in the prompt
    MAX_ONTOLOGY_TYPES_IN_PROMPT: int = int(os.getenv("MAX_ONTOLOGY_TYPES_IN_PROMPT", "50"))

-   # Core general type list (comma-separated)
+   # Core general type list (comma-separated): the 13 classes kept in sync with the Entity Ontology in ontology.md
    CORE_GENERAL_TYPES: str = os.getenv(
        "CORE_GENERAL_TYPES",
-       "Person,Organization,Company,GovernmentAgency,Place,Location,City,Country,Building,"
-       "Event,SportsEvent,SocialEvent,Work,Book,Film,Software,Concept,TopicalConcept,AcademicSubject"
+       "人物,组织,群体,角色职业,地点设施,物品设备,软件平台,识别联系信息,"
+       "文档媒体,知识能力,偏好习惯,具体目标,称呼别名"
    )

    # Experimental-mode switch (allows switching the ontology config dynamically via the API)
@@ -46,6 +46,10 @@ def validate_language(language: Optional[str]) -> str:
     if language is None:
         return DEFAULT_LANGUAGE

+    # Handle enum values: prefer .value, so str(Language.ZH) never yields "Language.ZH"
+    if hasattr(language, "value"):
+        language = language.value
+
     # Normalize: lowercase and strip whitespace
     lang = str(language).lower().strip()
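The guard added above matters because str() on a plain Enum member returns its qualified name, not its value. A small self-contained demonstration; the Language enum here is a stand-in for the project's own:

from enum import Enum

class Language(Enum):  # stand-in for the project's enum
    ZH = "zh"
    EN = "en"

print(str(Language.ZH))           # "Language.ZH" -- would never match "zh"
lang = Language.ZH
if hasattr(lang, "value"):        # the added guard
    lang = lang.value
print(str(lang).lower().strip())  # "zh"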
@@ -130,6 +130,10 @@ class LoggingConfig:
         for neo4j_logger_name in ["neo4j", "neo4j.io", "neo4j.pool", "neo4j.notifications"]:
             neo4j_logger = logging.getLogger(neo4j_logger_name)
             neo4j_logger.addFilter(neo4j_filter)

+        # Silence request-level logs from httpx / httpcore (lots of "HTTP Request: POST ..." noise)
+        for noisy_logger in ["httpx", "httpcore", "httpcore.http11", "httpcore.connection"]:
+            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
+
         # Create the formatter
         formatter = logging.Formatter(
@@ -1,67 +0,0 @@
-from app.cache.memory.interest_memory import InterestMemoryCache
-from app.core.memory.agent.utils.llm_tools import WriteState
-from app.core.memory.agent.utils.write_tools import write
-from app.core.logging_config import get_agent_logger
-
-logger = get_agent_logger(__name__)
-
-
-async def write_node(state: WriteState) -> WriteState:
-    """
-    Write data to the database/file system.
-
-    Args:
-        state: WriteState containing messages, end_user_id, memory_config, and language
-
-    Returns:
-        dict: Contains 'write_result' with status and data fields
-    """
-    messages = state.get('messages', [])
-    end_user_id = state.get('end_user_id', '')
-    memory_config = state.get('memory_config', '')
-    language = state.get('language', 'zh')  # defaults to Chinese
-
-    # Convert LangChain messages to structured format expected by write()
-    structured_messages = []
-    for msg in messages:
-        if hasattr(msg, 'type') and hasattr(msg, 'content'):
-            # Map LangChain message types to role names
-            role = 'user' if msg.type == 'human' else 'assistant' if msg.type == 'ai' else msg.type
-            structured_messages.append({
-                "role": role,
-                "content": msg.content  # content is now guaranteed to be a string
-            })
-
-    try:
-        result = await write(
-            messages=structured_messages,
-            end_user_id=end_user_id,
-            memory_config=memory_config,
-            language=language,
-        )
-        logger.info(f"Write completed successfully! Config: {memory_config.config_name}")
-
-        # After a successful Neo4j write, drop this user's interest-distribution cache so the next request regenerates it
-        for lang in ["zh", "en"]:
-            deleted = await InterestMemoryCache.delete_interest_distribution(
-                end_user_id=end_user_id,
-                language=lang,
-            )
-            if deleted:
-                logger.info(f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}")
-
-        write_result = {
-            "status": "success",
-            "data": structured_messages,
-            "config_id": memory_config.config_id,
-            "config_name": memory_config.config_name,
-        }
-        return {"write_result": write_result}
-
-    except Exception as e:
-        logger.error(f"Data_write failed: {e}", exc_info=True)
-        write_result = {
-            "status": "error",
-            "message": str(e),
-        }
-        return {"write_result": write_result}
@@ -135,16 +135,17 @@ async def term_memory_save(end_user_id, strategy_type, scope):
         chunk_data = data[:scope]
         if len(chunk_data) == scope:
             repo.upsert(end_user_id, chunk_data)
-            logger.info(f'---------写入短长期-----------')
+            logger.info('---------写入短长期-----------')
     else:
         long_time_data = write_store.find_user_recent_sessions(end_user_id, 5)
         long_messages = await messages_parse(long_time_data)
         repo.upsert(end_user_id, long_messages)
-        logger.info(f'写入短长期:')
+        logger.info('写入短长期:')


 async def window_dialogue(end_user_id, langchain_messages, memory_config, scope):
     """
     TODO: consider turning this into the sliding-window write function
     Process dialogue based on window size and write to Neo4j

     Manages conversation data based on a sliding window approach. When the window
@@ -252,7 +252,7 @@ def create_hybrid_retrieval_tool_async(memory_config, **search_params):
     # TODO: fact_summary functionality temporarily disabled, will be enabled after future development
     fields_to_remove = {
         'invalid_at', 'valid_at', 'chunk_id_from_rel', 'entity_ids',
-        'expired_at', 'created_at', 'chunk_id', 'apply_id',
+        'created_at', 'chunk_id', 'apply_id',
         'user_id', 'statement_ids', 'updated_at', "chunk_ids", "fact_summary"
     }
     # Note: the 'id' field is kept; expanding a community needs the community id to query its member statements
@@ -40,8 +40,20 @@ async def long_term_storage(
     # Acquire a database session
     with get_db_context() as db_session:
         config_service = MemoryConfigService(db_session)
+        # Resolve workspace_id via end_user_id so logging and the fallback logic stay intact
+        from app.services.memory_agent_service import get_end_user_connected_config
+        import uuid as _uuid
+        workspace_id = None
+        try:
+            connected = get_end_user_connected_config(end_user_id, db_session)
+            raw = connected.get("workspace_id")
+            if raw and raw != "None":
+                workspace_id = _uuid.UUID(str(raw))
+        except Exception:
+            pass
         memory_config = config_service.load_memory_config(
-            config_id=memory_config_id,  # changed to an integer
+            config_id=memory_config_id,
+            workspace_id=workspace_id,
             service_name="MemoryAgentService"
         )
         if long_term_type == AgentMemory_Long_Term.STRATEGY_CHUNK:
@@ -15,7 +15,7 @@ class ParameterBuilder:

     def __init__(self):
         """Initialize the parameter builder."""
-        logger.info("ParameterBuilder initialized")
+        logger.debug("ParameterBuilder initialized")

     def build_tool_args(
         self,
@@ -16,7 +16,7 @@ logger = get_agent_logger(__name__)
 # Fields to filter out of expanded results (they contain Neo4j DateTime values that are not JSON-serializable)
 _EXPAND_FIELDS_TO_REMOVE = {
     'invalid_at', 'valid_at', 'chunk_id_from_rel', 'entity_ids',
-    'expired_at', 'created_at', 'chunk_id', 'apply_id',
+    'created_at', 'chunk_id', 'apply_id',
     'user_id', 'statement_ids', 'updated_at', 'chunk_ids', 'fact_summary'
 }
@@ -86,7 +86,7 @@ class SearchService:

     def __init__(self):
         """Initialize the search service."""
-        logger.info("SearchService initialized")
+        logger.debug("SearchService initialized")

     def extract_content_from_result(self, result: dict, node_type: str = "") -> str:
         """
@@ -24,7 +24,7 @@ class SessionService:
             store: Redis session store instance
         """
         self.store = store
-        logger.info("SessionService initialized")
+        logger.debug("SessionService initialized")

     def resolve_user_id(self, session_string: str) -> str:
         """
@@ -51,7 +51,7 @@ class TemplateService:
             loader=FileSystemLoader(template_root),
             autoescape=False  # Disable autoescape for prompt templates
         )
-        logger.info(f"TemplateService initialized with root: {template_root}")
+        logger.debug(f"TemplateService initialized with root: {template_root}")

     @lru_cache(maxsize=128)
     def _load_template(self, template_name: str) -> Template:
@@ -1,7 +1,4 @@
-import os
-import json
 from typing import List
-from datetime import datetime

 from app.core.memory.storage_services.extraction_engine.knowledge_extraction.chunk_extraction import DialogueChunker
 from app.core.memory.models.message_models import DialogData, ConversationContext, ConversationMessage
@@ -12,16 +9,19 @@ async def get_chunked_dialogs(
     end_user_id: str = "group_1",
     messages: list = None,
     ref_id: str = "",
-    config_id: str = None
+    config_id: str = None,
+    workspace_id=None,
+    snapshot=None,
 ) -> List[DialogData]:
     """Generate chunks from structured messages using the specified chunker strategy.

     Args:
         chunker_strategy: The chunking strategy to use (default: RecursiveChunker)
         end_user_id: Group identifier
-        messages: Structured message list [{"role": "user", "content": "..."}, ...]
+        messages: Structured message list [{"role": "user", "content": "...", "dialog_at": "..."}]
         ref_id: Reference identifier
         config_id: Configuration ID for processing (used to load pruning config)
+        snapshot: Optional PipelineSnapshot instance for saving pruning output

     Returns:
         List of DialogData objects with generated chunks
@@ -34,6 +34,7 @@ async def get_chunked_dialogs(

     conversation_messages = []

+    # step1: validate the message format (role must be user/assistant; content required)
     for idx, msg in enumerate(messages):
         if not isinstance(msg, dict) or 'role' not in msg or 'content' not in msg:
             raise ValueError(f"Message {idx} format error: must contain 'role' and 'content' fields")
@@ -46,7 +47,12 @@ async def get_chunked_dialogs(
             raise ValueError(f"Message {idx} role must be 'user' or 'assistant', got: {role}")

         if content.strip():
-            conversation_messages.append(ConversationMessage(role=role, msg=content.strip(), files=files))
+            conversation_messages.append(ConversationMessage(
+                role=role,
+                msg=content.strip(),
+                dialog_at=msg.get("dialog_at"),
+                files=files,
+            ))

     if not conversation_messages:
         raise ValueError("Message list cannot be empty after filtering")
@@ -56,10 +62,10 @@ async def get_chunked_dialogs(
         context=conversation_context,
         ref_id=ref_id,
         end_user_id=end_user_id,
-        config_id=config_id
+        config_id=config_id,
     )

-    # Semantic-pruning step (before chunking)
+    # step2: semantic-pruning step (before chunking)
     try:
         from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_pruning import SemanticPruner
         from app.core.memory.models.config_models import PruningConfig
@@ -76,6 +82,7 @@ async def get_chunked_dialogs(
             config_service = MemoryConfigService(db)
             memory_config = config_service.load_memory_config(
                 config_id=config_id,
+                workspace_id=workspace_id,
                 service_name="semantic_pruning"
             )
@@ -95,7 +102,7 @@ async def get_chunked_dialogs(
         llm_client = factory.get_llm_client_from_config(memory_config)

         # Run pruning: prune_dataset supports message-level pruning
-        pruner = SemanticPruner(config=pruning_config, llm_client=llm_client)
+        pruner = SemanticPruner(config=pruning_config, llm_client=llm_client, snapshot=snapshot)
         original_msg_count = len(dialog_data.context.msgs)

         # Use prune_dataset instead of prune_dialog
@@ -107,6 +114,13 @@ async def get_chunked_dialogs(
             remaining_msg_count = len(dialog_data.context.msgs)
             deleted_count = original_msg_count - remaining_msg_count
             logger.info(f"[剪枝] 完成: 原始{original_msg_count}条 -> 保留{remaining_msg_count}条 (删除{deleted_count}条)")
+
+            # Attach the pruning records to metadata so graph_build_step can build nodes from them
+            if pruner.pruning_records:
+                dialog_data.metadata["assistant_pruning_records"] = [
+                    r.model_dump() for r in pruner.pruning_records
+                ]
+                logger.info(f"[剪枝] 收集到 {len(pruner.pruning_records)} 条剪枝记录")
         else:
             logger.warning("[剪枝] prune_dataset 返回空列表")
     else:
@@ -116,6 +130,7 @@ async def get_chunked_dialogs(
     except Exception as e:
         logger.warning(f"[剪枝] 执行失败,跳过剪枝: {e}", exc_info=True)

+    # step3: chunking
     chunker = DialogueChunker(chunker_strategy)
     extracted_chunks = await chunker.process_dialogue(dialog_data)
     dialog_data.chunks = extracted_chunks
@@ -1,333 +0,0 @@
-"""
-Write Tools for Memory Knowledge Extraction Pipeline
-
-This module provides the main write function for executing the knowledge extraction
-pipeline. Only MemoryConfig is needed - clients are constructed internally.
-"""
-import asyncio
-import time
-import uuid
-from datetime import datetime
-from typing import List, Optional
-
-from dotenv import load_dotenv
-
-from app.core.logging_config import get_agent_logger
-from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs
-from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import _USER_PLACEHOLDER_NAMES
-from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import ExtractionOrchestrator
-from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import \
-    memory_summary_generation
-from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
-from app.core.memory.utils.log.logging_utils import log_time
-from app.db import get_db_context
-from app.repositories.neo4j.add_edges import add_memory_summary_statement_edges
-from app.repositories.neo4j.add_nodes import add_memory_summary_nodes
-from app.repositories.neo4j.graph_saver import save_dialog_and_statements_to_neo4j
-from app.repositories.neo4j.neo4j_connector import Neo4jConnector
-from app.schemas.memory_config_schema import MemoryConfig
-
-load_dotenv()
-
-logger = get_agent_logger(__name__)
-
-
-async def write(
-    end_user_id: str,
-    memory_config: MemoryConfig,
-    messages: list,
-    ref_id: str = "",
-    language: str = "zh",
-) -> None:
-    """
-    Execute the complete knowledge extraction pipeline.
-
-    Args:
-        end_user_id: Group identifier
-        memory_config: MemoryConfig object containing all configuration
-        messages: Structured message list [{"role": "user", "content": "..."}, ...]
-        ref_id: Reference ID, defaults to ""
-        language: language ("zh" Chinese, "en" English); defaults to Chinese
-    """
-    if not ref_id:
-        ref_id = uuid.uuid4().hex
-    # Extract config values
-    embedding_model_id = str(memory_config.embedding_model_id)
-    chunker_strategy = memory_config.chunker_strategy
-    config_id = str(memory_config.config_id)
-
-    logger.info("=== MemSci Knowledge Extraction Pipeline ===")
-    logger.info(f"Config: {memory_config.config_name} (ID: {config_id})")
-    logger.info(f"Workspace: {memory_config.workspace_name}")
-    logger.info(f"LLM model: {memory_config.llm_model_name}")
-    logger.info(f"Embedding model: {memory_config.embedding_model_name}")
-    logger.info(f"Chunker strategy: {chunker_strategy}")
-    logger.info(f"end_user_id ID: {end_user_id}")
-
-    # Construct clients from memory_config using factory pattern with db session
-    with get_db_context() as db:
-        factory = MemoryClientFactory(db)
-        llm_client = factory.get_llm_client_from_config(memory_config)
-        embedder_client = factory.get_embedder_client_from_config(memory_config)
-    logger.info("LLM and embedding clients constructed")
-
-    # Initialize timing log
-    log_file = "logs/time.log"
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    with open(log_file, "a", encoding="utf-8") as f:
-        f.write(f"\n=== Pipeline Run Started: {timestamp} ===\n")
-        f.write(f"Config: {memory_config.config_name} (ID: {config_id})\n")
-
-    pipeline_start = time.time()
-
-    # Initialize Neo4j connector
-    neo4j_connector = Neo4jConnector()
-
-    # Step 1: Load and chunk data
-    step_start = time.time()
-    chunked_dialogs = await get_chunked_dialogs(
-        chunker_strategy=chunker_strategy,
-        end_user_id=end_user_id,
-        messages=messages,
-        ref_id=ref_id,
-        config_id=config_id,
-    )
-    log_time("Data Loading & Chunking", time.time() - step_start, log_file)
-
-    # Step 2: Initialize and run ExtractionOrchestrator
-    step_start = time.time()
-    from app.core.memory.utils.config.config_utils import get_pipeline_config
-    pipeline_config = get_pipeline_config(memory_config)
-
-    # Fetch ontology types if scene_id is configured
-    ontology_types = None
-    if memory_config.scene_id:
-        try:
-            from app.core.memory.ontology_services.ontology_type_loader import load_ontology_types_for_scene
-
-            with get_db_context() as db:
-                ontology_types = load_ontology_types_for_scene(
-                    scene_id=memory_config.scene_id,
-                    workspace_id=memory_config.workspace_id,
-                    db=db
-                )
-
-            if ontology_types:
-                logger.info(
-                    f"Loaded {len(ontology_types.types)} ontology types for scene_id: {memory_config.scene_id}"
-                )
-            else:
-                logger.info(f"No ontology classes found for scene_id: {memory_config.scene_id}")
-        except Exception as e:
-            logger.warning(
-                f"Failed to fetch ontology types for scene_id {memory_config.scene_id}: {e}",
-                exc_info=True
-            )
-
-    orchestrator = ExtractionOrchestrator(
-        llm_client=llm_client,
-        embedder_client=embedder_client,
-        connector=neo4j_connector,
-        config=pipeline_config,
-        embedding_id=embedding_model_id,
-        language=language,
-        ontology_types=ontology_types,
-    )
-
-    # Run the complete extraction pipeline
-    (
-        all_dialogue_nodes,
-        all_chunk_nodes,
-        all_statement_nodes,
-        all_entity_nodes,
-        all_perceptual_nodes,
-        all_statement_chunk_edges,
-        all_statement_entity_edges,
-        all_entity_entity_edges,
-        all_perceptual_edges,
-        all_dedup_details,
-    ) = await orchestrator.run(chunked_dialogs, is_pilot_run=False)
-
-    log_time("Extraction Pipeline", time.time() - step_start, log_file)
-
-    # Step 3: Save all data to Neo4j database
-    step_start = time.time()
-
-    # Before writing to Neo4j: clean alias cross-contamination between user and AI-assistant entities.
-    # Query the AI assistant's existing aliases from Neo4j and merge them with this round's,
-    # so that user-entity aliases never contain the assistant's names.
-    try:
-        from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
-            clean_cross_role_aliases,
-            fetch_neo4j_assistant_aliases,
-        )
-        neo4j_assistant_aliases = set()
-        if all_entity_nodes:
-            _eu_id = all_entity_nodes[0].end_user_id
-            if _eu_id:
-                neo4j_assistant_aliases = await fetch_neo4j_assistant_aliases(neo4j_connector, _eu_id)
-        clean_cross_role_aliases(all_entity_nodes, external_assistant_aliases=neo4j_assistant_aliases)
-        logger.info(f"Neo4j 写入前别名清洗完成,AI助手别名排除集大小: {len(neo4j_assistant_aliases)}")
-    except Exception as e:
-        logger.warning(f"Neo4j 写入前别名清洗失败(不影响主流程): {e}")
-
-    # Deadlock retry mechanism
-    max_retries = 3
-    retry_delay = 1  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            success = await save_dialog_and_statements_to_neo4j(
-                dialogue_nodes=all_dialogue_nodes,
-                chunk_nodes=all_chunk_nodes,
-                statement_nodes=all_statement_nodes,
-                entity_nodes=all_entity_nodes,
-                perceptual_nodes=all_perceptual_nodes,
-                statement_chunk_edges=all_statement_chunk_edges,
-                statement_entity_edges=all_statement_entity_edges,
-                entity_edges=all_entity_entity_edges,
-                perceptual_edges=all_perceptual_edges,
-                connector=neo4j_connector,
-            )
-            if success:
-                logger.info("Successfully saved all data to Neo4j")
-
-                if all_entity_nodes:
-                    end_user_id = all_entity_nodes[0].end_user_id
-
-                # After the Neo4j write completes, overwrite Neo4j user entities with the authoritative PgSQL aliases
-                try:
-                    from app.repositories.end_user_info_repository import EndUserInfoRepository
-                    if end_user_id:
-                        with get_db_context() as db_session:
-                            info = EndUserInfoRepository(db_session).get_by_end_user_id(uuid.UUID(end_user_id))
-                            pg_aliases = info.aliases if info and info.aliases else []
-                            if info is not None:
-                                # Pass the Python-side placeholder-name set as a parameter to avoid hard-coding it in Cypher
-                                placeholder_names = list(_USER_PLACEHOLDER_NAMES)
-                                await neo4j_connector.execute_query(
-                                    """
-                                    MATCH (e:ExtractedEntity)
-                                    WHERE e.end_user_id = $end_user_id AND toLower(e.name) IN $placeholder_names
-                                    SET e.aliases = $aliases
-                                    """,
-                                    end_user_id=end_user_id, aliases=pg_aliases,
-                                    placeholder_names=placeholder_names,
-                                )
-                                logger.info(f"[AliasSync] Neo4j 用户实体 aliases 已用 PgSQL 权威源覆盖: {pg_aliases}")
-                except Exception as sync_err:
-                    logger.warning(f"[AliasSync] PgSQL→Neo4j aliases 同步失败(不影响主流程): {sync_err}")
-
-                # Trigger clustering via an async Celery task (does not block the main flow)
-                try:
-                    from app.tasks import run_incremental_clustering
-
-                    new_entity_ids = [e.id for e in all_entity_nodes]
-                    task = run_incremental_clustering.apply_async(
-                        kwargs={
-                            "end_user_id": end_user_id,
-                            "new_entity_ids": new_entity_ids,
-                            "llm_model_id": str(memory_config.llm_model_id) if memory_config.llm_model_id else None,
-                            "embedding_model_id": str(memory_config.embedding_model_id) if memory_config.embedding_model_id else None,
-                        },
-                        priority=3,
-                    )
-                    logger.info(
-                        f"[Clustering] 增量聚类任务已提交到 Celery - "
-                        f"task_id={task.id}, end_user_id={end_user_id}, entity_count={len(new_entity_ids)}"
-                    )
-                except Exception as e:
-                    logger.error(f"[Clustering] 提交聚类任务失败(不影响主流程): {e}", exc_info=True)
-
-                break
-            else:
-                logger.warning("Failed to save some data to Neo4j")
-                if attempt < max_retries - 1:
-                    logger.info(f"Retrying... (attempt {attempt + 2}/{max_retries})")
-                    await asyncio.sleep(retry_delay * (attempt + 1))  # back off longer on each attempt
-        except Exception as e:
-            error_msg = str(e)
-            # Check whether this is a deadlock error
-            if "DeadlockDetected" in error_msg or "deadlock" in error_msg.lower():
-                if attempt < max_retries - 1:
-                    logger.warning(f"Deadlock detected, retrying... (attempt {attempt + 2}/{max_retries})")
-                    await asyncio.sleep(retry_delay * (attempt + 1))  # back off longer on each attempt
-                else:
-                    logger.error(f"Failed after {max_retries} attempts due to deadlock: {e}")
-                    raise
-            else:
-                # Non-deadlock error: re-raise immediately
-                raise
-
-    try:
-        await neo4j_connector.close()
-    except Exception as e:
-        logger.error(f"Error closing Neo4j connector: {e}")
-
-    log_time("Neo4j Database Save", time.time() - step_start, log_file)
-
-    # Step 4: Generate Memory summaries and save to Neo4j
-    step_start = time.time()
-    try:
-        summaries = await memory_summary_generation(
-            chunked_dialogs, llm_client=llm_client, embedder_client=embedder_client, language=language
-        )
-        ms_connector = Neo4jConnector()
-        try:
-            await add_memory_summary_nodes(summaries, ms_connector)
-            await add_memory_summary_statement_edges(summaries, ms_connector)
-        finally:
-            try:
-                await ms_connector.close()
-            except Exception:
-                pass
-    except Exception as e:
-        logger.error(f"Memory summary step failed: {e}", exc_info=True)
-    finally:
-        log_time("Memory Summary (Neo4j)", time.time() - step_start, log_file)
-
-    # Log total pipeline time
-    total_time = time.time() - pipeline_start
-    log_time("TOTAL PIPELINE TIME", total_time, log_file)
-
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    with open(log_file, "a", encoding="utf-8") as f:
-        f.write(f"=== Pipeline Run Completed: {timestamp} ===\n\n")
-
-    # Write extraction statistics to Redis, stored per workspace_id
-    try:
-        from app.cache.memory.activity_stats_cache import ActivityStatsCache
-
-        stats_to_cache = {
-            "chunk_count": len(all_chunk_nodes) if all_chunk_nodes else 0,
-            "statements_count": len(all_statement_nodes) if all_statement_nodes else 0,
-            "triplet_entities_count": len(all_entity_nodes) if all_entity_nodes else 0,
-            "triplet_relations_count": len(all_entity_entity_edges) if all_entity_entity_edges else 0,
-            "temporal_count": 0,
-        }
-        await ActivityStatsCache.set_activity_stats(
-            workspace_id=str(memory_config.workspace_id),
-            stats=stats_to_cache,
-        )
-        logger.info(f"[WRITE] 活动统计已写入 Redis: workspace_id={memory_config.workspace_id}")
-    except Exception as cache_err:
-        logger.warning(f"[WRITE] 写入活动统计缓存失败(不影响主流程): {cache_err}", exc_info=True)
-
-    # Close LLM/Embedder underlying httpx clients to prevent
-    # 'RuntimeError: Event loop is closed' during garbage collection
-    for client_obj in (llm_client, embedder_client):
-        try:
-            underlying = getattr(client_obj, 'client', None) or getattr(client_obj, 'model', None)
-            if underlying is None:
-                continue
-            # Unwrap RedBearLLM / RedBearEmbeddings to get the LangChain model
-            inner = getattr(underlying, '_model', underlying)
-            # LangChain OpenAI models expose async_client (httpx.AsyncClient)
-            http_client = getattr(inner, 'async_client', None)
-            if http_client is not None and hasattr(http_client, 'aclose'):
-                await http_client.aclose()
-        except Exception:
-            pass
-
-    logger.info("=== Pipeline Complete ===")
-    logger.info(f"Total execution time: {total_time:.2f} seconds")
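The deleted module's save loop illustrates a retry-on-deadlock pattern worth keeping in mind: retry only when the error looks like a Neo4j deadlock, wait a little longer on each attempt, and re-raise everything else immediately. A generic, self-contained sketch of that pattern under the same assumptions (the save callable is a stand-in, and the backoff grows linearly as in the original code):

import asyncio

async def save_with_retry(save, max_retries: int = 3, retry_delay: float = 1.0):
    """Retry an async save on deadlock errors; re-raise anything else immediately."""
    for attempt in range(max_retries):
        try:
            return await save()
        except Exception as e:
            is_deadlock = "deadlock" in str(e).lower()
            if is_deadlock and attempt < max_retries - 1:
                # wait a little longer on every attempt (1s, 2s, ...)
                await asyncio.sleep(retry_delay * (attempt + 1))
            else:
                raise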
@@ -64,7 +64,7 @@ class ImplicitMemoryLLMClient:
         self.default_model_id = default_model_id
         self._client_factory = MemoryClientFactory(db)

-        logger.info("ImplicitMemoryLLMClient initialized")
+        logger.debug("ImplicitMemoryLLMClient initialized")

     def _get_llm_client(self, model_id: Optional[str] = None):
         """Get LLM client instance.
@@ -242,6 +242,7 @@ class ChunkerClient:
                 chunk = Chunk(
                     content=f"{msg.role}: {sub_chunk_text}",
                     speaker=msg.role,  # inherit the role directly
+                    dialog_at=getattr(msg, "dialog_at", None),
                     metadata={
                         "message_index": msg_idx,
                         "message_role": msg.role,
@@ -257,6 +258,7 @@ class ChunkerClient:
             chunk = Chunk(
                 content=f"{msg.role}: {msg_content}",
                 speaker=msg.role,  # inherit the role directly
+                dialog_at=getattr(msg, "dialog_at", None),
                 metadata={
                     "message_index": msg_idx,
                     "message_role": msg.role,
@@ -1,58 +1,143 @@
-from sqlalchemy.orm import Session
-
-from app.core.memory.enums import StorageType, SearchStrategy
-from app.core.memory.models.service_models import MemoryContext, MemorySearchResult
-from app.core.memory.pipelines.memory_read import ReadPipeLine
-from app.db import get_db_context
-from app.services.memory_config_service import MemoryConfigService
-
-
-class MemoryService:
-    def __init__(
-        self,
-        db: Session,
-        config_id: str | None,
-        end_user_id: str,
-        workspace_id: str | None = None,
-        storage_type: str = "neo4j",
-        user_rag_memory_id: str | None = None,
-        language: str = "zh",
-    ):
-        config_service = MemoryConfigService(db)
-        memory_config = None
-        if config_id is not None:
-            memory_config = config_service.load_memory_config(
-                config_id=config_id,
-                workspace_id=workspace_id,
-                service_name="MemoryService",
-            )
-        if memory_config is None and storage_type.lower() == "neo4j":
-            raise RuntimeError("Memory configuration for unspecified users")
-        self.ctx = MemoryContext(
-            end_user_id=end_user_id,
-            memory_config=memory_config,
-            storage_type=StorageType(storage_type),
-            user_rag_memory_id=user_rag_memory_id,
-        )
-
-    async def write(self, messages: list[dict]) -> str:
-        raise NotImplementedError
-
-    async def read(
-        self,
-        query: str,
-        search_switch: SearchStrategy,
-        limit: int = 10,
-    ) -> MemorySearchResult:
-        with get_db_context() as db:
-            return await ReadPipeLine(self.ctx, db).run(query, search_switch, limit)
-
-    async def forget(self, max_batch: int = 100, min_days: int = 30) -> dict:
-        raise NotImplementedError
-
-    async def reflect(self) -> dict:
-        raise NotImplementedError
-
-    async def cluster(self, new_entity_ids: list[str] = None) -> None:
-        raise NotImplementedError
+"""
+MemoryService: unified entry point (Facade) for the memory module.
+
+All external callers (controllers, Celery tasks, API service) depend only on this class.
+
+Responsibilities:
+- Receive an already-loaded MemoryConfig and select/invoke the matching Pipeline
+- Contain no business-logic implementation
+- Never operate on the database or the LLM directly
+
+Dependency direction: external callers → MemoryService → Pipeline → Engine → Repository
+"""
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional
+
+if TYPE_CHECKING:
+    from app.core.memory.pipelines.pilot_write_pipeline import PilotWriteResult
+    from app.core.memory.pipelines.write_pipeline import WriteResult
+    from app.core.memory.models.message_models import DialogData
+    from app.schemas.memory_config_schema import MemoryConfig
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryService:
+    """Unified entry point of the memory module.
+
+    All external callers (controllers, Celery tasks, API service) depend only on this class.
+
+    Design decisions:
+    - __init__ receives an already-loaded MemoryConfig (not a config_id);
+      loading the config stays with the caller (MemoryAgentService),
+      because the caller needs the config for other work (e.g. perceptual-memory handling).
+    - Unimplemented methods raise NotImplementedError to mark their pending status explicitly.
+    """
+
+    def __init__(
+        self,
+        memory_config: MemoryConfig,
+        end_user_id: str,
+    ):
+        """
+        Args:
+            memory_config: the already-loaded immutable configuration object
+            end_user_id: end-user ID
+        """
+        self.memory_config = memory_config
+        self.end_user_id = end_user_id
+
+    async def write(
+        self,
+        messages: List[dict],
+        language: str = "zh",
+        ref_id: str = "",
+        is_pilot_run: bool = False,
+        progress_callback: Optional[
+            Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
+        ] = None,
+    ) -> WriteResult:
+        """Write memory: dialogue → extraction → storage → clustering → summary.
+
+        Args:
+            messages: structured messages [{"role": "user"/"assistant", "content": "...", "dialog_at": "..."}]
+            language: language ("zh" | "en")
+            ref_id: reference ID; auto-generated when empty
+            is_pilot_run: pilot mode (extract only, write nothing)
+            progress_callback: optional progress callback
+
+        Returns:
+            WriteResult containing status and statistics
+        """
+        from app.core.memory.pipelines.write_pipeline import WritePipeline
+
+        pipeline = WritePipeline(
+            memory_config=self.memory_config,
+            end_user_id=self.end_user_id,
+            language=language,
+            progress_callback=progress_callback,
+        )
+        return await pipeline.run(
+            messages=messages,
+            ref_id=ref_id,
+            is_pilot_run=is_pilot_run,
+        )
+
+    async def pilot_write(
+        self,
+        chunked_dialogs: List[DialogData],
+        language: str = "zh",
+        progress_callback: Optional[
+            Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
+        ] = None,
+    ) -> PilotWriteResult:
+        """Pilot write: run only the extraction chain; nothing is written to Neo4j.
+
+        Args:
+            chunked_dialogs: preprocessed + chunked DialogData list
+            language: language ("zh" | "en")
+            progress_callback: optional progress callback
+
+        Returns:
+            PilotWriteResult containing extraction, graph-build, and dedup results
+        """
+        from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline
+
+        pipeline = PilotWritePipeline(
+            memory_config=self.memory_config,
+            end_user_id=self.end_user_id,
+            language=language,
+            progress_callback=progress_callback,
+        )
+        return await pipeline.run(chunked_dialogs)
+
+    async def read(
+        self, query: str, history: list, search_switch: str
+    ) -> dict:
+        """Read memory: pick the fast or deep path based on search_switch."""
+        raise NotImplementedError("ReadPipeline 尚未实现")
+
+    # async def search(
+    #     self,
+    #     query: str,
+    #     search_type: str = "hybrid",
+    #     limit: int = 10,
+    # ) -> dict:
+    #     """Standalone retrieval: run hybrid search directly, bypassing LangGraph."""
+    #     raise NotImplementedError("SearchPipeline 尚未实现")
+
+    async def forget(
+        self, max_batch: int = 100, min_days: int = 30
+    ) -> dict:
+        """Forget: identify low-activation nodes and merge them."""
+        raise NotImplementedError("ForgettingPipeline 尚未实现")
+
+    async def reflect(self) -> dict:
+        """Reflect: detect fact conflicts and correct them."""
+        raise NotImplementedError("ReflectionPipeline 尚未实现")
+
+    # async def cluster(self, new_entity_ids: list[str] = None) -> None:
+    #     """Cluster: full initialization or incremental community update."""
+    #     raise NotImplementedError("ClusteringPipeline 尚未实现")
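A plausible call site for the new facade, assuming the caller has already loaded a MemoryConfig as the diff requires; the concrete IDs and message values are illustrative, and the call runs inside an async function:

# Hypothetical usage sketch; config loading stays with the caller by design.
service = MemoryService(memory_config=memory_config, end_user_id="u-123")
result = await service.write(
    messages=[
        {"role": "user", "content": "我搬到了上海", "dialog_at": "2024-05-01T09:00:00"},
        {"role": "assistant", "content": "祝贺乔迁!"},
    ],
    language="zh",
)
print(result)  # WriteResult with status and statistics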
@@ -60,8 +60,6 @@ from app.core.memory.models.triplet_models import (

 # User metadata models
 from app.core.memory.models.metadata_models import (
     UserMetadata,
-    UserMetadataProfile,
     MetadataExtractionResponse,
-    MetadataFieldChange,
 )
@@ -132,8 +130,6 @@ __all__ = [
     "Entity",
     "Triplet",
     "TripletExtractionResponse",
     "UserMetadata",
-    "UserMetadataProfile",
     "MetadataExtractionResponse",
-    "MetadataFieldChange",
     # Ontology models
@@ -106,7 +106,6 @@ class Edge(BaseModel):
         end_user_id: End user ID for multi-tenancy
         run_id: Unique identifier for the pipeline run that created this edge
         created_at: Timestamp when the edge was created (system perspective)
-        expired_at: Optional timestamp when the edge expires (system perspective)
     """
     id: str = Field(default_factory=lambda: uuid4().hex, description="A unique identifier for the edge.")
     source: str = Field(..., description="The ID of the source node.")
@@ -114,7 +113,6 @@ class Edge(BaseModel):
     end_user_id: str = Field(..., description="The end user ID of the edge.")
     run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
     created_at: datetime = Field(..., description="The valid time of the edge from system perspective.")
-    expired_at: Optional[datetime] = Field(default=None, description="The expired time of the edge from system perspective.")


 class ChunkEdge(Edge):
@@ -162,6 +160,7 @@ class EntityEntityEdge(Edge):
         invalid_at: Optional end date of temporal validity
     """
     relation_type: str = Field(..., description="Relation type as defined in ontology")
+    relation_type_description: str = Field(default="", description="Chinese definition of the relation type from ontology")
     relation_value: Optional[str] = Field(None, description="Value of the relation")
     statement: str = Field(..., description='The statement of the edge.')
     source_statement_id: str = Field(..., description="Statement where this relationship was extracted")
@@ -190,14 +189,12 @@ class Node(BaseModel):
         end_user_id: End user ID for multi-tenancy
         run_id: Unique identifier for the pipeline run that created this node
         created_at: Timestamp when the node was created (system perspective)
-        expired_at: Optional timestamp when the node expires (system perspective)
     """
     id: str = Field(..., description="The unique identifier for the node.")
     name: str = Field(..., description="The name of the node.")
     end_user_id: str = Field(..., description="The end user ID of the node.")
     run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
     created_at: datetime = Field(..., description="The valid time of the node from system perspective.")
-    expired_at: Optional[datetime] = Field(None, description="The expired time of the node from system perspective.")


 class DialogueNode(Node):
@@ -283,6 +280,7 @@ class StatementNode(Node):
     temporal_info: TemporalInfo = Field(..., description="Temporal information")
     valid_at: Optional[datetime] = Field(None, description="Temporal validity start")
     invalid_at: Optional[datetime] = Field(None, description="Temporal validity end")
+    dialog_at: Optional[datetime] = Field(None, description="Absolute timestamp of the conversation this statement belongs to")

     # Embedding and other fields
     statement_embedding: Optional[List[float]] = Field(None, description="Statement embedding vector")
@@ -318,7 +316,7 @@ class StatementNode(Node):
         description="Total number of times this node has been accessed"
     )

-    @field_validator('valid_at', 'invalid_at', mode='before')
+    @field_validator('valid_at', 'invalid_at', 'dialog_at', mode='before')
     @classmethod
     def validate_datetime(cls, v):
         """Use the shared historical-date parsing helper."""
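For context, a self-contained sketch of the mode='before' validator pattern extended above: the hook runs before Pydantic's own parsing, so raw inputs can be normalized first. The lenient fromisoformat call here is a stand-in for the project's shared historical-date helper:

from datetime import datetime
from typing import Optional
from pydantic import BaseModel, field_validator

class StatementStub(BaseModel):  # stand-in model for illustration
    valid_at: Optional[datetime] = None
    dialog_at: Optional[datetime] = None

    @field_validator('valid_at', 'dialog_at', mode='before')
    @classmethod
    def validate_datetime(cls, v):
        # normalize raw strings before Pydantic's datetime parsing
        if isinstance(v, str):
            return datetime.fromisoformat(v)  # stand-in for the shared lenient parser
        return v

print(StatementStub(dialog_at="2024-05-01T09:00:00").dialog_at)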
@@ -413,6 +411,7 @@ class ExtractedEntityNode(Node):
     entity_idx: int = Field(..., description="Unique identifier for the entity")
     statement_id: str = Field(..., description="Statement this entity was extracted from")
     entity_type: str = Field(..., description="Type of the entity")
+    type_description: str = Field(default="", description="Chinese definition of the entity type from ontology")
     description: str = Field(..., description="Entity description")
     example: str = Field(
         default="",
@@ -462,6 +461,16 @@ class ExtractedEntityNode(Node):
         description="Whether this entity represents explicit/semantic memory (knowledge, concepts, definitions, theories, principles)"
     )

+    # User Metadata Fields (populated by async metadata extraction after dedup)
+    core_facts: List[str] = Field(default_factory=list, description="Stable basic facts about the user")
+    traits: List[str] = Field(default_factory=list, description="Stable personality traits or behavioral tendencies")
+    relations: List[str] = Field(default_factory=list, description="Durable relationships with people/groups/entities")
+    goals: List[str] = Field(default_factory=list, description="Long-term goals or ongoing pursuits")
+    interests: List[str] = Field(default_factory=list, description="Stable interests, preferences, or hobbies")
+    beliefs_or_stances: List[str] = Field(default_factory=list, description="Stable beliefs, values, or stances")
+    anchors: List[str] = Field(default_factory=list, description="Personally meaningful objects or symbols")
+    events: List[str] = Field(default_factory=list, description="Durable personal experiences or milestones")
+
     @field_validator('aliases', mode='before')
     @classmethod
     def validate_aliases_field(cls, v):  # field validator: automatically cleans and validates the aliases field
@@ -576,3 +585,47 @@ class PerceptualNode(Node):
     domain: str
     file_type: str
     summary_embedding: list[float] | None
+
+
+class AssistantOriginalNode(Node):
+    """Node storing the original text of an Assistant message before pruning.
+
+    Attributes:
+        pair_id: Shared ID with the corresponding AssistantPrunedNode for pairing
+        dialog_id: ID of the parent dialogue this message belongs to
+        text: The full original Assistant response text
+    """
+    pair_id: str = Field(..., description="Shared pairing ID with the corresponding pruned node")
+    dialog_id: str = Field(..., description="ID of the parent dialogue")
+    text: str = Field(..., description="Original Assistant message text")
+
+
+class AssistantPrunedNode(Node):
+    """Node storing the pruned (compressed) text of an Assistant message.
+
+    Attributes:
+        pair_id: Shared ID with the corresponding AssistantOriginalNode for pairing
+        dialog_id: ID of the parent dialogue this message belongs to
+        text: The pruned memory hint text (or "NULL" if no memory value)
+        memory_type: Type of the memory hint (comfort|suggestion|recommendation|warning|instruction|NULL)
+        text_embedding: Optional embedding vector for semantic search on pruned text
+    """
+    pair_id: str = Field(..., description="Shared pairing ID with the corresponding original node")
+    dialog_id: str = Field(..., description="ID of the parent dialogue")
+    text: str = Field(..., description="Pruned assistant memory hint text")
+    memory_type: str = Field(..., description="Memory type: comfort|suggestion|recommendation|warning|instruction|NULL")
+    text_embedding: Optional[List[float]] = Field(None, description="Embedding vector for semantic search")
+
+
+class AssistantPrunedEdge(Edge):
+    """Edge connecting an AssistantOriginal node to its AssistantPruned node (PRUNED_TO).
+
+    Attributes:
+        pair_id: Shared pairing ID for traceability
+    """
+    pair_id: str = Field(..., description="Shared pairing ID for traceability")
+
+
+class AssistantDialogEdge(Edge):
+    """Edge connecting an AssistantOriginal node to its parent Dialogue node (BELONGS_TO_DIALOG)."""
+    pass
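To see how pair_id ties the two node types together, a hypothetical construction; all field values are illustrative, and the required base fields (id, name, end_user_id, created_at) follow the Node model above:

from datetime import datetime
from uuid import uuid4

pair_id = uuid4().hex  # one shared ID for the original/pruned pair
original = AssistantOriginalNode(
    id=uuid4().hex, name="assistant_original", end_user_id="u-123",
    created_at=datetime.now(), pair_id=pair_id, dialog_id="d-1",
    text="长回答的完整原文……",
)
pruned = AssistantPrunedNode(
    id=uuid4().hex, name="assistant_pruned", end_user_id="u-123",
    created_at=datetime.now(), pair_id=pair_id, dialog_id="d-1",
    text="建议用户每天散步 30 分钟", memory_type="suggestion",
)
# An AssistantPrunedEdge carrying the same pair_id would then link original -> pruned (PRUNED_TO).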
@@ -30,6 +30,7 @@ class ConversationMessage(BaseModel):
     """
     role: str = Field(..., description="The role of the speaker (e.g., 'user', 'assistant').")
     msg: str = Field(..., description="The text content of the message.")
+    dialog_at: Optional[str] = Field(None, description="Absolute timestamp of this message (ISO 8601).")
     files: list[tuple] = Field(default_factory=list, description="The file content of the message", exclude=True)
@@ -94,6 +95,13 @@ class Statement(BaseModel):
     emotion_keywords: Optional[List[str]] = Field(default_factory=list, description="Emotion keywords, max 3")
     emotion_subject: Optional[str] = Field(None, description="Emotion subject: self/other/object")
     emotion_target: Optional[str] = Field(None, description="Emotion target: person or object name")
     # Reference resolution
     has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references")
+    has_emotional_state: bool = Field(
+        False,
+        description="Whether the statement reflects user's emotional state",
+    )
+    dialog_at: Optional[str] = Field(None, description="Absolute timestamp of the source message (ISO 8601).")


 class ConversationContext(BaseModel):
@@ -133,6 +141,7 @@ class Chunk(BaseModel):
     statements: List[Statement] = Field(default_factory=list, description="A list of statements in the chunk.")
     files: list[tuple] = Field(default_factory=list, description="List of files in the chunk.")
     chunk_embedding: Optional[List[float]] = Field(default=None, description="The embedding vector of the chunk.")
+    dialog_at: Optional[str] = Field(None, description="Absolute timestamp of the source message (ISO 8601).")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata for the chunk.")

     @classmethod
@@ -149,6 +158,7 @@ class Chunk(BaseModel):
         return cls(
             content=f"{message.role}: {message.msg}",
             speaker=message.role,
+            dialog_at=message.dialog_at,
             metadata=metadata or {}
         )
@@ -163,7 +173,6 @@ class DialogData(BaseModel):
         ref_id: Reference ID linking to external dialog system
         end_user_id: End user ID for multi-tenancy
         created_at: Timestamp when the dialog was created
-        expired_at: Timestamp when the dialog expires (default: far future)
         metadata: Additional metadata as key-value pairs
         chunks: List of chunks from the conversation
         config_id: Configuration ID used to process this dialog
@@ -178,7 +187,6 @@ class DialogData(BaseModel):
     end_user_id: str = Field(default=..., description="End user ID of dialogue data")
     run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
     created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the dialog was created.")
-    expired_at: datetime = Field(default_factory=lambda: datetime(9999, 12, 31), description="The timestamp when the dialog expires.")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata for the dialog.")
     chunks: List[Chunk] = Field(default_factory=list, description="A list of chunks from the conversation context.")
     config_id: Optional[int | str] = Field(None, description="Configuration ID used to process this dialog (integer or string)")
@@ -2,6 +2,9 @@

 Independent from triplet_models.py - these models are used by the
 standalone metadata extraction pipeline (post-dedup async Celery task).
+
+The field definitions align with the Jinja2 prompt template
+``extract_user_metadata.jinja2``.
 """

 from typing import List, Literal, Optional
@@ -9,55 +12,69 @@ from typing import List, Literal, Optional

 from pydantic import BaseModel, ConfigDict, Field


-class UserMetadataProfile(BaseModel):
-    """User profile information."""
-
-    model_config = ConfigDict(extra="ignore")
-    role: List[str] = Field(default_factory=list, description="用户职业或角色")
-    domain: List[str] = Field(default_factory=list, description="用户所在领域")
-    expertise: List[str] = Field(
-        default_factory=list, description="用户擅长的技能或工具"
-    )
-    interests: List[str] = Field(
-        default_factory=list, description="用户关注的话题或领域标签"
-    )
-
-
-class UserMetadata(BaseModel):
-    """Top-level user-metadata structure."""
-
-    model_config = ConfigDict(extra="ignore")
-    profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile)
-
-
-class MetadataFieldChange(BaseModel):
-    """A change operation on a single metadata field."""
-
-    model_config = ConfigDict(extra="ignore")
-    field_path: str = Field(
-        description="字段路径,用点号分隔,如 'profile.role'、'profile.expertise'"
-    )
-    action: Literal["set", "remove"] = Field(
-        description="操作类型:'set' 表示新增或修改,'remove' 表示移除"
-    )
-    value: Optional[str] = Field(
-        default=None,
-        description="字段的新值(action='set' 时必填)。标量字段直接填值,列表字段填单个要新增的元素"
-    )
-
-
-class MetadataExtractionResponse(BaseModel):
-    """LLM response structure for metadata extraction (incremental mode)."""
-
-    model_config = ConfigDict(extra="ignore")
-    metadata_changes: List[MetadataFieldChange] = Field(
-        default_factory=list,
-        description="元数据的增量变更列表,每项描述一个字段的新增、修改或移除操作",
-    )
-    aliases_to_add: List[str] = Field(
-        default_factory=list,
-        description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)",
-    )
-    aliases_to_remove: List[str] = Field(
-        default_factory=list, description="用户明确否认的别名(如'我不叫XX了')"
-    )
+class MetadataExtractionResponse(BaseModel):
+    """LLM metadata-extraction response structure.
+
+    Fields map one-to-one onto the output JSON of the extract_user_metadata.jinja2 template.
+    Every field is an array of strings: the metadata entries newly added in this round.
+    """
+
+    model_config = ConfigDict(extra="ignore")
+
+    aliases: List[str] = Field(
+        default_factory=list,
+        description="用户别名、昵称、称呼",
+    )
+    core_facts: List[str] = Field(
+        default_factory=list,
+        description="用户稳定的基础事实(身份、年龄、国籍、所在地等)",
+    )
+    traits: List[str] = Field(
+        default_factory=list,
+        description="用户稳定的人格特质、风格、行为倾向",
+    )
+    relations: List[str] = Field(
+        default_factory=list,
+        description="用户与他人/群体/宠物/重要对象之间的长期关系",
+    )
+    goals: List[str] = Field(
+        default_factory=list,
+        description="用户明确、稳定的长期目标或计划",
+    )
+    interests: List[str] = Field(
+        default_factory=list,
+        description="用户稳定的兴趣、偏好、长期爱好",
+    )
+    beliefs_or_stances: List[str] = Field(
+        default_factory=list,
+        description="用户稳定的信念、价值立场",
+    )
+    anchors: List[str] = Field(
+        default_factory=list,
+        description="对用户有长期意义的物品、收藏、纪念物",
+    )
+    events: List[str] = Field(
+        default_factory=list,
+        description="对用户画像有长期价值的个人经历、事件、里程碑",
+    )
+
+    # convenience helpers
+
+    METADATA_FIELDS: List[str] = [
+        "core_facts", "traits", "relations", "goals",
+        "interests", "beliefs_or_stances", "anchors", "events",
+    ]
+
+    def has_any_metadata(self) -> bool:
+        """Whether any metadata (excluding aliases) was extracted."""
+        return any(
+            bool(getattr(self, field, []))
+            for field in self.METADATA_FIELDS
+        )
+
+    def to_metadata_dict(self) -> dict:
+        """Return a dict of the 8 metadata fields (excluding aliases) for the Neo4j write-back."""
+        return {
+            field: getattr(self, field, [])
+            for field in self.METADATA_FIELDS
+        }
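A short usage sketch of the new response model; the values are illustrative. Note that has_any_metadata deliberately ignores aliases, which are synced through a separate path:

resp = MetadataExtractionResponse(
    aliases=["小明"],
    core_facts=["住在上海"],
    interests=["长跑"],
)
print(resp.has_any_metadata())   # True -- aliases alone would not count
print(resp.to_metadata_dict())   # 8-field dict for the Neo4j write-back, aliases excluded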
@@ -37,6 +37,7 @@ class Entity(BaseModel):
     name: str = Field(..., description="Name of the entity")
     name_embedding: Optional[List[float]] = Field(None, description="Embedding vector for the entity name")
     type: str = Field(..., description="Type/category of the entity")
+    type_description: str = Field(default="", description="Chinese definition of the entity type from ontology")
     description: str = Field(..., description="Description of the entity")
     example: str = Field(
         default="",
@@ -79,6 +80,7 @@ class Triplet(BaseModel):
     subject_name: str = Field(..., description="Name of the subject entity")
     subject_id: int = Field(..., description="ID of the subject entity")
     predicate: str = Field(..., description="Relationship/predicate between subject and object")
+    predicate_description: str = Field(default="", description="Chinese definition of the predicate from ontology")
     object_name: str = Field(..., description="Name of the object entity")
     object_id: int = Field(..., description="ID of the object entity")
     value: Optional[str] = Field(None, description="Additional value or context")
@@ -149,3 +149,16 @@ class ExtractionPipelineConfig(BaseModel):
     temporal_extraction: TemporalExtractionConfig = Field(default_factory=TemporalExtractionConfig)
     deduplication: DedupConfig = Field(default_factory=DedupConfig)
     forgetting_engine: ForgettingEngineConfig = Field(default_factory=ForgettingEngineConfig)
+    # Emotion engine (sidecar module; SidecarStepFactory checks this field to decide whether to enable it)
+    emotion_enabled: bool = Field(default=False, description="是否启用情绪提取旁路")
+
+    # TODO: add a concurrency cap to fit the LLM's QPM rate limit
+    # # Pipeline-wide LLM concurrency ceiling (shared by statement + triplet) to keep QPM under control
+    # # Can be overridden via the MAX_CONCURRENT_LLM_CALLS environment variable
+    # max_concurrent_llm_calls: int = Field(
+    #     default_factory=lambda: int(
+    #         __import__("os").environ.get("MAX_CONCURRENT_LLM_CALLS", "5")
+    #     ),
+    #     ge=1, le=64,
+    #     description="Maximum concurrent LLM calls in the extraction pipeline",
+    # )
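The commented-out field sketches an env-overridable config value. A working version of that pattern, assuming only standard Pydantic v2 and nothing project-specific; the class name is a hypothetical stand-in:

import os
from pydantic import BaseModel, Field

class PipelineLimits(BaseModel):  # illustrative stand-in, not the project's model
    max_concurrent_llm_calls: int = Field(
        # default_factory runs at instantiation, so the env var is read per instance
        default_factory=lambda: int(os.environ.get("MAX_CONCURRENT_LLM_CALLS", "5")),
        ge=1, le=64,
        description="Maximum concurrent LLM calls in the extraction pipeline",
    )

# A shared semaphore would then cap in-flight LLM calls:
#   sem = asyncio.Semaphore(PipelineLimits().max_concurrent_llm_calls)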
File diff suppressed because it is too large.
@@ -23,15 +23,12 @@ from app.core.memory.models.ontology_extraction_models import OntologyTypeInfo,

 logger = logging.getLogger(__name__)

-# Default core general types
+# Default core general types: the 13 classes aligned with the Entity Ontology in ontology.md
 DEFAULT_CORE_GENERAL_TYPES: Set[str] = {
-    "Person", "Organization", "Company", "GovernmentAgency",
-    "Place", "Location", "City", "Country", "Building",
-    "Event", "SportsEvent", "MusicEvent", "SocialEvent",
-    "Work", "Book", "Film", "Software", "Album",
-    "Concept", "TopicalConcept", "AcademicSubject",
-    "Device", "Food", "Drug", "ChemicalSubstance",
-    "TimePeriod", "Year",
+    "人物", "组织", "群体", "角色职业",
+    "地点设施", "物品设备", "软件平台", "识别联系信息",
+    "文档媒体", "知识能力", "偏好习惯", "具体目标",
+    "称呼别名",
 }
@@ -129,10 +126,12 @@ class OntologyTypeMerger:
             if type_name not in seen_names and remaining_slots > 0:
                 general_type = self.general_registry.get_type(type_name)
                 if general_type:
+                    # Prefer rdfs:comment (the full definition), falling back to the label;
+                    # for the 13-class Chinese ontology the label equals class_name, so showing it alone adds nothing.
                     description = (
-                        general_type.labels.get("zh") or
-                        general_type.description or
-                        general_type.get_label("en") or
+                        general_type.description or
+                        general_type.labels.get("zh") or
+                        general_type.get_label("en") or
                         type_name
                     )
                     core_types_added.append(OntologyTypeInfo(
@@ -157,8 +156,8 @@ class OntologyTypeMerger:
                 parent_type = self.general_registry.get_type(parent_name)
                 if parent_type:
                     description = (
-                        parent_type.labels.get("zh") or
-                        parent_type.description or
+                        parent_type.description or
+                        parent_type.labels.get("zh") or
                         parent_name
                     )
                     related_types_added.append(OntologyTypeInfo(
@@ -0,0 +1,44 @@
"""
Memory Pipelines — the orchestration layer of the memory module.

Each Pipeline defines one complete business flow, invoking multiple Engines in order.
A Pipeline contains no business-logic implementation of its own; it only sequences steps and passes data.
"""


def __getattr__(name):
    """Lazy imports to avoid circular dependencies."""
    if name in ("WritePipeline", "ExtractionResult", "WriteResult"):
        from app.core.memory.pipelines.write_pipeline import (
            ExtractionResult,
            WritePipeline,
            WriteResult,
        )

        _exports = {
            "WritePipeline": WritePipeline,
            "ExtractionResult": ExtractionResult,
            "WriteResult": WriteResult,
        }
        return _exports[name]
    if name in ("PilotWritePipeline", "PilotWriteResult"):
        from app.core.memory.pipelines.pilot_write_pipeline import (
            PilotWritePipeline,
            PilotWriteResult,
        )

        _exports = {
            "PilotWritePipeline": PilotWritePipeline,
            "PilotWriteResult": PilotWriteResult,
        }
        return _exports[name]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = [
    "WritePipeline",
    "ExtractionResult",
    "WriteResult",
    "PilotWritePipeline",
    "PilotWriteResult",
]
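This module-level `__getattr__` is the PEP 562 lazy-import pattern: the heavy pipeline modules are imported only when one of their names is first accessed. A small illustration of the behavior, using only names defined above (the calling code is illustrative):

```python
# First access triggers the import inside __getattr__; later accesses repeat it,
# so Python's module cache (sys.modules) is what keeps the cost low.
from app.core.memory import pipelines

pipeline_cls = pipelines.WritePipeline   # imports write_pipeline on demand
try:
    pipelines.NoSuchName                 # unknown names fall through to AttributeError
except AttributeError as exc:
    print(exc)
```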
181
api/app/core/memory/pipelines/pilot_write_pipeline.py
Normal file
@@ -0,0 +1,181 @@
"""PilotWritePipeline — the extraction-only pipeline used for pilot runs.

Scope:
- Runs only the extraction chain: statement -> triplet -> graph_build -> first-layer dedup/disambiguation
- Does not perform Neo4j writes, clustering, summarization, or cache updates
- Manages its own client initialisation and ontology-type loading (mirroring WritePipeline)

Dependency direction: Facade → Pipeline → Engine → Repository (one-way; reverse calls are not allowed)
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional

from app.core.memory.models.message_models import DialogData
from app.core.memory.storage_services.extraction_engine.steps.dedup_step import (
    DedupResult,
    run_dedup,
)
from app.core.memory.storage_services.extraction_engine.extraction_pipeline_orchestrator import (
    NewExtractionOrchestrator,
)
from app.core.memory.storage_services.extraction_engine.steps.graph_build_step import (
    GraphBuildResult,
    build_graph_nodes_and_edges,
)

if TYPE_CHECKING:
    from app.schemas.memory_config_schema import MemoryConfig

logger = logging.getLogger(__name__)


@dataclass
class PilotWriteResult:
    """Output of the pilot-run pipeline."""

    dialog_data_list: List[DialogData]
    graph: GraphBuildResult
    dedup: DedupResult

    @property
    def stats(self) -> Dict[str, int]:
        return {
            "chunk_count": len(self.graph.chunk_nodes),
            "statement_count": len(self.graph.statement_nodes),
            "entity_count_before_dedup": len(self.graph.entity_nodes),
            "entity_count_after_dedup": len(self.dedup.entity_nodes),
            "relation_count_before_dedup": len(self.graph.entity_entity_edges),
            "relation_count_after_dedup": len(self.dedup.entity_entity_edges),
        }


class PilotWritePipeline:
    """Refactored pipeline dedicated to pilot runs.

    The constructor takes only memory_config; client initialisation and ontology
    loading happen inside run(), matching WritePipeline's lifecycle management.
    """

    def __init__(
        self,
        memory_config: MemoryConfig,
        end_user_id: str,
        language: str = "zh",
        progress_callback: Optional[
            Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
        ] = None,
    ) -> None:
        """
        Args:
            memory_config: Immutable memory configuration object (loaded from the database)
            end_user_id: End-user ID
            language: Language ("zh" | "en")
            progress_callback: Optional progress callback
        """
        self.memory_config = memory_config
        self.end_user_id = end_user_id
        self.language = language
        self.progress_callback = progress_callback

        # Lazily initialised clients
        self._llm_client = None
        self._embedder_client = None

    async def run(self, dialog_data_list: List[DialogData]) -> PilotWriteResult:
        """Execute the pilot-run extraction chain.

        Internally: client initialisation → ontology loading → extraction → graph build → dedup.
        """
        from app.core.memory.utils.config.config_utils import get_pipeline_config

        self._init_clients()
        pipeline_config = get_pipeline_config(self.memory_config)
        ontology_types = self._load_ontology_types()

        orchestrator = NewExtractionOrchestrator(
            llm_client=self._llm_client,
            embedder_client=self._embedder_client,
            config=pipeline_config,
            embedding_id=str(self.memory_config.embedding_model_id),
            ontology_types=ontology_types,
            language=self.language,
            is_pilot_run=True,
            progress_callback=self.progress_callback,
        )
        extracted_dialogs = await orchestrator.run(dialog_data_list)

        graph = await build_graph_nodes_and_edges(
            dialog_data_list=extracted_dialogs,
            embedder_client=self._embedder_client,
            progress_callback=self.progress_callback,
        )

        dedup = await run_dedup(
            entity_nodes=graph.entity_nodes,
            statement_entity_edges=graph.stmt_entity_edges,
            entity_entity_edges=graph.entity_entity_edges,
            dialog_data_list=extracted_dialogs,
            pipeline_config=pipeline_config,
            connector=None,  # pilot: no layer-2 db dedup
            llm_client=self._llm_client,
            is_pilot_run=True,
            progress_callback=self.progress_callback,
        )

        return PilotWriteResult(
            dialog_data_list=extracted_dialogs,
            graph=graph,
            dedup=dedup,
        )

    # ──────────────────────────────────────────────
    # Helpers
    # ──────────────────────────────────────────────

    def _init_clients(self) -> None:
        """Build the LLM and embedding clients from MemoryConfig."""
        from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
        from app.db import get_db_context

        with get_db_context() as db:
            factory = MemoryClientFactory(db)
            self._llm_client = factory.get_llm_client_from_config(self.memory_config)
            self._embedder_client = factory.get_embedder_client_from_config(
                self.memory_config
            )
        logger.info("Pilot pipeline: LLM and embedding clients constructed")

    def _load_ontology_types(self):
        """Load the ontology type configuration (when a scene_id is configured)."""
        if not self.memory_config.scene_id:
            return None

        try:
            from app.core.memory.ontology_services.ontology_type_loader import (
                load_ontology_types_for_scene,
            )
            from app.db import get_db_context

            with get_db_context() as db:
                ontology_types = load_ontology_types_for_scene(
                    scene_id=self.memory_config.scene_id,
                    workspace_id=self.memory_config.workspace_id,
                    db=db,
                )
            if ontology_types:
                logger.info(
                    f"Loaded {len(ontology_types.types)} ontology types "
                    f"for scene_id: {self.memory_config.scene_id}"
                )
            return ontology_types
        except Exception as e:
            logger.warning(
                f"Failed to load ontology types for scene_id "
                f"{self.memory_config.scene_id}: {e}",
                exc_info=True,
            )
            return None
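A minimal usage sketch of the pilot pipeline; how `memory_config` is loaded and how messages become `DialogData` are assumptions, since only the pipeline itself appears in this diff:

```python
import asyncio

from app.core.memory.pipelines import PilotWritePipeline

async def pilot_demo(memory_config, dialog_data_list):
    # memory_config: a MemoryConfig loaded elsewhere (assumed);
    # dialog_data_list: pre-chunked DialogData objects (assumed).
    pipeline = PilotWritePipeline(
        memory_config=memory_config,
        end_user_id="demo-user",
        language="zh",
    )
    result = await pipeline.run(dialog_data_list)
    # stats exposes before/after dedup counts, e.g. entity_count_after_dedup
    print(result.stats)

# asyncio.run(pilot_demo(cfg, dialogs))
```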
903
api/app/core/memory/pipelines/write_pipeline.py
Normal file
@@ -0,0 +1,903 @@
"""
WritePipeline — the memory write pipeline.

Orchestrates the full write flow: preprocess → extract → store → cluster → summarize.
Contains no business-logic implementation; it only sequences steps and passes data.

Design principles:
- The Pipeline never touches the database directly; it goes through Engines / Repositories
- The Pipeline contains no LLM-call logic; that is done by the ExtractionOrchestrator
- The Pipeline owns resource lifecycles (client initialisation / connection shutdown)
- The Pipeline draws the error boundaries (which errors abort the flow, which are swallowed)

Dependency direction: Facade → Pipeline → Engine → Repository (one-way; reverse calls are not allowed)
"""

from __future__ import annotations

import asyncio
import logging
import uuid
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional

from app.core.memory.utils.log.bear_logger import BearLogger

from pydantic import BaseModel, Field, ConfigDict

if TYPE_CHECKING:
    from app.core.memory.models.message_models import DialogData
    from app.schemas.memory_config_schema import MemoryConfig

from app.core.memory.models.graph_models import (
    ChunkNode,
    DialogueNode,
    EntityEntityEdge,
    ExtractedEntityNode,
    PerceptualEdge,
    PerceptualNode,
    StatementChunkEdge,
    StatementEntityEdge,
    StatementNode,
)

logger = logging.getLogger(__name__)
bear = BearLogger("memory.pipeline")


# ──────────────────────────────────────────────
# Data structures
# ──────────────────────────────────────────────


class ExtractionResult(BaseModel):
    """Structured output after extraction + graph build + dedup/disambiguation.

    Serves as the inter-stage data carrier at the Pipeline layer, guaranteeing
    that downstream steps (_store, _cluster) receive complete, correctly typed
    graph nodes and edges.

    Fields map to the graph nodes/edges produced by the ExtractionOrchestrator:
        dialogue_nodes      — dialogue nodes
        chunk_nodes         — chunk nodes
        statement_nodes     — statement nodes
        entity_nodes        — entity nodes (after dedup/disambiguation)
        perceptual_nodes    — perceptual nodes
        stmt_chunk_edges    — statement → chunk edges
        stmt_entity_edges   — statement → entity edges
        entity_entity_edges — entity → entity edges (after dedup/disambiguation)
        perceptual_edges    — perceptual → chunk edges
        dialog_data_list    — the original DialogData (used by the summary stage)
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    dialogue_nodes: List[DialogueNode]
    chunk_nodes: List[ChunkNode]
    statement_nodes: List[StatementNode]
    entity_nodes: List[ExtractedEntityNode]
    perceptual_nodes: List[PerceptualNode]
    stmt_chunk_edges: List[StatementChunkEdge]
    stmt_entity_edges: List[StatementEntityEdge]
    entity_entity_edges: List[EntityEntityEdge]
    perceptual_edges: List[PerceptualEdge]
    assistant_original_nodes: List[Any] = Field(default_factory=list)
    assistant_pruned_nodes: List[Any] = Field(default_factory=list)
    assistant_pruned_edges: List[Any] = Field(default_factory=list)
    assistant_dialog_edges: List[Any] = Field(default_factory=list)
    dialog_data_list: List[Any] = Field(
        default_factory=list,
        description="原始 DialogData 列表,类型为 Any 以避免循环依赖",
    )

    @property
    def stats(self) -> Dict[str, int]:
        """Return a statistics summary, used by WriteResult and logging."""
        return {
            "dialogue_count": len(self.dialogue_nodes),
            "chunk_count": len(self.chunk_nodes),
            "statement_count": len(self.statement_nodes),
            "entity_count": len(self.entity_nodes),
            "perceptual_count": len(self.perceptual_nodes),
            "relation_count": len(self.entity_entity_edges),
        }


class WriteResult(BaseModel):
    """Final output of the write pipeline, returned to MemoryService / MemoryAgentService."""

    status: str  # "success" | "pilot_complete" | "failed"
    extraction: Optional[Dict[str, int]] = None  # ExtractionResult.stats
    error: Optional[str] = None  # error message on failure
    elapsed_seconds: float = 0.0  # total elapsed time (seconds)
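A small sketch of how a caller might branch on `WriteResult`; the three status values come from the comment above, but the handler itself is an assumption, not code from this changeset:

```python
def handle_write_result(result: WriteResult) -> dict:
    # Map the pipeline outcome to an API-style payload (illustrative only).
    if result.status == "failed":
        return {"ok": False, "error": result.error}
    payload = {"ok": True, "stats": result.extraction or {}}
    if result.status == "pilot_complete":
        payload["pilot"] = True   # extraction ran, nothing was persisted
    return payload
```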
# ──────────────────────────────────────────────
# WritePipeline
# ──────────────────────────────────────────────


class WritePipeline:
    """
    The memory write pipeline.

    Orchestrates the full write flow: preprocess → extract → store → cluster → summarize.
    """

    def __init__(
        self,
        memory_config: MemoryConfig,
        end_user_id: str,
        language: str = "zh",
        progress_callback: Optional[
            Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
        ] = None,
    ):
        """
        Args:
            memory_config: Immutable memory configuration object (loaded from the database)
            end_user_id: End-user ID
            language: Language ("zh" | "en")
            progress_callback: Optional progress callback with signature (stage, message, data?) -> Awaitable[None], used by pilot runs
        """
        self.memory_config = memory_config
        self.end_user_id = end_user_id
        self.language = language
        self.progress_callback = progress_callback

        # Lazily initialised clients
        self._llm_client = None
        self._embedder_client = None
        self._neo4j_connector = None

    # ──────────────────────────────────────────────
    # Public interface
    # ──────────────────────────────────────────────

    async def run(
        self,
        messages: List[dict],
        ref_id: str = "",
        is_pilot_run: bool = False,
    ) -> WriteResult:
        """
        Execute the complete write pipeline.

        Args:
            messages: Structured messages [{"role": "user"/"assistant", "content": "..."}]
            ref_id: Reference ID; auto-generated when empty
            is_pilot_run: Pilot-run mode (extract only, no writes)

        Returns:
            A WriteResult with status and statistics
        """
        if not ref_id:
            ref_id = uuid.uuid4().hex

        mode = "试运行" if is_pilot_run else "正式"
        extraction_result = None

        try:
            async with bear.pipeline(
                "WritePipeline",
                mode=mode,
                config_name=self.memory_config.config_name,
                end_user_id=self.end_user_id,
            ):
                # Initialise clients and connections
                self._init_clients()
                self._init_neo4j_connector()

                # Initialise the snapshot recorder early, so pruning in the preprocessing stage can use it
                from app.core.memory.utils.debug.write_snapshot_recorder import (
                    WriteSnapshotRecorder,
                )

                self._recorder = WriteSnapshotRecorder("new")

                # Step 1: Preprocess — message chunking + semantic pruning of AI messages
                async with bear.step(1, 5, "预处理", "消息分块") as s:
                    chunked_dialogs = await self._preprocess(messages, ref_id)
                    s.metadata(chunks=sum(len(d.chunks) for d in chunked_dialogs))

                # Step 2: Extract — knowledge extraction + first-layer dedup + alias merge (in memory)
                async with bear.step(2, 5, "萃取", "知识提取") as s:
                    extraction_result = await self._extract(
                        chunked_dialogs, is_pilot_run
                    )
                    # Alias merge (in memory): done before the write so the persisted data is already merged
                    self._merge_alias_in_memory(extraction_result)
                    stats = extraction_result.stats
                    s.metadata(
                        entities=stats["entity_count"],
                        statements=stats["statement_count"],
                        relations=stats["relation_count"],
                    )

                # Pilot-run mode stops here
                if is_pilot_run:
                    return WriteResult(
                        status="pilot_complete",
                        extraction=extraction_result.stats,
                        elapsed_seconds=0.0,
                    )

                # Step 3: Store — write to Neo4j
                async with bear.step(3, 5, "存储", "写入 Neo4j"):
                    await self._store(extraction_result)

                # Step 3.5: Async post-processing (Neo4j-side alias merge + second-layer dedup + emotion + metadata)
                await self._post_store_async_tasks(extraction_result)

                # Step 4: Cluster — incremental community update (async, non-blocking)
                async with bear.step(4, 5, "聚类", "增量更新社区") as s:
                    await self._cluster(extraction_result)
                    s.metadata(mode="async")

                # Step 5: Summarize — generate episodic-memory summaries
                async with bear.step(5, 5, "摘要", "生成情景记忆"):
                    await self._summarize(chunked_dialogs)

                # Update the activity-statistics cache
                await self._update_stats_cache(extraction_result)

                return WriteResult(
                    status="success",
                    extraction=extraction_result.stats,
                    elapsed_seconds=0.0,
                )

        finally:
            await self._cleanup()
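A minimal sketch of driving the pipeline end to end; how `memory_config` is obtained is assumed, everything else follows the signatures above:

```python
import asyncio

from app.core.memory.pipelines import WritePipeline

async def write_demo(memory_config):
    pipeline = WritePipeline(memory_config, end_user_id="demo-user", language="zh")
    result = await pipeline.run(
        messages=[
            {"role": "user", "content": "我叫小明,最近搬到了上海。"},
            {"role": "assistant", "content": "好的,记住了。"},
        ],
        is_pilot_run=False,   # True would stop after extraction, writing nothing
    )
    assert result.status in ("success", "pilot_complete")
    return result.extraction  # e.g. {"entity_count": ..., "statement_count": ...}

# asyncio.run(write_demo(cfg))
```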
    # ──────────────────────────────────────────────
    # Step 1: Preprocess
    # ──────────────────────────────────────────────

    async def _preprocess(self, messages: List[dict], ref_id: str) -> List[DialogData]:
        """
        Preprocess: message validation → semantic pruning of AI messages → dialogue chunking.

        Delegates to get_chunked_dialogs(), keeping the existing preprocessing logic unchanged.
        get_dialogs.py already covers:
        - Message format validation (role/content required)
        - Semantic pruning of AI messages (driven by pruning_enabled in the config)
        - DialogueChunker chunking
        """
        from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs

        recorder = getattr(self, "_recorder", None)
        snapshot = recorder.snapshot if recorder else None

        return await get_chunked_dialogs(
            chunker_strategy=self.memory_config.chunker_strategy,
            end_user_id=self.end_user_id,
            messages=messages,
            ref_id=ref_id,
            config_id=str(self.memory_config.config_id),
            workspace_id=self.memory_config.workspace_id,
            snapshot=snapshot,
        )

    # ──────────────────────────────────────────────
    # Step 2: Extract
    # ──────────────────────────────────────────────

    async def _extract(
        self,
        chunked_dialogs: List[DialogData],
        is_pilot_run: bool,
    ) -> ExtractionResult:
        """
        Extract: initialise engines → run knowledge extraction → build graph nodes/edges → dedup → return the structured result.

        Uses NewExtractionOrchestrator (the ExtractionStep paradigm) for the LLM
        extraction, then the standalone graph_build_step and dedup_step for graph
        construction and dedup, with no dependency on the legacy ExtractionOrchestrator.

        Flow:
        1. NewExtractionOrchestrator.run() → extract and assign onto DialogData
        2. build_graph_nodes_and_edges() → build graph nodes and edges from DialogData
        3. run_dedup() → two-phase dedup/disambiguation
        """
        from app.core.memory.storage_services.extraction_engine.steps.dedup_step import (
            run_dedup,
        )
        from app.core.memory.storage_services.extraction_engine.steps.graph_build_step import (
            build_graph_nodes_and_edges,
        )
        from app.core.memory.storage_services.extraction_engine.extraction_pipeline_orchestrator import (
            NewExtractionOrchestrator,
        )

        from app.core.memory.utils.config.config_utils import get_pipeline_config
        from app.core.memory.utils.debug.write_snapshot_recorder import (
            WriteSnapshotRecorder,
        )

        pipeline_config = get_pipeline_config(self.memory_config)
        ontology_types = self._load_ontology_types()

        # Reuse the recorder created in run() (the pruning stage already used the same instance)
        recorder = getattr(self, "_recorder", None) or WriteSnapshotRecorder("new")
        self._recorder = recorder

        # ── New orchestrator: LLM extraction + data assignment ──
        new_orchestrator = NewExtractionOrchestrator(
            llm_client=self._llm_client,
            embedder_client=self._embedder_client,
            config=pipeline_config,
            embedding_id=str(self.memory_config.embedding_model_id),
            ontology_types=ontology_types,
            language=self.language,
            is_pilot_run=is_pilot_run,
            progress_callback=self.progress_callback,
        )
        # step 1: run knowledge extraction
        dialog_data_list = await new_orchestrator.run(chunked_dialogs)

        # Collect the statements needing async emotion extraction (gathered by the orchestrator after Phase 4)
        # Note: the actual dispatch happens after _store, so Statement nodes are already in Neo4j
        self._emotion_statements = new_orchestrator.emotion_statements

        # ── Snapshot: per-stage extraction outputs ──
        recorder.record_stage_outputs(new_orchestrator.last_stage_outputs)

        # step 2: build graph nodes and edges
        graph = await build_graph_nodes_and_edges(
            dialog_data_list=dialog_data_list,
            embedder_client=self._embedder_client,
            progress_callback=self.progress_callback,
        )

        # Snapshot: graph nodes and edges (pre-dedup)
        recorder.record_graph_before_dedup(graph)

        # step 3: first-layer dedup/disambiguation (merging entity fragments within one conversation turn)
        # The second layer (joint dedup against Neo4j) is deferred to after _store and runs asynchronously
        dedup_result = await run_dedup(
            entity_nodes=graph.entity_nodes,
            statement_entity_edges=graph.stmt_entity_edges,
            entity_entity_edges=graph.entity_entity_edges,
            dialog_data_list=dialog_data_list,
            pipeline_config=pipeline_config,
            connector=None,
            llm_client=self._llm_client,
            is_pilot_run=True,
            progress_callback=self.progress_callback,
        )

        # Snapshot: post-dedup
        recorder.record_dedup_result(dedup_result)

        # step 4: assemble the final result
        result = ExtractionResult(
            dialogue_nodes=graph.dialogue_nodes,
            chunk_nodes=graph.chunk_nodes,
            statement_nodes=graph.statement_nodes,
            entity_nodes=dedup_result.entity_nodes,
            perceptual_nodes=graph.perceptual_nodes,
            stmt_chunk_edges=graph.stmt_chunk_edges,
            stmt_entity_edges=dedup_result.statement_entity_edges,
            entity_entity_edges=dedup_result.entity_entity_edges,
            perceptual_edges=graph.perceptual_edges,
            assistant_original_nodes=graph.assistant_original_nodes,
            assistant_pruned_nodes=graph.assistant_pruned_nodes,
            assistant_pruned_edges=graph.assistant_pruned_edges,
            assistant_dialog_edges=graph.assistant_dialog_edges,
            dialog_data_list=dialog_data_list,
        )

        recorder.record_summary(result.stats)
        return result
    # ──────────────────────────────────────────────
    # Step 3: Store
    # ──────────────────────────────────────────────

    async def _store(self, result: ExtractionResult) -> None:
        """
        Store: alias cleanup → Neo4j write (with deadlock retry).

        Error policy:
        - Alias cleanup failure → warning log, continue with the write
        - Neo4j write deadlock → retry up to 3 times with linearly increasing backoff
        - Non-deadlock Neo4j exception → re-raise, aborting the flow
        """
        from app.repositories.neo4j.graph_saver import (
            save_dialog_and_statements_to_neo4j,
        )

        # 1. Pre-write alias cleanup (failure does not abort)
        await self._clean_cross_role_aliases(result.entity_nodes)

        # 2. Neo4j write (with deadlock retry)
        max_retries = 3
        for attempt in range(max_retries):
            try:
                success = await save_dialog_and_statements_to_neo4j(
                    dialogue_nodes=result.dialogue_nodes,
                    chunk_nodes=result.chunk_nodes,
                    statement_nodes=result.statement_nodes,
                    entity_nodes=result.entity_nodes,
                    perceptual_nodes=result.perceptual_nodes,
                    statement_chunk_edges=result.stmt_chunk_edges,
                    statement_entity_edges=result.stmt_entity_edges,
                    entity_edges=result.entity_entity_edges,
                    perceptual_edges=result.perceptual_edges,
                    connector=self._neo4j_connector,
                    assistant_original_nodes=result.assistant_original_nodes,
                    assistant_pruned_nodes=result.assistant_pruned_nodes,
                    assistant_pruned_edges=result.assistant_pruned_edges,
                    assistant_dialog_edges=result.assistant_dialog_edges,
                )
                if success:
                    logger.debug("Successfully saved all data to Neo4j")
                    return
                # The write returned False (partial failure)
                if attempt < max_retries - 1:
                    logger.warning(
                        f"Neo4j 写入部分失败,重试 ({attempt + 2}/{max_retries})"
                    )
                    await asyncio.sleep(1 * (attempt + 1))
                else:
                    logger.error(f"Neo4j 写入在 {max_retries} 次尝试后仍部分失败")
            except Exception as e:
                if self._is_deadlock(e) and attempt < max_retries - 1:
                    logger.warning(f"Neo4j 死锁,重试 ({attempt + 2}/{max_retries})")
                    await asyncio.sleep(1 * (attempt + 1))
                else:
                    raise
    # ──────────────────────────────────────────────
    # Step 3.2: Alias merge (in memory)
    # ──────────────────────────────────────────────

    def _merge_alias_in_memory(self, result: ExtractionResult) -> None:
        """Alias merge (in memory): handle edges with predicate="别名属于" (alias-of) and predicate="别名失效" (alias-expired).

        Runs before the Neo4j write so the persisted data already has aliases merged:
        - 别名属于: append the alias entity's name to the target entity's aliases
        - 别名属于: concatenate the alias entity's description onto the target entity's description
        - 别名失效: remove the stale alias from the target entity's aliases
        - Redirect edges pointing at alias nodes to the target node

        Pure in-memory operation; Neo4j is not involved.
        """
        ALIAS_PREDICATE = "别名属于"
        ALIAS_INVALID_PREDICATE = "别名失效"

        alias_edges = [
            e
            for e in result.entity_entity_edges
            if getattr(e, "relation_type", "") == ALIAS_PREDICATE
            or getattr(e, "predicate", "") == ALIAS_PREDICATE
        ]
        invalid_alias_edges = [
            e
            for e in result.entity_entity_edges
            if getattr(e, "relation_type", "") == ALIAS_INVALID_PREDICATE
            or getattr(e, "predicate", "") == ALIAS_INVALID_PREDICATE
        ]

        if not alias_edges and not invalid_alias_edges:
            logger.debug("[AliasMerge] 无 '别名属于'/'别名失效' 关系,跳过")
            return

        try:
            entity_map = {e.id: e for e in result.entity_nodes}
            alias_to_target: dict[str, str] = {}

            # ── Handle 别名属于: append to aliases ──
            for edge in alias_edges:
                source_node = entity_map.get(edge.source)
                target_node = entity_map.get(edge.target)
                if not source_node or not target_node:
                    continue

                alias_to_target[edge.source] = edge.target

                # Append source.name to target.aliases (deduplicated, case-insensitive)
                source_name = (source_node.name or "").strip()
                if source_name:
                    existing_lower = {a.lower() for a in (target_node.aliases or [])}
                    if source_name.lower() not in existing_lower:
                        target_node.aliases = list(target_node.aliases or []) + [
                            source_name
                        ]

                # Concatenate source.description onto target.description (semicolon-separated, deduplicated)
                src_desc = (source_node.description or "").strip()
                if src_desc:
                    tgt_desc = (target_node.description or "").strip()
                    if src_desc not in tgt_desc:
                        target_node.description = (
                            f"{tgt_desc};{src_desc}" if tgt_desc else src_desc
                        )

            # ── Handle 别名失效: remove the stale alias from aliases ──
            invalid_alias_to_target: dict[str, str] = {}
            for edge in invalid_alias_edges:
                source_node = entity_map.get(edge.source)
                target_node = entity_map.get(edge.target)
                if not source_node or not target_node:
                    continue

                invalid_alias_to_target[edge.source] = edge.target

                # Remove source.name from target.aliases (case-insensitive)
                invalid_name = (source_node.name or "").strip()
                if invalid_name and target_node.aliases:
                    target_node.aliases = [
                        a for a in target_node.aliases
                        if a.lower() != invalid_name.lower()
                    ]
                    logger.debug(
                        f"[AliasMerge] 从 '{target_node.name}' 的 aliases 中移除失效别名 '{invalid_name}'"
                    )

            # Redirect edges pointing at alias nodes to the target node
            alias_ids = set(alias_to_target.keys()) | set(invalid_alias_to_target.keys())
            all_alias_map = {**alias_to_target, **invalid_alias_to_target}
            redirected_ee_count = 0
            redirected_se_count = 0

            for edge in result.entity_entity_edges:
                rel_type = getattr(edge, "relation_type", "")
                if rel_type in (ALIAS_PREDICATE, ALIAS_INVALID_PREDICATE):
                    continue
                if edge.source in alias_ids:
                    edge.source = all_alias_map[edge.source]
                    redirected_ee_count += 1
                if edge.target in alias_ids:
                    edge.target = all_alias_map[edge.target]
                    redirected_ee_count += 1

            for edge in result.stmt_entity_edges:
                if edge.target in alias_ids:
                    edge.target = all_alias_map[edge.target]
                    redirected_se_count += 1

            logger.info(
                f"[AliasMerge] 内存归并完成,处理 {len(alias_edges)} 条 '别名属于' 边,"
                f"{len(invalid_alias_edges)} 条 '别名失效' 边,"
                f"重定向 entity_entity 边 {redirected_ee_count} 次,"
                f"重定向 stmt_entity 边 {redirected_se_count} 次"
            )

        except Exception as e:
            logger.warning(
                f"[AliasMerge] 内存归并失败(不影响主流程): {e}", exc_info=True
            )
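A toy illustration of the redirect step above, with bare dicts standing in for the real edge models; only the mapping logic is shown:

```python
# alias_to_target maps an alias node's id to its canonical node's id.
alias_to_target = {"e-alias": "e-canonical"}
edges = [
    {"source": "e-alias", "target": "e-other", "relation_type": "喜欢"},
    {"source": "e-other", "target": "e-alias", "relation_type": "认识"},
]

for edge in edges:
    # Skip the alias edges themselves; every other edge gets rewired.
    if edge["relation_type"] == "别名属于":
        continue
    if edge["source"] in alias_to_target:
        edge["source"] = alias_to_target[edge["source"]]
    if edge["target"] in alias_to_target:
        edge["target"] = alias_to_target[edge["target"]]

assert edges[0]["source"] == "e-canonical"
assert edges[1]["target"] == "e-canonical"
```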
    # ──────────────────────────────────────────────
    # Step 3.5: Async post-processing (Neo4j alias merge + second-layer dedup)
    # ──────────────────────────────────────────────

    async def _post_store_async_tasks(self, result: ExtractionResult) -> None:
        """Submit the post-write async Celery tasks (all fire-and-forget; failures never affect the main flow):

        1. Neo4j alias merge + second-layer dedup
        2. Async emotion extraction
        3. Async metadata extraction
        """
        from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
            collect_user_entities_for_metadata,
        )

        llm_model_id = (
            str(self.memory_config.llm_model_id)
            if self.memory_config.llm_model_id
            else None
        )
        recorder = getattr(self, "_recorder", None)
        snapshot_dir = (
            recorder.snapshot_dir
            if recorder is not None and recorder.enabled
            else None
        )

        # ── 1. Neo4j alias merge + second-layer dedup ──
        self._submit_celery_task(
            "PostStore",
            "app.tasks.post_store_dedup_and_alias_merge",
            {
                "end_user_id": self.end_user_id,
                "entity_ids": [e.id for e in result.entity_nodes],
                "llm_model_id": llm_model_id,
                "snapshot_dir": snapshot_dir,
            },
        )

        # ── 2. Async emotion extraction ──
        emotion_statements = getattr(self, "_emotion_statements", [])
        if emotion_statements and llm_model_id:
            self._submit_celery_task(
                "Emotion",
                "app.tasks.extract_emotion_batch",
                {
                    "statements": emotion_statements,
                    "llm_model_id": llm_model_id,
                    "language": self.language,
                    "snapshot_dir": snapshot_dir,
                },
            )

        # ── 3. Async metadata extraction ──
        user_entities = collect_user_entities_for_metadata(result.entity_nodes)
        if user_entities and llm_model_id:
            self._submit_celery_task(
                "Metadata",
                "app.tasks.extract_metadata_batch",
                {
                    "user_entities": user_entities,
                    "llm_model_id": llm_model_id,
                    "language": self.language,
                    "snapshot_dir": snapshot_dir,
                },
            )

    def _submit_celery_task(
        self, label: str, task_name: str, kwargs: dict
    ) -> None:
        """Common helper for submitting Celery async tasks. Failures are logged, never raised."""
        try:
            from app.celery_app import celery_app

            task_result = celery_app.send_task(task_name, kwargs=kwargs)
            logger.info(f"[{label}] 异步任务已提交 - task_id={task_result.id}")
        except Exception as e:
            logger.error(
                f"[{label}] 提交异步任务失败(不影响主流程): {e}",
                exc_info=True,
            )
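These task names match the routes added to `celery_app.conf` earlier in this changeset. A hedged sketch of what the receiving side could look like; only the task name and kwargs mirror the dispatch above, while the decorator options and body are assumptions:

```python
from celery import shared_task

@shared_task(name="app.tasks.extract_emotion_batch", ignore_result=True)
def extract_emotion_batch(statements, llm_model_id, language="zh", snapshot_dir=None):
    # Hypothetical body: the real task lives in app/tasks and runs IO-bound
    # LLM calls on the memory_tasks queue, per the routing comments.
    for stmt in statements:
        _ = (stmt, llm_model_id, language)  # process each statement
```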
    # ──────────────────────────────────────────────
    # Step 4: Cluster
    # ──────────────────────────────────────────────

    async def _cluster(self, result: ExtractionResult) -> None:
        """
        Cluster: submit a Celery async task for the incremental community update.

        Clustering never blocks the main write flow, and its failure does not affect the write result.
        It runs asynchronously via Celery, with LabelPropagationEngine doing the actual computation.

        Note: ExtractionResult.entity_nodes is already the output of the two-phase
        dedup/disambiguation in _extract() (_run_dedup_and_write_summary), so
        clustering runs directly on the deduplicated entity IDs.
        """
        if not result.entity_nodes:
            return

        try:
            from app.tasks import run_incremental_clustering

            new_entity_ids = [e.id for e in result.entity_nodes]
            task = run_incremental_clustering.apply_async(
                kwargs={
                    "end_user_id": self.end_user_id,
                    "new_entity_ids": new_entity_ids,
                    "llm_model_id": (
                        str(self.memory_config.llm_model_id)
                        if self.memory_config.llm_model_id
                        else None
                    ),
                    "embedding_model_id": (
                        str(self.memory_config.embedding_model_id)
                        if self.memory_config.embedding_model_id
                        else None
                    ),
                },
                priority=3,
            )
            logger.info(
                f"[Clustering] 增量聚类任务已提交 - "
                f"task_id = {task.id}, "
                f"entity_count = {len(new_entity_ids)}, "
                f"source=dedup"
            )
        except Exception as e:
            logger.error(
                f"[Clustering] 提交聚类任务失败(不影响主流程): {e}",
                exc_info=True,
            )

    # ──────────────────────────────────────────────
    # Step 5: Summarize
    # (+ entity_description) + meta_data are extracted here
    # ──────────────────────────────────────────────
    # TODO(乐力齐): turn this into an async Celery task
    async def _summarize(self, chunked_dialogs: List[DialogData]) -> None:
        """
        Summarize: generate episodic-memory summaries → write to Neo4j.

        Summary failures do not affect the main flow (the try/except swallows the exception).
        Uses a dedicated Neo4j connector to avoid transaction conflicts with the main one.
        """
        from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import (
            memory_summary_generation,
        )
        from app.repositories.neo4j.add_edges import (
            add_memory_summary_statement_edges,
        )
        from app.repositories.neo4j.add_nodes import add_memory_summary_nodes
        from app.repositories.neo4j.neo4j_connector import Neo4jConnector

        try:
            summaries = await memory_summary_generation(
                chunked_dialogs,
                llm_client=self._llm_client,
                embedder_client=self._embedder_client,
                language=self.language,
            )
            ms_connector = Neo4jConnector()
            try:
                await add_memory_summary_nodes(summaries, ms_connector)
                await add_memory_summary_statement_edges(summaries, ms_connector)
            finally:
                try:
                    await ms_connector.close()
                except Exception:
                    pass
        except Exception as e:
            logger.error(f"Memory summary step failed: {e}", exc_info=True)

    # ──────────────────────────────────────────────
    # Helpers
    # ──────────────────────────────────────────────

    def _init_clients(self) -> None:
        """
        Build the LLM and embedding clients from MemoryConfig.

        Uses the MemoryClientFactory factory, which needs a short-lived DB session
        to look up the model configuration (API key, base_url, etc.); the session
        is released as soon as the lookup is done.
        """
        from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
        from app.db import get_db_context

        with get_db_context() as db:
            factory = MemoryClientFactory(db)
            self._llm_client = factory.get_llm_client_from_config(self.memory_config)
            self._embedder_client = factory.get_embedder_client_from_config(
                self.memory_config
            )
        logger.info("LLM and embedding clients constructed")

    def _init_neo4j_connector(self) -> None:
        """Initialise the Neo4j connector."""
        from app.repositories.neo4j.neo4j_connector import Neo4jConnector

        self._neo4j_connector = Neo4jConnector()

    def _load_ontology_types(self):
        """
        Load the ontology type configuration.

        When memory_config carries a scene_id, load the ontology types associated
        with that scene from the database to guide triplet extraction.
        """
        if not self.memory_config.scene_id:
            return None

        try:
            from app.core.memory.ontology_services.ontology_type_loader import (
                load_ontology_types_for_scene,
            )
            from app.db import get_db_context

            with get_db_context() as db:
                ontology_types = load_ontology_types_for_scene(
                    scene_id=self.memory_config.scene_id,
                    workspace_id=self.memory_config.workspace_id,
                    db=db,
                )
            if ontology_types:
                logger.info(
                    f"Loaded {len(ontology_types.types)} ontology types "
                    f"for scene_id: {self.memory_config.scene_id}"
                )
            return ontology_types
        except Exception as e:
            logger.warning(
                f"Failed to load ontology types for scene_id "
                f"{self.memory_config.scene_id}: {e}",
                exc_info=True,
            )
            return None

    async def _clean_cross_role_aliases(
        self, entity_nodes: List[ExtractedEntityNode]
    ) -> None:
        """
        Clean up alias cross-contamination between user and AI-assistant entities.

        Fetches the existing AI-assistant aliases from Neo4j, merges them with the
        assistant aliases found in this round's entities, and ensures user
        entities' aliases contain no assistant names.
        Failure does not abort the main flow.
        """
        try:
            from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
                clean_cross_role_aliases,
                fetch_neo4j_assistant_aliases,
            )

            neo4j_assistant_aliases = set()
            if entity_nodes:
                eu_id = entity_nodes[0].end_user_id
                if eu_id:
                    neo4j_assistant_aliases = await fetch_neo4j_assistant_aliases(
                        self._neo4j_connector, eu_id
                    )
            clean_cross_role_aliases(
                entity_nodes,
                external_assistant_aliases=neo4j_assistant_aliases,
            )
            logger.info(
                f"别名清洗完成,AI助手别名排除集大小: {len(neo4j_assistant_aliases)}"
            )
        except Exception as e:
            logger.warning(f"别名清洗失败(不影响主流程): {e}")

    @staticmethod
    def _is_deadlock(e: Exception) -> bool:
        """Return True when the exception is a Neo4j deadlock error."""
        msg = str(e).lower()
        return "deadlockdetected" in msg or "deadlock" in msg

    async def _update_stats_cache(self, result: ExtractionResult) -> None:
        """
        Write the extraction statistics into the Redis activity cache, keyed by workspace_id.
        Failure does not abort the main flow.
        """
        try:
            from app.cache.memory.activity_stats_cache import (
                ActivityStatsCache,
            )

            stats = {
                "chunk_count": result.stats["chunk_count"],
                "statements_count": result.stats["statement_count"],
                "triplet_entities_count": result.stats["entity_count"],
                "triplet_relations_count": result.stats["relation_count"],
                "temporal_count": 0,
            }
            await ActivityStatsCache.set_activity_stats(
                workspace_id=str(self.memory_config.workspace_id),
                stats=stats,
            )
            logger.info(
                f"活动统计已写入 Redis: workspace_id={self.memory_config.workspace_id}"
            )
        except Exception as e:
            logger.warning(f"写入活动统计缓存失败(不影响主流程): {e}")

    async def _cleanup(self) -> None:
        """
        Release resources: close the Neo4j connector and the HTTP clients.
        Called from the finally block of run() so resources are always freed.
        """
        # Close the Neo4j connector
        if self._neo4j_connector:
            try:
                await self._neo4j_connector.close()
            except Exception as e:
                logger.error(f"Error closing Neo4j connector: {e}")

        # Close the underlying httpx clients of the LLM/embedder.
        # Prevents 'RuntimeError: Event loop is closed' from firing during garbage collection.
        for client_obj in (self._llm_client, self._embedder_client):
            try:
                underlying = getattr(client_obj, "client", None) or getattr(
                    client_obj, "model", None
                )
                if underlying is None:
                    continue
                inner = getattr(underlying, "_model", underlying)
                http_client = getattr(inner, "async_client", None)
                if http_client is not None and hasattr(http_client, "aclose"):
                    await http_client.aclose()
            except Exception:
                pass
File diff suppressed because it is too large
@@ -1,7 +1,7 @@
"""
Scene-specific configuration — the unified filler-phrase lexicon.

Importance judgement is now handled entirely by the extract_pruning.jinja2 prompt + the LLM preserve_tokens mechanism.
This module only keeps the unified filler-phrase lexicon (filler_phrases), used to spot meaningless pleasantries/emojis/verbal tics.
All scenes share the same lexicon; scene differences are handled by the LLM's semantic judgement.
"""
@@ -117,12 +117,18 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):
    except Exception:
        pass

    # Description merge (deduplicated concatenation, semicolon-separated)
    try:
        desc_a = (getattr(canonical, "description", "") or "").strip()
        desc_b = (getattr(ent, "description", "") or "").strip()
        if desc_b and desc_b != desc_a:
            if desc_a:
                # Split the existing description on semicolons and check whether the new one is already present
                existing_parts = {p.strip() for p in desc_a.replace(";", ";").split(";") if p.strip()}
                if desc_b not in existing_parts:
                    canonical.description = f"{desc_a};{desc_b}"
            else:
                canonical.description = desc_b
    # Fact-summary merge: keep a single "实体: name" line, dedupe source lines while preserving order
    # TODO: fact_summary is disabled for now; enable once the feature is fully developed
    # fact_a = getattr(canonical, "fact_summary", "") or ""

@@ -177,14 +183,8 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):

    # Time-range merge
    try:
        # Uniformly use created_at / expired_at
        if getattr(ent, "created_at", None) and getattr(canonical, "created_at", None) and ent.created_at < canonical.created_at:
            canonical.created_at = ent.created_at
        # Note: guarding on ent.expired_at alone, so the "canonical has no expiry yet" branch is reachable
        if getattr(ent, "expired_at", None):
            if canonical.expired_at is None:
                canonical.expired_at = ent.expired_at
            elif ent.expired_at > canonical.expired_at:
                canonical.expired_at = ent.expired_at
    except Exception:
        pass
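A self-contained sketch of the semicolon-dedup merge rule above, with plain strings standing in for entity attributes (the full-width ';' normalisation mirrors the merge code):

```python
def merge_descriptions(desc_a: str, desc_b: str) -> str:
    """Merge two descriptions, deduplicating on semicolon-separated parts."""
    desc_a, desc_b = desc_a.strip(), desc_b.strip()
    if not desc_b or desc_b == desc_a:
        return desc_a
    if not desc_a:
        return desc_b
    # Normalise full-width semicolons before splitting, as the merge code does
    existing = {p.strip() for p in desc_a.replace(";", ";").split(";") if p.strip()}
    return desc_a if desc_b in existing else f"{desc_a};{desc_b}"

assert merge_descriptions("喜欢咖啡", "住在上海") == "喜欢咖啡;住在上海"
assert merge_descriptions("喜欢咖啡;住在上海", "住在上海") == "喜欢咖啡;住在上海"
```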
@@ -1112,6 +1112,39 @@ async def deduplicate_entities_and_edges(
    # This is the point in the main flow where relation dedup/disambiguation will later live; the helpers can be defined elsewhere
    # Edges are processed uniformly here, rewriting source/target to canonical IDs via the accumulated id_redirect
    # 4) Edge redirection and dedup
    # 4.0 Pre-step: fold the source.name/description of "别名属于" (alias-of) edges into the target node.
    #     Must run before edge redirection, once id_redirect already holds the exact/fuzzy/LLM merge results.
    try:
        entity_by_id: Dict[str, ExtractedEntityNode] = {e.id: e for e in deduped_entities}
        for edge in entity_entity_edges:
            if getattr(edge, "relation_type", "") != "别名属于":
                continue
            # Resolve the post-merge canonical nodes through id_redirect
            source_id = id_redirect.get(edge.source, edge.source)
            target_id = id_redirect.get(edge.target, edge.target)
            if source_id == target_id:
                continue
            source_node = entity_by_id.get(source_id)
            target_node = entity_by_id.get(target_id)
            if not source_node or not target_node:
                continue

            # Append source.name to target.aliases (deduplicated, case-insensitive)
            source_name = (source_node.name or "").strip()
            if source_name:
                existing_lower = {a.lower() for a in (target_node.aliases or [])}
                if source_name.lower() not in existing_lower and source_name.lower() != (target_node.name or "").lower():
                    target_node.aliases = list(target_node.aliases or []) + [source_name]

            # Append source.description to target.description (semicolon-separated, deduplicated)
            src_desc = (source_node.description or "").strip()
            if src_desc:
                tgt_desc = (target_node.description or "").strip()
                if src_desc not in tgt_desc:
                    target_node.description = f"{tgt_desc};{src_desc}" if tgt_desc else src_desc
    except Exception:
        pass

    # 4.1 Statement→entity edges: prefer the strong edge when duplicated
    stmt_ent_map: Dict[str, StatementEntityEdge] = {}
    for edge in statement_entity_edges:
@@ -65,7 +65,6 @@ def _row_to_entity(row: Dict[str, Any]) -> ExtractedEntityNode:
        user_id=row.get("user_id") or "",
        apply_id=row.get("apply_id") or "",
        created_at=_parse_dt(row.get("created_at")),
        expired_at=_parse_dt(row.get("expired_at")) if row.get("expired_at") else None,
        entity_idx=int(row.get("entity_idx") or 0),
        statement_id=row.get("statement_id") or "",
        entity_type=row.get("entity_type") or "",
File diff suppressed because it is too large
@@ -0,0 +1,932 @@
"""Refactored ExtractionOrchestrator using the unified ExtractionStep paradigm.

This module provides ``NewExtractionOrchestrator`` — a slimmed-down orchestrator
(~500 lines vs ~2500) that delegates extraction work to concrete ExtractionStep
instances and uses SidecarStepFactory for hot-pluggable sidecar modules.

The new orchestrator coexists with the legacy ``ExtractionOrchestrator`` until
the team explicitly switches over.

Execution phases:
    1. Statement extraction + concurrent chunk/dialog embedding
    2. Triplet extraction + concurrent after_statement sidecars + statement embedding
    3. Entity embedding + concurrent after_triplet sidecars
    4. Data assignment back to dialog_data_list
"""

from __future__ import annotations

import asyncio
import logging
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

from app.core.memory.models.message_models import DialogData
from app.core.memory.models.variate_config import ExtractionPipelineConfig

from .steps.base import ExtractionStep, StepContext
from .steps.embedding_step import EmbeddingStep
from .sidecar_factory import SidecarStepFactory, SidecarTiming
from .steps.statement_temporal_step import StatementTemporalExtractionStep
from .steps.triplet_step import TripletExtractionStep
from .steps.schema import (
    EmbeddingStepInput,
    EmbeddingStepOutput,
    EmotionStepInput,
    EmotionStepOutput,
    MessageItem,
    StatementStepInput,
    StatementStepOutput,
    SupportingContext,
    TripletStepInput,
    TripletStepOutput,
)

logger = logging.getLogger(__name__)


class NewExtractionOrchestrator:
    """Slimmed-down extraction orchestrator using the ExtractionStep paradigm.

    Responsibilities:
        * Initialise all steps and sidecar groups via ``SidecarStepFactory``
        * Route data between stages (``_convert_to_*`` helpers)
        * Orchestrate concurrent execution (``_run_with_sidecars``)
        * Assign extracted results back to ``DialogData`` objects

    The orchestrator does **not** own dedup, node/edge creation, or Neo4j writes.
    Those remain in ``WritePipeline`` / ``dedup_step``.
    """

    def __init__(
        self,
        llm_client: Any,
        embedder_client: Any,
        config: Optional[ExtractionPipelineConfig] = None,
        embedding_id: Optional[str] = None,
        ontology_types: Any = None,
        language: str = "zh",
        is_pilot_run: bool = False,
        progress_callback: Optional[
            Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
        ] = None,
    ) -> None:
        self.config = config or ExtractionPipelineConfig()
        self.is_pilot_run = is_pilot_run
        self.embedding_id = embedding_id
        self.progress_callback = progress_callback

        # Build shared context for all LLM-based steps
        self.context = StepContext(
            llm_client=llm_client,
            language=language,
            config=self.config,
            is_pilot_run=is_pilot_run,
            progress_callback=progress_callback,
        )

        # ── Critical (main-line) steps ──
        self.statement_temporal_step = StatementTemporalExtractionStep(self.context)
        self.triplet_step = TripletExtractionStep(
            self.context, ontology_types=ontology_types
        )

        # ── Embedding step (non-LLM, separate client) ──
        self.embedding_step = EmbeddingStep(
            embedder_client=embedder_client,
            is_pilot_run=is_pilot_run,
        )

        # ── Sidecar steps (auto-discovered via @register decorator) ──
        sidecar_groups = SidecarStepFactory.create_sidecars(self.config, self.context)
        self.after_statement_sidecars: List[ExtractionStep] = sidecar_groups[
            SidecarTiming.AFTER_STATEMENT
        ]
        self.after_triplet_sidecars: List[ExtractionStep] = sidecar_groups[
            SidecarTiming.AFTER_TRIPLET
        ]

        logger.debug(
            "NewExtractionOrchestrator initialised — "
            "after_statement sidecars: %d, after_triplet sidecars: %d",
            len(self.after_statement_sidecars),
            len(self.after_triplet_sidecars),
        )
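The "@register decorator" mentioned above is not shown in this diff, so the following is a hypothetical sketch of what a hot-pluggable sidecar could look like; the decorator name and signature are assumptions, and only `ExtractionStep`, `SidecarStepFactory`, `SidecarTiming`, `name`, `run`, and `get_default_output` are taken from the surrounding code:

```python
# Hypothetical sidecar registration, inferred from the comments above.
from .sidecar_factory import SidecarStepFactory, SidecarTiming
from .steps.base import ExtractionStep

@SidecarStepFactory.register(SidecarTiming.AFTER_STATEMENT)  # assumed API
class ToySidecar(ExtractionStep):
    name = "toy_sidecar"

    async def run(self, input_data):
        # Side work that must never break the main line; failures fall back
        # to get_default_output() via _run_sidecar_safe.
        return {"seen": getattr(input_data, "statement_id", None)}

    def get_default_output(self):
        return {}
```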
    # ──────────────────────────────────────────────
    # 1. The concurrent execution engine
    #    Safely schedules the main line together with the sidecars
    # ──────────────────────────────────────────────

    @staticmethod
    async def _run_sidecar_safe(
        step: ExtractionStep, input_data: Any
    ) -> Any:
        """Run a sidecar step, returning its default output on failure."""
        try:
            return await step.run(input_data)
        except Exception as exc:
            logger.warning(
                "Sidecar '%s' raised during gather — using default output: %s",
                step.name,
                exc,
            )
            return step.get_default_output()

    async def _run_with_sidecars(
        self,
        critical_coro: Any,
        sidecars: List[Tuple[ExtractionStep, Any]],
        extra_coros: Optional[List[Any]] = None,
    ) -> Tuple[Any, List[Any], List[Any]]:
        """Run a critical coroutine concurrently with sidecar steps.

        Args:
            critical_coro: The awaitable for the critical (main-line) step.
            sidecars: List of ``(step, input_data)`` pairs for sidecar steps.
            extra_coros: Additional non-sidecar coroutines to run concurrently
                (e.g. embedding generation).

        Returns:
            A 3-tuple of:
                * The critical step result (exception propagated if it fails).
                * A list of sidecar results (default outputs on failure).
                * A list of extra coroutine results (empty list if none).

        Raises:
            Exception: If the critical coroutine fails, the exception propagates.
        """
        sidecar_coros = [
            self._run_sidecar_safe(step, inp) for step, inp in sidecars
        ]
        extra = extra_coros or []

        # Gather everything concurrently
        all_coros = [critical_coro] + sidecar_coros + extra
        results = await asyncio.gather(*all_coros, return_exceptions=True)

        # Unpack: first result is critical, then sidecars, then extras
        critical_result = results[0]
        n_sidecars = len(sidecar_coros)
        sidecar_results = list(results[1 : 1 + n_sidecars])
        extra_results = list(results[1 + n_sidecars :])

        # Critical step failure → propagate
        if isinstance(critical_result, BaseException):
            raise critical_result

        # Sidecar failures should already be handled by _run_sidecar_safe,
        # but guard against unexpected exceptions from gather
        for i, res in enumerate(sidecar_results):
            if isinstance(res, BaseException):
                step = sidecars[i][0]
                logger.warning(
                    "Sidecar '%s' unexpected exception in gather: %s",
                    step.name,
                    res,
                )
                sidecar_results[i] = step.get_default_output()

        # Extra coroutine failures → log and replace with None
        for i, res in enumerate(extra_results):
            if isinstance(res, BaseException):
                logger.warning("Extra coroutine %d failed: %s", i, res)
                extra_results[i] = None

        return critical_result, sidecar_results, extra_results
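A self-contained illustration of the gather pattern used above — the critical coroutine's failure propagates, while a sidecar failure degrades to a default value; everything here is standard-library code, independent of the codebase:

```python
import asyncio

async def critical():
    return "main-ok"

async def flaky_sidecar():
    raise RuntimeError("sidecar down")

async def demo():
    results = await asyncio.gather(
        critical(), flaky_sidecar(), return_exceptions=True
    )
    main, side = results
    if isinstance(main, BaseException):
        raise main                      # critical failure propagates
    if isinstance(side, BaseException):
        side = {}                       # sidecar failure → default output
    return main, side

print(asyncio.run(demo()))              # ('main-ok', {})
```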
    # ──────────────────────────────────────────────
    # 2. Inter-stage data conversion
    #    Turns the previous stage's StepOutput into the next stage's StepInput
    # ──────────────────────────────────────────────

    @staticmethod
    def _build_supporting_context(
        dialog: DialogData,
    ) -> SupportingContext:
        """Build a SupportingContext from a dialog's content for pronoun resolution."""
        msgs: List[MessageItem] = []
        if hasattr(dialog, "content") and dialog.content:
            # dialog.content is the raw conversation string; wrap as single msg
            msgs.append(MessageItem(role="context", msg=dialog.content))
        return SupportingContext(msgs=msgs)

    @staticmethod
    def _convert_to_triplet_input(
        stmt_out: StatementStepOutput,
        supporting_context: SupportingContext,
    ) -> TripletStepInput:
        """Convert a StatementStepOutput into a TripletStepInput."""
        return TripletStepInput(
            statement_id=stmt_out.statement_id,
            statement_text=stmt_out.statement_text,
            statement_type=stmt_out.statement_type,
            temporal_type=stmt_out.temporal_type,
            supporting_context=supporting_context,
            speaker=stmt_out.speaker,
            dialog_at=stmt_out.dialog_at or "",
            valid_at=stmt_out.valid_at,
            invalid_at=stmt_out.invalid_at,
            has_unsolved_reference=stmt_out.has_unsolved_reference,
        )

    @staticmethod
    def _convert_to_emotion_input(
        stmt_out: StatementStepOutput,
    ) -> EmotionStepInput:
        """Convert a StatementStepOutput into an EmotionStepInput."""
        return EmotionStepInput(
            statement_id=stmt_out.statement_id,
            statement_text=stmt_out.statement_text,
            speaker=stmt_out.speaker,
        )
# ──────────────────────────────────────────────
|
||||
# 3. 流水线执行入口
|
||||
# 公开接口 run() → 分发到 pilot / full 模式
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def run(
|
||||
self,
|
||||
dialog_data_list: List[DialogData],
|
||||
) -> List[DialogData]:
|
||||
"""Run the full extraction pipeline on *dialog_data_list*.
|
||||
|
||||
Returns the mutated *dialog_data_list* with extracted data assigned
|
||||
to each statement (triplets, temporal info, emotions, embeddings).
|
||||
|
||||
The orchestrator does NOT create graph nodes/edges or run dedup —
|
||||
those responsibilities remain in WritePipeline.
|
||||
"""
|
||||
mode = "pilot" if self.is_pilot_run else "full"
|
||||
logger.info(
|
||||
"Starting extraction pipeline (%s mode), %d dialogs",
|
||||
mode,
|
||||
len(dialog_data_list),
|
||||
)
|
||||
|
||||
if self.is_pilot_run:
|
||||
return await self._run_pilot(dialog_data_list)
|
||||
return await self._run_full(dialog_data_list)
|
||||
|
||||
# ── 3a. 试运行模式:仅 statement + triplet,不生成 embedding 和旁路 ──
|
||||
|
||||
async def _run_pilot(
|
||||
self, dialog_data_list: List[DialogData]
|
||||
) -> List[DialogData]:
|
||||
"""Pilot mode: statement + triplet extraction only, no sidecars or embeddings."""
|
||||
# Phase 1: Statement extraction (chunk-level parallel)
|
||||
logger.debug("Pilot phase 1/2: Statement extraction")
|
||||
all_stmt_results = await self._extract_all_statements(dialog_data_list)
|
||||
|
||||
# Phase 2: Triplet extraction (statement-level parallel)
|
||||
logger.debug("Pilot phase 2/2: Triplet extraction")
|
||||
all_triplet_results = await self._extract_all_triplets(
|
||||
dialog_data_list, all_stmt_results
|
||||
)
|
||||
|
||||
# Assign results back to dialog_data_list
|
||||
self._assign_results(
|
||||
dialog_data_list,
|
||||
all_stmt_results,
|
||||
all_triplet_results,
|
||||
emotion_results={},
|
||||
embedding_output=None,
|
||||
)
|
||||
|
||||
# Store raw step outputs for snapshot/debugging
|
||||
self._last_stage_outputs = {
|
||||
"statement_results": all_stmt_results,
|
||||
"triplet_results": all_triplet_results,
|
||||
"emotion_results": {},
|
||||
"embedding_output": None,
|
||||
}
|
||||
|
||||
if self.progress_callback:
|
||||
statements_count = sum(
|
||||
len(stmts)
|
||||
for chunk_stmts in all_stmt_results.values()
|
||||
for stmts in chunk_stmts.values()
|
||||
)
|
||||
entities_count = sum(
|
||||
len(t_out.entities)
|
||||
for stmt_triplets in all_triplet_results.values()
|
||||
for t_out in stmt_triplets.values()
|
||||
)
|
||||
triplets_count = sum(
|
||||
len(t_out.triplets)
|
||||
for stmt_triplets in all_triplet_results.values()
|
||||
for t_out in stmt_triplets.values()
|
||||
)
|
||||
await self.progress_callback(
|
||||
"knowledge_extraction_complete",
|
||||
"知识抽取完成",
|
||||
{
|
||||
"entities_count": entities_count,
|
||||
"statements_count": statements_count,
|
||||
"temporal_ranges_count": 0,
|
||||
"triplets_count": triplets_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.debug("Pilot extraction complete")
|
||||
return dialog_data_list
|
||||
|
||||
# ── 3b. 正式模式:四阶段并发执行 ──
|
||||
|
||||
async def _run_full(
|
||||
self, dialog_data_list: List[DialogData]
|
||||
) -> List[DialogData]:
|
||||
"""Full mode: all four phases with concurrent sidecars and embeddings."""
|
||||
|
||||
# ── Phase 1: Statement extraction + chunk/dialog embedding ──
|
||||
logger.debug("Phase 1/4: Statement extraction + chunk/dialog embedding")
|
||||
chunk_dialog_emb_input = self._build_chunk_dialog_embedding_input(
|
||||
dialog_data_list
|
||||
)
|
||||
|
||||
stmt_coro = self._extract_all_statements(dialog_data_list)
|
||||
emb_coro = self.embedding_step.run(chunk_dialog_emb_input)
|
||||
|
||||
phase1_results = await asyncio.gather(
|
||||
stmt_coro, emb_coro, return_exceptions=True
|
||||
)
|
||||
|
||||
all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]] = (
|
||||
phase1_results[0]
|
||||
if not isinstance(phase1_results[0], BaseException)
|
||||
else {}
|
||||
)
|
||||
if isinstance(phase1_results[0], BaseException):
|
||||
raise phase1_results[0]
|
||||
|
||||
chunk_dialog_emb: Optional[EmbeddingStepOutput] = (
|
||||
phase1_results[1]
|
||||
if not isinstance(phase1_results[1], BaseException)
|
||||
else None
|
||||
)
|
||||
if isinstance(phase1_results[1], BaseException):
|
||||
logger.warning("Chunk/dialog embedding failed: %s", phase1_results[1])
|
||||
|
||||
# ── Phase 2: Triplet extraction + after_statement sidecars + statement embedding ──
|
||||
logger.debug(
|
||||
"Phase 2/4: Triplet extraction + sidecars + statement embedding"
|
||||
)
|
||||
stmt_emb_input = self._build_statement_embedding_input(
|
||||
dialog_data_list, all_stmt_results
|
||||
)
|
||||
|
||||
# Build sidecar inputs for after_statement sidecars (emotion excluded — async Celery)
|
||||
sidecar_pairs = self._build_after_statement_sidecar_inputs(
|
||||
dialog_data_list, all_stmt_results
|
||||
)
|
||||
|
||||
triplet_coro = self._extract_all_triplets(
|
||||
dialog_data_list, all_stmt_results
|
||||
)
|
||||
stmt_emb_coro = self.embedding_step.run(stmt_emb_input)
|
||||
|
||||
triplet_results, sidecar_results, extra_results = (
|
||||
await self._run_with_sidecars(
|
||||
triplet_coro,
|
||||
sidecar_pairs,
|
||||
extra_coros=[stmt_emb_coro],
|
||||
)
|
||||
)
|
||||
all_triplet_results = triplet_results
|
||||
stmt_emb: Optional[EmbeddingStepOutput] = (
|
||||
extra_results[0] if extra_results else None
|
||||
)
|
||||
|
||||
# Collect sidecar outputs keyed by step name
|
||||
sidecar_steps = [step for step, _inp in sidecar_pairs]
|
||||
sidecar_output_map = self._collect_sidecar_outputs(
|
||||
sidecar_steps, sidecar_results
|
||||
)
|
||||
|
||||
# ── Phase 3: Entity embedding + after_triplet sidecars ──
|
||||
logger.debug("Phase 3/4: Entity embedding + after_triplet sidecars")
|
||||
entity_emb_input = self._build_entity_embedding_input(all_triplet_results)
|
||||
|
||||
after_triplet_pairs: List[Tuple[ExtractionStep, Any]] = []
|
||||
# Future after_triplet sidecars would be wired here
|
||||
|
||||
entity_emb_coro = self.embedding_step.run(entity_emb_input)
|
||||
|
||||
if after_triplet_pairs:
|
||||
_, at_sidecar_results, at_extra = await self._run_with_sidecars(
|
||||
entity_emb_coro,
|
||||
after_triplet_pairs,
|
||||
)
|
||||
entity_emb = at_extra[0] if at_extra else None
|
||||
else:
|
||||
# No after_triplet sidecars — just run embedding
|
||||
entity_emb_result = await entity_emb_coro
|
||||
entity_emb = (
|
||||
entity_emb_result
|
||||
if not isinstance(entity_emb_result, BaseException)
|
||||
else None
|
||||
)
|
||||
|
||||
# Merge all embedding outputs
|
||||
merged_emb = self._merge_embeddings(chunk_dialog_emb, stmt_emb, entity_emb)
|
||||
|
||||
# ── Phase 4: Data assignment ──
|
||||
logger.debug("Phase 4/4: Data assignment")
|
||||
|
||||
self._assign_results(
|
||||
dialog_data_list,
|
||||
all_stmt_results,
|
||||
all_triplet_results,
|
||||
emotion_results={},
|
||||
embedding_output=merged_emb,
|
||||
)
|
||||
|
||||
# ── Fire-and-forget: collect statements for async emotion extraction ──
|
||||
self._emotion_statements: List[Dict[str, str]] = []
|
||||
if self.config.emotion_enabled:
|
||||
self._emotion_statements = self._collect_emotion_statements(all_stmt_results)
|
||||
|
||||
# Store raw step outputs for snapshot/debugging
|
||||
self._last_stage_outputs = {
|
||||
"statement_results": all_stmt_results,
|
||||
"triplet_results": all_triplet_results,
|
||||
"emotion_results": {},
|
||||
"embedding_output": merged_emb,
|
||||
}
|
||||
|
||||
logger.debug("Full extraction pipeline complete")
|
||||
return dialog_data_list
|
||||
|
||||
@property
|
||||
def last_stage_outputs(self) -> Dict[str, Any]:
|
||||
"""Return the raw step outputs from the last run for snapshot/debugging."""
|
||||
return getattr(self, "_last_stage_outputs", {})
|
||||
|
||||
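The body of `_run_with_sidecars` is not shown in this diff. A minimal sketch of the contract the phases above rely on (the critical result may raise; sidecar and extra failures are isolated) could look like the following, assuming the three-tuple return shape used in Phase 2:

import asyncio
from typing import Any, List, Optional, Tuple

async def run_with_sidecars(
    main_coro,
    sidecar_pairs: List[Tuple[Any, Any]],
    extra_coros: Optional[List[Any]] = None,
):
    """Await the critical coroutine alongside sidecars and extras.

    return_exceptions=True keeps sidecar failures from cancelling the
    critical path; only the main result is allowed to raise.
    """
    extra_coros = extra_coros or []
    sidecar_coros = [step.run(inp) for step, inp in sidecar_pairs]
    results = await asyncio.gather(
        main_coro, *sidecar_coros, *extra_coros, return_exceptions=True
    )
    if isinstance(results[0], BaseException):
        raise results[0]  # critical failure aborts the phase
    n = len(sidecar_coros)
    return results[0], list(results[1 : 1 + n]), list(results[1 + n :])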
    # ──────────────────────────────────────────────
    # 4. Extraction executors
    # Chunk-level parallel statement extraction; statement-level parallel triplet extraction
    # ──────────────────────────────────────────────

    async def _extract_all_statements(
        self,
        dialog_data_list: List[DialogData],
    ) -> Dict[str, Dict[str, List[StatementStepOutput]]]:
        """Extract statements from all chunks across all dialogs (chunk-level parallel).

        Returns:
            Nested dict: ``{dialog_id: {chunk_id: [StatementStepOutput, ...]}}``
        """
        # Collect all (chunk, metadata) pairs
        tasks: List[Any] = []
        task_meta: List[Tuple[str, str, str, SupportingContext]] = []

        for dialog in dialog_data_list:
            ctx = self._build_supporting_context(dialog)
            dialogue_content = (
                dialog.content
                if getattr(
                    self.config, "statement_extraction", None
                )
                and getattr(
                    self.config.statement_extraction,
                    "include_dialogue_context",
                    True,
                )
                else None
            )
            for chunk in dialog.chunks:
                # Only skip chunks explicitly marked as assistant; speaker=None (mixed chunking) is processed normally.
                chunk_speaker = getattr(chunk, "speaker", None)
                if chunk_speaker == "assistant":
                    continue
                inp = StatementStepInput(
                    chunk_id=chunk.id,
                    end_user_id=dialog.end_user_id,
                    target_content=chunk.content,
                    target_message_date=str(
                        getattr(dialog, "created_at", "") or ""
                    ),
                    dialog_at=getattr(chunk, "dialog_at", "") or "",
                    supporting_context=ctx,
                )
                tasks.append(self.statement_temporal_step.run(inp))
                task_meta.append(
                    (dialog.id, chunk.id, chunk_speaker, ctx)
                )

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Organise into nested dict
        stmt_map: Dict[str, Dict[str, List[StatementStepOutput]]] = {}
        for i, result in enumerate(results):
            dialog_id, chunk_id, speaker, _ = task_meta[i]
            if dialog_id not in stmt_map:
                stmt_map[dialog_id] = {}

            if isinstance(result, BaseException):
                logger.error("Statement extraction failed for chunk %s: %s", chunk_id, result)
                stmt_map[dialog_id][chunk_id] = []
            else:
                # Override speaker from chunk metadata
                stmts: List[StatementStepOutput] = result if isinstance(result, list) else []
                for s in stmts:
                    s.speaker = speaker
                stmt_map[dialog_id][chunk_id] = stmts
                if self.progress_callback:
                    # Frontend consumes knowledge_extraction_result with data.statement.
                    # Emit one event per statement to keep payload contract simple.
                    for s in stmts:
                        await self.progress_callback(
                            "knowledge_extraction_result",
                            "知识抽取中",
                            {"statement": s.statement_text},
                        )

        return stmt_map

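The pattern above (a parallel `task_meta` list zipped against `gather(..., return_exceptions=True)` results) is worth isolating; here is a self-contained toy version showing how failures stay attached to the failing item instead of aborting the batch:

import asyncio

async def gather_with_meta(coros, meta):
    """Run coroutines in parallel and pair each outcome with its metadata."""
    results = await asyncio.gather(*coros, return_exceptions=True)
    ok, failed = {}, {}
    for m, r in zip(meta, results):
        (failed if isinstance(r, BaseException) else ok)[m] = r
    return ok, failed

async def main():
    async def work(x):
        if x == 2:
            raise ValueError("boom")
        return x * 10
    ok, failed = await gather_with_meta([work(i) for i in range(3)], [0, 1, 2])
    print(ok)      # {0: 0, 1: 10}
    print(failed)  # {2: ValueError('boom')}

asyncio.run(main())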
    async def _extract_all_triplets(
        self,
        dialog_data_list: List[DialogData],
        all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]],
    ) -> Dict[str, Dict[str, TripletStepOutput]]:
        """Extract triplets for every statement (statement-level parallel).

        Returns:
            Nested dict: ``{dialog_id: {statement_id: TripletStepOutput}}``
        """
        tasks: List[Any] = []
        task_meta: List[Tuple[str, str]] = []  # (dialog_id, statement_id)

        for dialog in dialog_data_list:
            ctx = self._build_supporting_context(dialog)
            chunk_stmts = all_stmt_results.get(dialog.id, {})
            for _chunk_id, stmts in chunk_stmts.items():
                for stmt in stmts:
                    # Defensive filter: skip statements explicitly marked as assistant.
                    # speaker=None (mixed chunking) is processed normally.
                    if getattr(stmt, "speaker", None) == "assistant":
                        continue
                    inp = self._convert_to_triplet_input(stmt, ctx)
                    tasks.append(self.triplet_step.run(inp))
                    task_meta.append((dialog.id, stmt.statement_id))

        results = await asyncio.gather(*tasks, return_exceptions=True)

        triplet_map: Dict[str, Dict[str, TripletStepOutput]] = {}
        for i, result in enumerate(results):
            dialog_id, stmt_id = task_meta[i]
            if dialog_id not in triplet_map:
                triplet_map[dialog_id] = {}

            if isinstance(result, BaseException):
                logger.error(
                    "Triplet extraction failed for statement %s: %s",
                    stmt_id,
                    result,
                )
                triplet_map[dialog_id][stmt_id] = self.triplet_step.get_default_output()
            else:
                triplet_map[dialog_id][stmt_id] = result
                if self.progress_callback:
                    await self.progress_callback(
                        "extract_triplet_result",
                        f"statement {stmt_id} 提取完成",
                        {
                            "statement_id": stmt_id,
                            "triplet_count": len(result.triplets),
                            "entity_count": len(result.entities),
                            "triplets": [
                                {
                                    "subject_name": t.subject_name,
                                    "predicate": t.predicate,
                                    "object_name": t.object_name,
                                }
                                for t in result.triplets[:5]
                            ],
                        },
                    )

        return triplet_map

    # ──────────────────────────────────────────────
    # 5. Embedding input builders
    # Build EmbeddingStepInput for the different phases (chunk/statement/entity)
    # ──────────────────────────────────────────────

    @staticmethod
    def _build_chunk_dialog_embedding_input(
        dialog_data_list: List[DialogData],
    ) -> EmbeddingStepInput:
        """Build embedding input for chunks and dialogs (phase 1)."""
        chunk_texts: Dict[str, str] = {}
        dialog_texts: List[str] = []

        for dialog in dialog_data_list:
            if hasattr(dialog, "content") and dialog.content:
                dialog_texts.append(dialog.content)
            for chunk in dialog.chunks:
                chunk_texts[chunk.id] = chunk.content

        return EmbeddingStepInput(
            chunk_texts=chunk_texts,
            dialog_texts=dialog_texts,
        )

    @staticmethod
    def _build_statement_embedding_input(
        dialog_data_list: List[DialogData],
        all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]],
    ) -> EmbeddingStepInput:
        """Build embedding input for statements (phase 2)."""
        stmt_texts: Dict[str, str] = {}
        for _dialog_id, chunk_stmts in all_stmt_results.items():
            for _chunk_id, stmts in chunk_stmts.items():
                for s in stmts:
                    stmt_texts[s.statement_id] = s.statement_text
        return EmbeddingStepInput(statement_texts=stmt_texts)

    @staticmethod
    def _build_entity_embedding_input(
        all_triplet_results: Dict[str, Dict[str, TripletStepOutput]],
    ) -> EmbeddingStepInput:
        """Build embedding input for entities (phase 3)."""
        entity_names: Dict[str, str] = {}
        entity_descs: Dict[str, str] = {}
        seen: set = set()

        for _dialog_id, stmt_triplets in all_triplet_results.items():
            for _stmt_id, triplet_out in stmt_triplets.items():
                for ent in triplet_out.entities:
                    key = f"{ent.entity_idx}_{ent.name}"
                    if key not in seen:
                        seen.add(key)
                        entity_names[key] = ent.name
                        entity_descs[key] = ent.description

        return EmbeddingStepInput(
            entity_names=entity_names,
            entity_descriptions=entity_descs,
        )

    # ──────────────────────────────────────────────
    # 6. Sidecar input construction and result collection
    # Build inputs for after_statement / after_triplet sidecars; merge embedding outputs
    # ──────────────────────────────────────────────

    def _build_after_statement_sidecar_inputs(
        self,
        dialog_data_list: List[DialogData],
        all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]],
    ) -> List[Tuple[ExtractionStep, Any]]:
        """Build (step, input) pairs for after_statement sidecars.

        Emotion extraction is excluded here — it runs asynchronously via Celery.
        """
        if not self.after_statement_sidecars:
            return []

        # Collect all user statements for sidecar processing
        all_user_stmts: List[StatementStepOutput] = []
        for _dialog_id, chunk_stmts in all_stmt_results.items():
            for _chunk_id, stmts in chunk_stmts.items():
                for s in stmts:
                    if s.speaker == "user":
                        all_user_stmts.append(s)

        pairs: List[Tuple[ExtractionStep, Any]] = []
        for sidecar in self.after_statement_sidecars:
            if sidecar.name == "emotion_extraction":
                # Skip — emotion is dispatched as async Celery task after Phase 4
                continue
            # Generic sidecar: pass first statement as representative input
            if all_user_stmts:
                inp = self._convert_to_emotion_input(all_user_stmts[0])
                pairs.append((sidecar, inp))

        return pairs

    @staticmethod
    def _collect_sidecar_outputs(
        sidecars: List[ExtractionStep],
        results: List[Any],
    ) -> Dict[str, Any]:
        """Map sidecar results by step name."""
        output: Dict[str, Any] = {}
        for i, sidecar in enumerate(sidecars):
            if i < len(results):
                output[sidecar.name] = results[i]
        return output

    @staticmethod
    def _merge_embeddings(
        chunk_dialog: Optional[EmbeddingStepOutput],
        statement: Optional[EmbeddingStepOutput],
        entity: Optional[Any],
    ) -> Optional[EmbeddingStepOutput]:
        """Merge partial embedding outputs into a single EmbeddingStepOutput."""
        merged = EmbeddingStepOutput()
        if chunk_dialog:
            merged.chunk_embeddings = chunk_dialog.chunk_embeddings
            merged.dialog_embeddings = chunk_dialog.dialog_embeddings
        if statement:
            merged.statement_embeddings = statement.statement_embeddings
        if entity and isinstance(entity, EmbeddingStepOutput):
            merged.entity_embeddings = entity.entity_embeddings
        return merged

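Note the design choice in `_merge_embeddings`: each phase's output replaces a whole field rather than being merged key-by-key, which is safe because the phases produce disjoint keys. A standalone sketch with a stand-in dataclass (the real fields of EmbeddingStepOutput are defined elsewhere) shows the dict-update alternative for comparison:

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class EmbOut:
    """Stand-in for EmbeddingStepOutput; field shapes are assumptions."""
    chunk_embeddings: Dict[str, List[float]] = field(default_factory=dict)
    statement_embeddings: Dict[str, List[float]] = field(default_factory=dict)
    entity_embeddings: Dict[str, List[float]] = field(default_factory=dict)

def merge(*parts: EmbOut) -> EmbOut:
    # Accumulative merge: tolerates overlapping phases, unlike field overwrite.
    merged = EmbOut()
    for p in parts:
        merged.chunk_embeddings.update(p.chunk_embeddings)
        merged.statement_embeddings.update(p.statement_embeddings)
        merged.entity_embeddings.update(p.entity_embeddings)
    return merged

phase1 = EmbOut(chunk_embeddings={"c1": [0.1, 0.2]})
phase2 = EmbOut(statement_embeddings={"s1": [0.3, 0.4]})
print(merge(phase1, phase2))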
    # ──────────────────────────────────────────────
    # 6.5 Async emotion extraction dispatch
    # Collect user statements and fire-and-forget the Celery task
    # ──────────────────────────────────────────────

    def _collect_emotion_statements(
        self,
        all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]],
    ) -> List[Dict[str, str]]:
        """Collect user statements for async emotion extraction.

        Returns a list of dicts ready to be sent as Celery task payload.
        """
        statements_payload: List[Dict[str, str]] = []
        for _dialog_id, chunk_stmts in all_stmt_results.items():
            for _chunk_id, stmts in chunk_stmts.items():
                for s in stmts:
                    if s.speaker == "user":
                        statements_payload.append({
                            "statement_id": s.statement_id,
                            "statement_text": s.statement_text,
                            "speaker": s.speaker,
                        })
        return statements_payload

    @property
    def emotion_statements(self) -> List[Dict[str, str]]:
        """Statements collected for async emotion extraction after last run."""
        return getattr(self, "_emotion_statements", [])

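The dispatch site itself is not part of this diff. A hedged sketch of what a fire-and-forget enqueue could look like follows; the task name, queue, and kwargs are assumptions for illustration, not taken from this code:

from celery import Celery

celery_app = Celery("app")

def dispatch_emotion_extraction(statements, end_user_id):
    """Fire-and-forget: enqueue emotion extraction without awaiting the result."""
    if not statements:
        return
    celery_app.send_task(
        "app.tasks.extract_emotion_batch",  # assumed task name
        kwargs={"statements": statements, "end_user_id": end_user_id},
        queue="memory_tasks",  # assumed queue
    )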
    # ──────────────────────────────────────────────
    # 7. Data assignment
    # Assemble per-phase StepOutputs into Statement objects, replacing chunk.statements
    # ──────────────────────────────────────────────
    # TODO (乐力齐): this function is dense and long; it needs refactoring
    def _assign_results(
        self,
        dialog_data_list: List[DialogData],
        all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]],
        all_triplet_results: Dict[str, Dict[str, TripletStepOutput]],
        emotion_results: Dict[str, EmotionStepOutput],
        embedding_output: Optional[EmbeddingStepOutput],
    ) -> None:
        """Assign extraction results back to dialog_data_list in-place.

        Replaces chunk.statements with new Statement objects built from step
        outputs, because the new orchestrator generates its own statement IDs
        that don't match the original chunk statement IDs.
        """
        from app.core.memory.models.message_models import (
            Statement,
            TemporalValidityRange,
        )
        from app.core.memory.models.triplet_models import (
            TripletExtractionResponse,
            Entity as TripletEntity,
            Triplet as TripletRelation,
        )
        from app.core.memory.utils.data.ontology import (
            RelevenceInfo,
            StatementType,
            TemporalInfo,
        )

        # Map string values to enums
        _STMT_TYPE_MAP = {
            "FACT": StatementType.FACT,
            "OPINION": StatementType.OPINION,
            "PREDICTION": StatementType.PREDICTION,
            "SUGGESTION": StatementType.SUGGESTION,
        }
        _TEMPORAL_MAP = {
            "STATIC": TemporalInfo.STATIC,
            "DYNAMIC": TemporalInfo.DYNAMIC,
            "ATEMPORAL": TemporalInfo.ATEMPORAL,
        }

        total_stmts = 0
        assigned_triplets = 0
        assigned_emotions = 0
        assigned_stmt_emb = 0
        assigned_chunk_emb = 0
        assigned_dialog_emb = 0

        for dialog in dialog_data_list:
            dialog_stmts = all_stmt_results.get(dialog.id, {})
            dialog_triplets = all_triplet_results.get(dialog.id, {})

            # Assign dialog embedding
            if embedding_output and embedding_output.dialog_embeddings:
                idx = dialog_data_list.index(dialog)
                if idx < len(embedding_output.dialog_embeddings):
                    dialog.dialog_embedding = embedding_output.dialog_embeddings[idx]
                    assigned_dialog_emb += 1

            for chunk in dialog.chunks:
                # Assign chunk embedding
                if embedding_output and chunk.id in embedding_output.chunk_embeddings:
                    chunk.chunk_embedding = embedding_output.chunk_embeddings[chunk.id]
                    assigned_chunk_emb += 1

                # Build new Statement objects from step outputs
                chunk_stmt_outputs = dialog_stmts.get(chunk.id, [])
                new_statements = []

                for stmt_out in chunk_stmt_outputs:
                    total_stmts += 1

                    # Temporal validity
                    valid_at = stmt_out.valid_at if stmt_out.valid_at != "NULL" else None
                    invalid_at = stmt_out.invalid_at if stmt_out.invalid_at != "NULL" else None

                    # Triplet info
                    triplet_info = None
                    triplet_out = dialog_triplets.get(stmt_out.statement_id)
                    if triplet_out and (triplet_out.entities or triplet_out.triplets):
                        entities = [
                            TripletEntity(
                                entity_idx=e.entity_idx,
                                name=e.name,
                                type=e.type,
                                type_description=getattr(e, "type_description", ""),
                                description=e.description,
                                is_explicit_memory=e.is_explicit_memory,
                            )
                            for e in triplet_out.entities
                        ]
                        triplets = [
                            TripletRelation(
                                subject_name=t.subject_name,
                                subject_id=t.subject_id,
                                predicate=t.predicate,
                                predicate_description=getattr(t, "predicate_description", ""),
                                object_name=t.object_name,
                                object_id=t.object_id,
                            )
                            for t in triplet_out.triplets
                        ]
                        triplet_info = TripletExtractionResponse(
                            entities=entities, triplets=triplets,
                        )
                        assigned_triplets += 1

                    # Emotion info
                    emo = emotion_results.get(stmt_out.statement_id)
                    emotion_kwargs = {}
                    if emo:
                        emotion_kwargs = {
                            "emotion_type": emo.emotion_type,
                            "emotion_intensity": emo.emotion_intensity,
                            "emotion_keywords": emo.emotion_keywords,
                        }
                        assigned_emotions += 1

                    # Statement embedding
                    stmt_embedding = None
                    if (
                        embedding_output
                        and stmt_out.statement_id in embedding_output.statement_embeddings
                    ):
                        stmt_embedding = embedding_output.statement_embeddings[stmt_out.statement_id]
                        assigned_stmt_emb += 1

                    # Build the Statement object that _create_nodes_and_edges expects
                    stmt = Statement(
                        id=stmt_out.statement_id,
                        chunk_id=chunk.id,
                        end_user_id=dialog.end_user_id,
                        statement=stmt_out.statement_text,
                        speaker=stmt_out.speaker,
                        stmt_type=_STMT_TYPE_MAP.get(stmt_out.statement_type, StatementType.FACT),
                        temporal_info=_TEMPORAL_MAP.get(stmt_out.temporal_type, TemporalInfo.ATEMPORAL),
                        # relevence_info=RelevenceInfo.RELEVANT if stmt_out.relevance == "RELEVANT" else RelevenceInfo.IRRELEVANT,
                        temporal_validity=TemporalValidityRange(valid_at=valid_at, invalid_at=invalid_at),
                        has_unsolved_reference=stmt_out.has_unsolved_reference,
                        has_emotional_state=stmt_out.has_emotional_state,
                        triplet_extraction_info=triplet_info,
                        statement_embedding=stmt_embedding,
                        dialog_at=getattr(chunk, "dialog_at", None),
                        **emotion_kwargs,
                    )
                    new_statements.append(stmt)

                # Replace chunk.statements with newly built objects
                chunk.statements = new_statements

        logger.info(
            "Data assignment complete — statements: %d, triplets: %d, "
            "emotions: %d, stmt_emb: %d, chunk_emb: %d, dialog_emb: %d",
            total_stmts,
            assigned_triplets,
            assigned_emotions,
            assigned_stmt_emb,
            assigned_chunk_emb,
            assigned_dialog_emb,
        )
@@ -53,7 +53,7 @@ class DialogueChunker:
        )

        self.chunker_strategy = chunker_strategy
        logger.info(f"Initializing DialogueChunker with strategy: {chunker_strategy}")
        logger.debug(f"Initializing DialogueChunker with strategy: {chunker_strategy}")

        try:
            # Load and validate configuration
@@ -71,7 +71,7 @@ class DialogueChunker:
            else:
                self.chunker_client = ChunkerClient(self.chunker_config)

            logger.info(f"DialogueChunker initialized successfully with strategy: {chunker_strategy}")
            logger.debug(f"DialogueChunker initialized successfully with strategy: {chunker_strategy}")

        except Exception as e:
            logger.error(f"Failed to initialize DialogueChunker: {e}", exc_info=True)
@@ -101,7 +101,7 @@ class DialogueChunker:
            f"Messages: {len(dialogue.context.msgs) if dialogue.context else 0}"
        )

        logger.info(
        logger.debug(
            f"Processing dialogue {dialogue.ref_id} with {len(dialogue.context.msgs)} messages "
            f"using strategy: {self.chunker_strategy}"
        )
@@ -121,7 +121,7 @@ class DialogueChunker:
        )

        logger.info(
            f"Successfully generated {len(chunks)} chunks for dialogue {dialogue.ref_id}. "
            f"Successfully generated {len(chunks)} chunks for dialogue_id: {dialogue.ref_id}. "
            f"Total characters processed: {len(dialogue.content) if dialogue.content else 0}"
        )


@@ -142,7 +142,7 @@ async def generate_title_and_type_for_summary(
            f"已归一化为 '{episodic_type}'"
        )

        logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
        logger.debug(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
        return (title, episodic_type)

    except json.JSONDecodeError:
@@ -197,7 +197,7 @@ async def _process_chunk_summary(
            llm_client=llm_client,
            language=language
        )
        logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
        logger.debug(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
    except Exception as e:
        logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
        # Continue without title and type
@@ -215,7 +215,6 @@ async def _process_chunk_summary(
        apply_id=dialog.end_user_id,
        run_id=dialog.run_id,  # use the dialog's run_id
        created_at=datetime.now(),
        expired_at=datetime(9999, 12, 31),
        dialog_id=dialog.id,
        chunk_ids=[chunk.id],
        content=summary_text,

@@ -1,176 +1,71 @@
"""
Metadata extractor module.
Metadata extractor utilities.

Collects user-related statements from post-dedup graph data and
extracts user metadata via an independent LLM call.
Provides helper functions for identifying user entities from post-dedup
graph data. The actual LLM extraction logic lives in MetadataExtractionStep.
"""

import logging
from typing import List, Optional
from typing import Dict, List

from app.core.memory.models.graph_models import (
    ExtractedEntityNode,
    StatementEntityEdge,
    StatementNode,
)
from app.core.memory.models.graph_models import ExtractedEntityNode

logger = logging.getLogger(__name__)

# Reuse the same user-entity detection logic from dedup module
_USER_NAMES = {"用户", "我", "user", "i"}
_CANONICAL_USER_TYPE = "用户"
# User-entity detection constants
USER_NAMES = {"用户", "我", "user", "i"}
CANONICAL_USER_TYPE = "用户"


def _is_user_entity(ent: ExtractedEntityNode) -> bool:
    """Return True if the entity is the user entity."""
    name = (getattr(ent, "name", "") or "").strip().lower()
    etype = (getattr(ent, "entity_type", "") or "").strip()
    return name in _USER_NAMES or etype == _CANONICAL_USER_TYPE
def is_user_entity(entity: ExtractedEntityNode) -> bool:
    """Return True if the entity is the user entity."""
    name = (getattr(entity, "name", "") or "").strip().lower()
    etype = (getattr(entity, "entity_type", "") or "").strip()
    return name in USER_NAMES or etype == CANONICAL_USER_TYPE


class MetadataExtractor:
    """Extracts user metadata from post-dedup graph data via independent LLM call."""
def collect_user_entities_for_metadata(
    entity_nodes: List[ExtractedEntityNode],
) -> List[Dict]:
    """Filter user entities from the deduplicated entity list and build the metadata-extraction input.

    def __init__(self, llm_client, language: Optional[str] = None):
        self.llm_client = llm_client
        self.language = language
    Splits each user entity's description into a list on semicolons,
    used as the payload of the async Celery metadata extraction task.

    @staticmethod
    def detect_language(statements: List[str]) -> str:
        """Detect language from the statement texts.
        Returns "zh" if the text contains Chinese characters, otherwise "en".
        """
        import re
    Args:
        entity_nodes: Deduplicated entity node list

        combined = " ".join(statements)
        if re.search(r"[\u4e00-\u9fff]", combined):
            return "zh"
        return "en"
    Returns:
        List of user-entity dicts, each containing entity_id, entity_name, descriptions
    """
    user_entities = []
    for entity in entity_nodes:
        if not is_user_entity(entity):
            continue

    def collect_user_related_statements(
        self,
        entity_nodes: List[ExtractedEntityNode],
        statement_nodes: List[StatementNode],
        statement_entity_edges: List[StatementEntityEdge],
    ) -> List[str]:
        """
        Filter statement texts from post-dedup data that are directly related to the user and spoken by the user.
        desc = (getattr(entity, "description", "") or "").strip()
        if not desc:
            continue

        Filtering logic:
        1. User entity → StatementEntityEdge → statement (direct association)
        2. Keep only speaker="user" statements (filter out noise from assistant replies)

        Returns:
            List of statement texts spoken by the user
        """
        # Find user entity IDs
        user_entity_ids = set()
        for ent in entity_nodes:
            if _is_user_entity(ent):
                user_entity_ids.add(ent.id)

        if not user_entity_ids:
            logger.debug("未找到用户实体节点,跳过 statement 收集")
            return []

        # User entity → StatementEntityEdge → statement
        target_stmt_ids = set()
        for edge in statement_entity_edges:
            if edge.target in user_entity_ids:
                target_stmt_ids.add(edge.source)

        # Collect: only speaker="user" statements, preserving order
        result = []
        seen = set()
        total_associated = 0
        skipped_non_user = 0
        for stmt_node in statement_nodes:
            if stmt_node.id in target_stmt_ids and stmt_node.id not in seen:
                total_associated += 1
                speaker = getattr(stmt_node, "speaker", None) or "unknown"
                if speaker == "user":
                    text = (stmt_node.statement or "").strip()
                    if text:
                        result.append(text)
                else:
                    skipped_non_user += 1
                seen.add(stmt_node.id)
        # Split the semicolon-separated description into a list
        descriptions = [
            d.strip() for d in desc.replace(";", ";").split(";")
            if d.strip()
        ]
        if descriptions:
            user_entities.append({
                "entity_id": entity.id,
                "entity_name": entity.name,
                "descriptions": descriptions,
                "aliases": list(entity.aliases or []),
                "end_user_id": entity.end_user_id,
            })

    if user_entities:
        logger.info(
            f"收集到 {len(result)} 条用户发言 statement "
            f"(直接关联: {total_associated}, speaker=user: {len(result)}, "
            f"跳过非user: {skipped_non_user})"
            f"收集到 {len(user_entities)} 个用户实体用于元数据提取"
        )
        if result:
            for i, text in enumerate(result):
                logger.info(f"  [user statement {i + 1}] {text}")
        if total_associated > 0 and len(result) == 0:
            logger.warning(
                f"有 {total_associated} 条直接关联 statement 但全部被 speaker 过滤,"
                f"可能本次写入不包含 user 消息"
            )
        return result
    else:
        logger.debug("未找到用户实体,跳过元数据提取")

    async def extract_metadata(
        self,
        statements: List[str],
        existing_metadata: Optional[dict] = None,
        existing_aliases: Optional[List[str]] = None,
    ) -> Optional[tuple]:
        """
        Call the LLM on the filtered statements to extract metadata deltas and user aliases.

        Args:
            statements: List of statement texts spoken by the user
            existing_metadata: Existing metadata from the database (optional)
            existing_aliases: Existing user aliases from the database (optional)

        Returns:
            (List[MetadataFieldChange], List[str], List[str]) tuple:
            (metadata_changes, aliases_to_add, aliases_to_remove) on success, None on failure
        """
        if not statements:
            return None

        try:
            from app.core.memory.utils.prompt.prompt_utils import prompt_env

            if self.language:
                detected_language = self.language
                logger.info(f"元数据提取使用显式指定语言: {detected_language}")
            else:
                detected_language = self.detect_language(statements)
                logger.info(f"元数据提取语言自动检测结果: {detected_language}")

            template = prompt_env.get_template("extract_user_metadata.jinja2")
            prompt = template.render(
                statements=statements,
                language=detected_language,
                existing_metadata=existing_metadata,
                existing_aliases=existing_aliases,
                json_schema="",
            )

            from app.core.memory.models.metadata_models import (
                MetadataExtractionResponse,
            )

            response = await self.llm_client.response_structured(
                messages=[{"role": "user", "content": prompt}],
                response_model=MetadataExtractionResponse,
            )

            if response:
                changes = response.metadata_changes if response.metadata_changes else []
                to_add = response.aliases_to_add if response.aliases_to_add else []
                to_remove = (
                    response.aliases_to_remove if response.aliases_to_remove else []
                )
                return changes, to_add, to_remove

            logger.warning("LLM 返回的响应为空")
            return None

        except Exception as e:
            logger.error(f"元数据提取 LLM 调用失败: {e}", exc_info=True)
            return None
    return user_entities
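A minimal illustration of the description-splitting rule in collect_user_entities_for_metadata: the full-width Chinese semicolon is normalized to ";" and empty fragments are dropped. The sample description is made up:

desc = "喜欢爬山;住在杭州; ;works remotely"
descriptions = [d.strip() for d in desc.replace(";", ";").split(";") if d.strip()]
print(descriptions)  # ['喜欢爬山', '住在杭州', 'works remotely']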
@@ -51,7 +51,7 @@ class OntologyExtractor:
        self.validator = OntologyValidator()
        self.owl_validator = OWLValidator()

        logger.info("OntologyExtractor initialized")
        logger.debug("OntologyExtractor initialized")

    async def extract_ontology_classes(
        self,

@@ -12,16 +12,22 @@ from app.core.memory.utils.data.ontology import (
    TemporalInfo,
)
from app.core.memory.utils.prompt.prompt_utils import render_statement_extraction_prompt
from pydantic import BaseModel, Field, field_validator
from pydantic import AliasChoices, BaseModel, Field, field_validator

logger = logging.getLogger(__name__)

class ExtractedStatement(BaseModel):
    """Schema for extracted statement from LLM"""
    statement: str = Field(..., description="The extracted statement text")
    statement: str = Field(
        ...,
        validation_alias=AliasChoices("statement", "statement_text"),
        description="The extracted statement text",
    )
    statement_type: str = Field(..., description="FACT, OPINION, SUGGESTION or PREDICTION")
    temporal_type: str = Field(..., description="STATIC, DYNAMIC, ATEMPORAL")
    relevence: str = Field(..., description="RELEVANT or IRRELEVANT")
    # New prompt no longer outputs relevence; keep backward-compatible default.
    relevence: str = Field("RELEVANT", description="RELEVANT or IRRELEVANT")
    has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references")

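A quick standalone check (pydantic v2) that AliasChoices makes both payload keys populate the same field; this is a toy model, not the repo's:

from pydantic import AliasChoices, BaseModel, Field

class Stmt(BaseModel):
    statement: str = Field(
        ...,
        validation_alias=AliasChoices("statement", "statement_text"),
    )

print(Stmt.model_validate({"statement": "likes hiking"}).statement)
print(Stmt.model_validate({"statement_text": "likes hiking"}).statement)
# both print: likes hiking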
class StatementExtractionResponse(BaseModel):
    statements: List[ExtractedStatement] = Field(default_factory=list, description="List of extracted statements")
@@ -40,7 +46,7 @@ class StatementExtractionResponse(BaseModel):
        valid_statements = []
        filtered_count = 0
        for i, stmt in enumerate(v):
            if isinstance(stmt, dict) and stmt.get('statement'):
            if isinstance(stmt, dict) and (stmt.get("statement") or stmt.get("statement_text")):
                valid_statements.append(stmt)
            elif isinstance(stmt, dict):
                # Log which statement was filtered
@@ -95,6 +101,11 @@ class StatementExtractor:
        """
        chunk_content = chunk.content
        chunk_speaker = self._get_speaker_from_chunk(chunk)
        logger.info(
            "[LegacyStatementExtractor] chunk_id=%s content_len=%d",
            getattr(chunk, "id", ""),
            len(chunk_content or ""),
        )

        if not chunk_content or len(chunk_content.strip()) < 5:
            logger.warning(f"Chunk {chunk.id} content too short or empty, skipping")
@@ -107,7 +118,18 @@ class StatementExtractor:
            granularity=self.config.statement_granularity,
            include_dialogue_context=self.config.include_dialogue_context,
            dialogue_content=dialogue_content,
            max_dialogue_chars=self.config.max_dialogue_context_chars
            max_dialogue_chars=self.config.max_dialogue_context_chars,
            input_json={
                "chunk_id": getattr(chunk, "id", ""),
                "end_user_id": end_user_id or "",
                "target_content": chunk_content,
                "target_message_date": datetime.now().isoformat(),
                "supporting_context": {
                    "msgs": [
                        {"role": "context", "msg": dialogue_content}
                    ] if dialogue_content else []
                },
            },
        )

        # Simple system message
@@ -159,6 +181,8 @@ class StatementExtractor:
            chunk_id=chunk.id,
            end_user_id=end_user_id,
            speaker=chunk_speaker,
            dialog_at=getattr(chunk, "dialog_at", None),
            has_unsolved_reference=getattr(extracted_stmt, "has_unsolved_reference", False),
        )

        chunk_statements.append(chunk_statement)

@@ -4,7 +4,7 @@ from typing import List, Dict, Optional
from app.core.logging_config import get_memory_logger
from app.core.memory.llm_tools.openai_client import OpenAIClient
from app.core.memory.utils.prompt.prompt_utils import render_triplet_extraction_prompt
from app.core.memory.utils.data.ontology import PREDICATE_DEFINITIONS, Predicate  # import the Predicate enum for whitelist filtering
from app.core.memory.utils.data.ontology import PREDICATE_DEFINITIONS
from app.core.memory.models.triplet_models import TripletExtractionResponse
from app.core.memory.models.message_models import DialogData, Statement
from app.core.memory.models.ontology_extraction_models import OntologyTypeList
@@ -73,15 +73,9 @@ class TripletExtractor:
        try:
            # Get structured response from LLM
            response = await self.llm_client.response_structured(messages, TripletExtractionResponse)
            # Filter triplets to only allowed predicates from ontology
            # This drops predicates missing from the Predicate enum, but it tends to be too strict:
            # some statements use predicates outside the enum and get misjudged as weak relations.
            allowed_predicates = {p.value for p in Predicate}
            filtered_triplets = [t for t in response.triplets if getattr(t, "predicate", "") in allowed_predicates]
            # Keep only triplets whose predicate ∈ Predicate; drop all others

            # Create new triplets with statement_id set during creation
            updated_triplets = []
            for triplet in filtered_triplets:  # keep only triplets with predicate ∈ Predicate
            for triplet in response.triplets:
                updated_triplet = triplet.model_copy(update={"statement_id": statement.id})
                updated_triplets.append(updated_triplet)


@@ -0,0 +1,97 @@
"""SidecarStepFactory — decorator-based registry for sidecar (non-critical) steps.

New sidecar modules self-register via ``@SidecarStepFactory.register`` and are
automatically discovered and instantiated by the orchestrator without any
changes to orchestrator code.
"""

import logging
from enum import Enum
from typing import Any, Dict, List, Tuple, Type

from .steps.base import ExtractionStep, StepContext

logger = logging.getLogger(__name__)


class SidecarTiming(str, Enum):
    """Declares when a sidecar step runs relative to the main pipeline."""

    AFTER_STATEMENT = "after_statement"
    AFTER_TRIPLET = "after_triplet"


class SidecarStepFactory:
    """Factory that manages sidecar step registration and creation.

    Registry maps ``config_key`` → ``(step_class, timing)``.
    Adding a new sidecar only requires the ``@register`` decorator on the
    step class — no orchestrator modifications needed.
    """

    _registry: Dict[str, Tuple[Type[ExtractionStep], SidecarTiming]] = {}

    @classmethod
    def register(cls, config_key: str, timing: SidecarTiming):
        """Class decorator that registers a sidecar step.

        Args:
            config_key: Configuration flag name (e.g. ``"emotion_enabled"``).
                The step is instantiated only when this flag is ``True``.
            timing: When the sidecar runs relative to the main pipeline.

        Returns:
            The original class, unmodified.
        """

        def decorator(step_class: Type[ExtractionStep]):
            cls._registry[config_key] = (step_class, timing)
            logger.debug(
                "Registered sidecar '%s' (config_key=%s, timing=%s)",
                step_class.__name__,
                config_key,
                timing.value,
            )
            return step_class

        return decorator

    @classmethod
    def create_sidecars(
        cls, config: Any, context: StepContext
    ) -> Dict[SidecarTiming, List[ExtractionStep]]:
        """Instantiate enabled sidecar steps, grouped by timing.

        Args:
            config: Pipeline configuration object. Each registered
                ``config_key`` is looked up via ``getattr(config, key, False)``.
            context: Shared :class:`StepContext` injected into every step.

        Returns:
            A dict keyed by :class:`SidecarTiming`, each value a list of
            instantiated sidecar steps whose config flag is ``True``.
        """
        result: Dict[SidecarTiming, List[ExtractionStep]] = {
            timing: [] for timing in SidecarTiming
        }
        for config_key, (step_class, timing) in cls._registry.items():
            if getattr(config, config_key, False):
                step = step_class(context)
                result[timing].append(step)
                logger.debug(
                    "Created sidecar '%s' (timing=%s)",
                    step_class.__name__,
                    timing.value,
                )
            else:
                logger.debug(
                    "Skipped sidecar '%s' (config_key=%s is disabled)",
                    step_class.__name__,
                    config_key,
                )
        return result

    @classmethod
    def clear_registry(cls) -> None:
        """Remove all registered sidecars. Useful for testing."""
        cls._registry.clear()
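A sketch of how a caller might consume the factory; the module paths are inferred from the imports above and the config object is a stand-in with a single flag set:

from types import SimpleNamespace

from app.core.memory.storage_services.extraction_engine.sidecar_factory import (
    SidecarStepFactory,
    SidecarTiming,
)
from app.core.memory.storage_services.extraction_engine.steps.base import StepContext

config = SimpleNamespace(emotion_enabled=True)
context = StepContext(llm_client=None, language="zh", config=config)

sidecars = SidecarStepFactory.create_sidecars(config, context)
after_statement = sidecars[SidecarTiming.AFTER_STATEMENT]  # only enabled steps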
@@ -0,0 +1,16 @@
"""Extraction pipeline steps — unified ExtractionStep paradigm.

Importing this package triggers @register decorator self-registration
for all sidecar (non-critical) steps via SidecarStepFactory.
"""

from ..sidecar_factory import SidecarStepFactory, SidecarTiming  # noqa: F401

# Step implementations — importing triggers @register self-registration.
from .statement_temporal_step import StatementTemporalExtractionStep  # noqa: F401
from .triplet_step import TripletExtractionStep  # noqa: F401
from .emotion_step import EmotionExtractionStep  # noqa: F401
from .embedding_step import EmbeddingStep  # noqa: F401

# Refactored orchestrator
from app.core.memory.storage_services.extraction_engine.extraction_pipeline_orchestrator import NewExtractionOrchestrator  # noqa: F401
@@ -0,0 +1,182 @@
"""ExtractionStep abstract base class and StepContext.

Provides the unified paradigm for all LLM extraction stages:
    render_prompt → call_llm → parse_response → post_process

Critical steps retry on failure with exponential backoff.
Sidecar (non-critical) steps return a default output on failure without retry.
"""

import asyncio
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Generic, Optional, TypeVar

logger = logging.getLogger(__name__)

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


@dataclass
class StepContext:
    """Shared context injected into every ExtractionStep by the orchestrator.

    Attributes:
        llm_client: LLM client instance for generating completions.
        language: Target language code (e.g. "en", "zh").
        config: Pipeline configuration object (ExtractionPipelineConfig).
        is_pilot_run: When True, run in lightweight preview mode.
        progress_callback: Optional callable for reporting progress.
    """

    llm_client: Any
    language: str
    config: Any
    is_pilot_run: bool = False
    progress_callback: Optional[Any] = None


class ExtractionStep(ABC, Generic[InputT, OutputT]):
    """Abstract base class for all LLM extraction stages.

    Lifecycle:
        1. ``__init__(context)`` — receive shared context, bind config params
        2. ``should_skip()`` — check whether to skip (config-driven / pilot mode)
        3. ``run(input_data)`` — execute full flow (with retry for critical steps)
           Internally: render_prompt → call_llm → parse_response → post_process
        4. ``on_failure(error)`` — critical steps raise; sidecar steps return default

    Type Parameters:
        InputT: The Pydantic model type accepted by this step.
        OutputT: The Pydantic model type produced by this step.
    """

    def __init__(self, context: StepContext) -> None:
        self.context = context
        self.llm_client = context.llm_client
        self.language = context.language
        self.config = context.config

    # ── Subclasses must implement ──

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable step name for logging."""
        ...

    @abstractmethod
    async def render_prompt(self, input_data: InputT) -> Any:
        """Build the prompt from *input_data* and bound config."""
        ...

    @abstractmethod
    async def call_llm(self, prompt: Any) -> Any:
        """Send *prompt* to the LLM and return the raw response."""
        ...

    @abstractmethod
    async def parse_response(self, raw_response: Any, input_data: InputT) -> OutputT:
        """Parse *raw_response* into a typed OutputT (Pydantic model)."""
        ...

    @abstractmethod
    def get_default_output(self) -> OutputT:
        """Return a safe default when the step is skipped or fails gracefully."""
        ...

    # ── Overridable properties ──

    @property
    def is_critical(self) -> bool:
        """``True`` = critical step (failure aborts pipeline).

        ``False`` = sidecar step (failure degrades gracefully).
        """
        return True

    @property
    def max_retries(self) -> int:
        """Maximum retry attempts (only effective for critical steps)."""
        return 2

    @property
    def retry_backoff_base(self) -> float:
        """Backoff base in seconds. Actual wait = base × 2^attempt."""
        return 1.0

    # ── Overridable hooks ──

    def should_skip(self) -> bool:
        """Config-driven skip check. Subclasses may override."""
        return False

    async def post_process(self, parsed_data: OutputT, input_data: InputT) -> OutputT:
        """Post-processing hook. Default is identity (returns *parsed_data* unchanged)."""
        return parsed_data

    # ── Core execution logic ──

    async def run(self, input_data: InputT) -> OutputT:
        """Execute the full step lifecycle with retry logic.

        For critical steps (``is_critical=True``):
            Attempt up to ``max_retries + 1`` times with exponential backoff.
            If all attempts fail, delegate to ``on_failure`` which raises.

        For sidecar steps (``is_critical=False``):
            Attempt exactly once. On failure, delegate to ``on_failure``
            which returns ``get_default_output()``.
        """
        if self.should_skip():
            logger.info("Step '%s' skipped", self.name)
            return self.get_default_output()

        last_error: Optional[Exception] = None
        attempts = self.max_retries + 1 if self.is_critical else 1

        for attempt in range(attempts):
            try:
                prompt = await self.render_prompt(input_data)
                raw_response = await self.call_llm(prompt)
                parsed = await self.parse_response(raw_response, input_data)
                result = await self.post_process(parsed, input_data)
                return result
            except Exception as exc:
                last_error = exc
                logger.warning(
                    "Step '%s' attempt %d/%d failed: %s",
                    self.name,
                    attempt + 1,
                    attempts,
                    exc,
                )
                if attempt < attempts - 1:
                    wait = self.retry_backoff_base * (2 ** attempt)
                    logger.info(
                        "Step '%s' retrying in %.1fs …", self.name, wait
                    )
                    await asyncio.sleep(wait)

        # All attempts exhausted — delegate to failure handler
        return self.on_failure(last_error)  # type: ignore[arg-type]

    def on_failure(self, error: Exception) -> OutputT:
        """Handle step failure.

        Critical steps: re-raise the exception to abort the pipeline.
        Sidecar steps: return ``get_default_output()`` for graceful degradation.
        """
        if self.is_critical:
            logger.error(
                "Critical step '%s' failed after retries: %s", self.name, error
            )
            raise error
        logger.warning(
            "Sidecar step '%s' failed, returning default output: %s",
            self.name,
            error,
        )
        return self.get_default_output()
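Putting the two new files together, a minimal sidecar subclass under this paradigm could look like the sketch below; the step itself, its config flag "keyword_enabled", and the llm_client.response_text call are illustrative assumptions, and the factory/base imports from above are assumed in scope:

@SidecarStepFactory.register("keyword_enabled", SidecarTiming.AFTER_STATEMENT)
class KeywordExtractionStep(ExtractionStep[str, dict]):
    @property
    def name(self) -> str:
        return "keyword_extraction"

    @property
    def is_critical(self) -> bool:
        return False  # sidecar: one attempt, fail soft

    async def render_prompt(self, input_data: str):
        return f"Extract up to 5 keywords from: {input_data}"

    async def call_llm(self, prompt):
        return await self.llm_client.response_text(prompt)  # assumed client method

    async def parse_response(self, raw_response, input_data: str) -> dict:
        return {"keywords": [k.strip() for k in str(raw_response).split(",") if k.strip()]}

    def get_default_output(self) -> dict:
        return {"keywords": []}

Because is_critical is False, a failing call_llm never raises out of run(); on_failure logs and returns {"keywords": []}.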
@@ -0,0 +1,506 @@
"""Independent deduplication module for the extraction pipeline.

Extracts dedup logic from ExtractionOrchestrator into standalone functions
so the orchestrator stays thin and dedup can be tested/evolved independently.

The module exposes:
- ``DedupResult`` — structured output of the dedup process
- ``run_dedup()`` — async entry point called by WritePipeline
- Helper functions migrated from ExtractionOrchestrator:
  ``save_dedup_details``, ``analyze_entity_merges``,
  ``analyze_entity_disambiguation``, ``send_dedup_progress_callback``,
  ``parse_dedup_report``
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple

from app.core.memory.models.graph_models import (
    EntityEntityEdge,
    ExtractedEntityNode,
    StatementEntityEdge,
)
from app.core.memory.models.message_models import DialogData
from app.core.memory.models.variate_config import ExtractionPipelineConfig
from app.repositories.neo4j.neo4j_connector import Neo4jConnector

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# DedupResult dataclass (Requirement 10.2)
# ---------------------------------------------------------------------------

@dataclass
class DedupResult:
    """Structured output of the two-stage entity deduplication process.

    Attributes:
        entity_nodes: Deduplicated entity node list.
        statement_entity_edges: Deduplicated statement-entity edges.
        entity_entity_edges: Deduplicated entity-entity edges.
        dedup_details: Raw detail dict returned by the first-layer dedup.
        merge_records: Parsed merge records (exact / fuzzy / LLM).
        disamb_records: Parsed disambiguation records.
    """

    entity_nodes: List[ExtractedEntityNode]
    statement_entity_edges: List[StatementEntityEdge]
    entity_entity_edges: List[EntityEntityEdge]
    dedup_details: Dict[str, Any] = field(default_factory=dict)
    merge_records: List[Dict[str, Any]] = field(default_factory=list)
    disamb_records: List[Dict[str, Any]] = field(default_factory=list)

    @property
    def stats(self) -> Dict[str, int]:
        """Summary statistics for the dedup run."""
        return {
            "entity_count": len(self.entity_nodes),
            "merge_count": len(self.merge_records),
            "disamb_count": len(self.disamb_records),
        }

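A quick illustration of the derived stats property; the field values here are dummies:

result = DedupResult(
    entity_nodes=[],
    statement_entity_edges=[],
    entity_entity_edges=[],
    merge_records=[{"type": "精确匹配"}],
)
print(result.stats)  # {'entity_count': 0, 'merge_count': 1, 'disamb_count': 0}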
# ---------------------------------------------------------------------------
# Migrated helpers (from ExtractionOrchestrator) — Requirement 10.4
# ---------------------------------------------------------------------------


def save_dedup_details(
    dedup_details: Dict[str, Any],
    original_entities: List[ExtractedEntityNode],
    final_entities: List[ExtractedEntityNode],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Dict[str, str]]:
    """Parse raw *dedup_details* into structured merge / disamb records.

    Returns:
        (merge_records, disamb_records, id_redirect_map)
    """
    merge_records: List[Dict[str, Any]] = []
    disamb_records: List[Dict[str, Any]] = []
    id_redirect_map: Dict[str, str] = {}

    try:
        id_redirect_map = dedup_details.get("id_redirect", {})

        # --- exact-match merges ---
        exact_merge_map = dedup_details.get("exact_merge_map", {})
        for _key, info in exact_merge_map.items():
            merged_ids = info.get("merged_ids", set())
            if merged_ids:
                merge_records.append({
                    "type": "精确匹配",
                    "canonical_id": info.get("canonical_id"),
                    "entity_name": info.get("name"),
                    "entity_type": info.get("entity_type"),
                    "merged_count": len(merged_ids),
                    "merged_ids": list(merged_ids),
                })

        # --- fuzzy-match merges ---
        for record in dedup_details.get("fuzzy_merge_records", []):
            try:
                match = re.search(
                    r"规范实体 (\S+) \(([^|]+)\|([^|]+)\|([^)]+)\) <- 合并实体 (\S+)",
                    record,
                )
                if match:
                    merge_records.append({
                        "type": "模糊匹配",
                        "canonical_id": match.group(1),
                        "entity_name": match.group(3),
                        "entity_type": match.group(4),
                        "merged_count": 1,
                        "merged_ids": [match.group(5)],
                    })
            except Exception as e:
                logger.debug("解析模糊匹配记录失败: %s, 错误: %s", record, e)

        # --- LLM-based merges ---
        for record in dedup_details.get("llm_decision_records", []):
            if "[LLM去重]" in str(record):
                try:
                    match = re.search(
                        r"同名类型相似 ([^(]+)(([^)]+))\|([^(]+)(([^)]+))",
                        record,
                    )
                    if match:
                        merge_records.append({
                            "type": "LLM去重",
                            "entity_name": match.group(1),
                            "entity_type": f"{match.group(2)}|{match.group(4)}",
                            "merged_count": 1,
                            "merged_ids": [],
                        })
                except Exception as e:
                    logger.debug("解析LLM去重记录失败: %s, 错误: %s", record, e)

        # --- disambiguation records ---
        for record in dedup_details.get("disamb_records", []):
            if "[DISAMB阻断]" in str(record):
                try:
                    content = str(record).replace("[DISAMB阻断]", "").strip()
                    match = re.search(
                        r"([^(]+)(([^)]+))\|([^(]+)(([^)]+))", content
                    )
                    if match:
                        entity1_name = match.group(1).strip()
                        entity1_type = match.group(2)
                        entity2_type = match.group(4)

                        conf_match = re.search(r"conf=([0-9.]+)", str(record))
                        confidence = conf_match.group(1) if conf_match else "unknown"

                        reason_match = re.search(r"reason=([^|]+)", str(record))
                        reason = reason_match.group(1).strip() if reason_match else ""

                        disamb_records.append({
                            "entity_name": entity1_name,
                            "disamb_type": f"消歧阻断:{entity1_type} vs {entity2_type}",
                            "confidence": confidence,
                            "reason": (reason[:100] + "...") if len(reason) > 100 else reason,
                        })
                except Exception as e:
                    logger.debug("解析消歧记录失败: %s, 错误: %s", record, e)

        logger.info(
            "保存去重消歧记录:%d 个合并记录,%d 个消歧记录",
            len(merge_records),
            len(disamb_records),
        )
    except Exception as e:
        logger.error("保存去重消歧详情失败: %s", e, exc_info=True)

    return merge_records, disamb_records, id_redirect_map

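To make the fuzzy-merge regex above concrete, here is a fabricated log record in the format the parser expects, and the groups it yields:

import re

record = "规范实体 ent-001 (ID|张三|人物) <- 合并实体 ent-002"
m = re.search(r"规范实体 (\S+) \(([^|]+)\|([^|]+)\|([^)]+)\) <- 合并实体 (\S+)", record)
print(m.group(1), m.group(3), m.group(4), m.group(5))
# ent-001 张三 人物 ent-002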
def analyze_entity_merges(
    merge_records: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Return merge info sorted by merged_count (descending)."""
    if not merge_records:
        return []
    sorted_records = sorted(
        merge_records, key=lambda x: x.get("merged_count", 0), reverse=True
    )
    return [
        {
            "main_entity_name": r.get("entity_name", "未知实体"),
            "merged_count": r.get("merged_count", 1),
        }
        for r in sorted_records
    ]


def analyze_entity_disambiguation(
    disamb_records: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Return disambiguation records (pass-through)."""
    return disamb_records if disamb_records else []


def parse_dedup_report(
    merge_records: List[Dict[str, Any]],
    disamb_records: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build a summary report dict from parsed records."""
    try:
        dedup_examples: List[Dict[str, Any]] = []
        disamb_examples: List[Dict[str, Any]] = []
        total_merges = 0
        total_disambiguations = 0

        for record in merge_records:
            merge_count = record.get("merged_count", 0)
            total_merges += merge_count
            dedup_examples.append({
                "type": record.get("type", "未知"),
                "entity_name": record.get("entity_name", "未知实体"),
                "entity_type": record.get("entity_type", "未知类型"),
                "merge_count": merge_count,
                "description": f"{record.get('entity_name', '未知实体')}实体去重合并{merge_count}个",
            })

        for record in disamb_records:
            total_disambiguations += 1
            disamb_type = record.get("disamb_type", "")
            entity_name = record.get("entity_name", "未知实体")
            disamb_examples.append({
                "entity1_name": entity_name,
                "entity1_type": (
                    disamb_type.split("vs")[0].replace("消歧阻断:", "").strip()
                    if "vs" in disamb_type
                    else "未知"
                ),
                "entity2_name": entity_name,
                "entity2_type": (
                    disamb_type.split("vs")[1].strip() if "vs" in disamb_type else "未知"
                ),
                "description": f"{entity_name},消歧区分成功",
            })

        return {
            "dedup_examples": dedup_examples[:5],
            "disamb_examples": disamb_examples[:5],
            "total_merges": total_merges,
            "total_disambiguations": total_disambiguations,
        }
    except Exception as e:
        logger.error("获取去重报告失败: %s", e, exc_info=True)
        return {
            "dedup_examples": [],
            "disamb_examples": [],
            "total_merges": 0,
            "total_disambiguations": 0,
        }


async def send_dedup_progress_callback(
    progress_callback: Callable,
    merge_records: List[Dict[str, Any]],
    disamb_records: List[Dict[str, Any]],
    original_entities: int,
    final_entities: int,
    original_stmt_edges: int,
    final_stmt_edges: int,
    original_ent_edges: int,
    final_ent_edges: int,
) -> None:
    """Send dedup completion progress via *progress_callback*."""
    try:
        dedup_details = parse_dedup_report(merge_records, disamb_records)

        entities_reduced = original_entities - final_entities
        stmt_edges_reduced = original_stmt_edges - final_stmt_edges
        ent_edges_reduced = original_ent_edges - final_ent_edges

        dedup_stats = {
            "entities": {
                "original_count": original_entities,
                "final_count": final_entities,
                "reduced_count": entities_reduced,
                "reduction_rate": (
                    round(entities_reduced / original_entities * 100, 1)
                    if original_entities > 0
                    else 0
                ),
            },
            "statement_entity_edges": {
                "original_count": original_stmt_edges,
                "final_count": final_stmt_edges,
                "reduced_count": stmt_edges_reduced,
            },
            "entity_entity_edges": {
                "original_count": original_ent_edges,
                "final_count": final_ent_edges,
                "reduced_count": ent_edges_reduced,
            },
            "dedup_examples": dedup_details.get("dedup_examples", []),
            "disamb_examples": dedup_details.get("disamb_examples", []),
            "summary": {
                "total_merges": dedup_details.get("total_merges", 0),
                "total_disambiguations": dedup_details.get("total_disambiguations", 0),
            },
        }

        await progress_callback("dedup_disambiguation_complete", "去重消歧完成", dedup_stats)
    except Exception as e:
        logger.error("发送去重消歧进度回调失败: %s", e, exc_info=True)
        try:
            basic_stats = {
                "entities": {
                    "original_count": original_entities,
                    "final_count": final_entities,
                    "reduced_count": original_entities - final_entities,
                },
                "summary": f"实体去重合并{original_entities - final_entities}个",
            }
            await progress_callback("dedup_disambiguation_complete", "去重消歧完成", basic_stats)
        except Exception as e2:
            logger.error("发送基本去重统计失败: %s", e2, exc_info=True)


# ---------------------------------------------------------------------------
# run_dedup — main entry point (Requirements 10.1, 10.3)
# ---------------------------------------------------------------------------


async def run_dedup(
    entity_nodes: List[ExtractedEntityNode],
    statement_entity_edges: List[StatementEntityEdge],
    entity_entity_edges: List[EntityEntityEdge],
    dialog_data_list: List[DialogData],
    pipeline_config: ExtractionPipelineConfig,
    connector: Optional[Neo4jConnector] = None,
    llm_client: Optional[Any] = None,
    is_pilot_run: bool = False,
    progress_callback: Optional[Callable] = None,
) -> DedupResult:
    """Two-stage entity deduplication and disambiguation.

    Full mode:
        Layer 1 — exact / fuzzy / LLM matching
        Layer 2 — Neo4j joint dedup + cross-role alias cleaning

    Pilot-run mode:
        Layer 1 only (skip Neo4j layer 2 and alias cleaning).

    Args:
        entity_nodes: Pre-dedup entity nodes.
        statement_entity_edges: Pre-dedup statement-entity edges.
        entity_entity_edges: Pre-dedup entity-entity edges.
        dialog_data_list: Source dialogue data (used to detect end_user_id).
        pipeline_config: Pipeline configuration (contains DedupConfig).
        connector: Optional Neo4j connector for layer-2 dedup.
        llm_client: Optional LLM client for LLM-based dedup decisions.
        is_pilot_run: When True, only execute layer-1 dedup.
        progress_callback: Optional async callable for progress reporting.

    Returns:
        A ``DedupResult`` with deduplicated nodes, edges, and statistics.
    """
    logger.info("开始两阶段实体去重和消歧")

    if progress_callback:
        await progress_callback("deduplication", "正在去重消歧...")

    logger.info(
        "去重前: %d 个实体节点, %d 条陈述句-实体边, %d 条实体-实体边",
        len(entity_nodes),
        len(statement_entity_edges),
        len(entity_entity_edges),
    )

    original_entity_count = len(entity_nodes)
    original_stmt_edge_count = len(statement_entity_edges)
    original_ent_edge_count = len(entity_entity_edges)

    try:
        if is_pilot_run:
            # --- pilot run: layer 1 only ---
            logger.info("试运行模式:仅执行第一层去重,跳过第二层数据库去重")
            from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import (
                deduplicate_entities_and_edges,
            )

            (
                dedup_entity_nodes,
                dedup_stmt_edges,
                dedup_ent_edges,
                raw_details,
            ) = await deduplicate_entities_and_edges(
                entity_nodes,
                statement_entity_edges,
                entity_entity_edges,
                report_stage="第一层去重消歧(试运行)",
                report_append=False,
                dedup_config=pipeline_config.deduplication,
                llm_client=llm_client,
            )

            final_entities = dedup_entity_nodes
            final_stmt_edges = dedup_stmt_edges
            final_ent_edges = dedup_ent_edges
        else:
            # --- full mode: two-stage dedup ---
            from app.core.memory.storage_services.extraction_engine.deduplication.two_stage_dedup import (
                dedup_layers_and_merge_and_return,
            )

            (
                _dialogue_nodes,
                _chunk_nodes,
                _statement_nodes,
                final_entities,
                _statement_chunk_edges,
                final_stmt_edges,
                final_ent_edges,
                raw_details,
            ) = await dedup_layers_and_merge_and_return(
                dialogue_nodes=[],
                chunk_nodes=[],
                statement_nodes=[],
                entity_nodes=entity_nodes,
                statement_chunk_edges=[],
                statement_entity_edges=statement_entity_edges,
                entity_entity_edges=entity_entity_edges,
                dialog_data_list=dialog_data_list,
                pipeline_config=pipeline_config,
                connector=connector,
                llm_client=llm_client,
            )

        # Parse raw details into structured records
|
||||
merge_records, disamb_records, _id_redirect = save_dedup_details(
|
||||
raw_details, entity_nodes, final_entities
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"去重后: %d 个实体节点, %d 条陈述句-实体边, %d 条实体-实体边",
|
||||
len(final_entities),
|
||||
len(final_stmt_edges),
|
||||
len(final_ent_edges),
|
||||
)
|
||||
logger.info(
|
||||
"去重效果: 实体减少 %d, 陈述句-实体边减少 %d, 实体-实体边减少 %d",
|
||||
original_entity_count - len(final_entities),
|
||||
original_stmt_edge_count - len(final_stmt_edges),
|
||||
original_ent_edge_count - len(final_ent_edges),
|
||||
)
|
||||
|
||||
# --- progress callbacks ---
|
||||
if progress_callback:
|
||||
merge_info = analyze_entity_merges(merge_records)
|
||||
for i, detail in enumerate(merge_info[:5]):
|
||||
dedup_result = {
|
||||
"result_type": "entity_merge",
|
||||
"merged_entity_name": detail["main_entity_name"],
|
||||
"merged_count": detail["merged_count"],
|
||||
"merge_progress": f"{i + 1}/{min(len(merge_info), 5)}",
|
||||
"message": (
|
||||
f"{detail['main_entity_name']}合并{detail['merged_count']}个:相似实体已合并"
|
||||
),
|
||||
}
|
||||
await progress_callback("dedup_disambiguation_result", "实体去重中", dedup_result)
|
||||
|
||||
disamb_info = analyze_entity_disambiguation(disamb_records)
|
||||
for i, detail in enumerate(disamb_info[:5]):
|
||||
disamb_result = {
|
||||
"result_type": "entity_disambiguation",
|
||||
"disambiguated_entity_name": detail["entity_name"],
|
||||
"disambiguation_type": detail["disamb_type"],
|
||||
"confidence": detail.get("confidence", "unknown"),
|
||||
"reason": detail.get("reason", ""),
|
||||
"disamb_progress": f"{i + 1}/{min(len(disamb_info), 5)}",
|
||||
"message": f"{detail['entity_name']}消歧完成:{detail['disamb_type']}",
|
||||
}
|
||||
await progress_callback("dedup_disambiguation_result", "实体消歧中", disamb_result)
|
||||
|
||||
await send_dedup_progress_callback(
|
||||
progress_callback,
|
||||
merge_records,
|
||||
disamb_records,
|
||||
original_entity_count,
|
||||
len(final_entities),
|
||||
original_stmt_edge_count,
|
||||
len(final_stmt_edges),
|
||||
original_ent_edge_count,
|
||||
len(final_ent_edges),
|
||||
)
|
||||
|
||||
return DedupResult(
|
||||
entity_nodes=final_entities,
|
||||
statement_entity_edges=final_stmt_edges,
|
||||
entity_entity_edges=final_ent_edges,
|
||||
dedup_details=raw_details,
|
||||
merge_records=merge_records,
|
||||
disamb_records=disamb_records,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("两阶段去重失败: %s", e, exc_info=True)
|
||||
raise
|
||||
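
# Illustrative sketch (hypothetical caller): run_dedup awaits progress_callback
# with either two or three positional arguments — see the bare "deduplication"
# call above versus the *_result calls — so a compatible callback must default
# its payload parameter. Logging stands in for a real transport here.
import json
from typing import Any, Dict, Optional

async def log_progress(stage: str, message: str, payload: Optional[Dict[str, Any]] = None) -> None:
    # A real caller might stream this to a websocket instead of stdout.
    print(json.dumps({"stage": stage, "message": message, "data": payload or {}}, ensure_ascii=False))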
@@ -0,0 +1,124 @@
"""EmbeddingStep — generates vector embeddings for statements, chunks, dialogs, and entities.

Unlike the LLM-based ExtractionSteps, EmbeddingStep calls an embedder client
rather than an LLM. It still follows the ``should_skip`` / ``run`` /
``get_default_output`` contract so the orchestrator can treat it uniformly.

Supports **partial** embedding runs — the caller can populate only the fields
it needs (e.g. only ``statement_texts``) and leave the rest empty.
"""

import asyncio
import logging
from typing import Any, Dict, List

from .schema import EmbeddingStepInput, EmbeddingStepOutput

logger = logging.getLogger(__name__)


class EmbeddingStep:
    """Generate vector embeddings for text inputs.

    This step does **not** inherit from ``ExtractionStep`` because it does not
    follow the render_prompt → call_llm → parse_response lifecycle. It does,
    however, expose the same ``run`` / ``should_skip`` / ``get_default_output``
    interface so the orchestrator can use it interchangeably.

    Pilot-run mode skips execution entirely and returns empty dicts.
    """

    def __init__(
        self,
        embedder_client: Any,
        is_pilot_run: bool = False,
        batch_size: int = 100,
    ) -> None:
        self.embedder_client = embedder_client
        self.is_pilot_run = is_pilot_run
        self.batch_size = batch_size

    @property
    def name(self) -> str:
        return "embedding_generation"

    @property
    def is_critical(self) -> bool:
        return False

    @property
    def max_retries(self) -> int:
        return 1

    @property
    def retry_backoff_base(self) -> float:
        return 1.0

    def should_skip(self) -> bool:
        return self.is_pilot_run

    def get_default_output(self) -> EmbeddingStepOutput:
        return EmbeddingStepOutput()

    # ── Core execution ──

    async def run(self, input_data: EmbeddingStepInput) -> EmbeddingStepOutput:
        """Generate embeddings for all non-empty text fields in *input_data*."""
        if self.should_skip():
            logger.info("EmbeddingStep skipped (pilot run)")
            return self.get_default_output()

        try:
            stmt_emb, chunk_emb, dialog_emb, entity_emb = await asyncio.gather(
                self._embed_dict(input_data.statement_texts),
                self._embed_dict(input_data.chunk_texts),
                self._embed_list(input_data.dialog_texts),
                self._embed_dict(input_data.entity_names),
            )
            return EmbeddingStepOutput(
                statement_embeddings=stmt_emb,
                chunk_embeddings=chunk_emb,
                dialog_embeddings=dialog_emb,
                entity_embeddings=entity_emb,
            )
        except Exception as exc:
            logger.warning("EmbeddingStep failed, returning empty output: %s", exc)
            return self.get_default_output()

    # ── Internal helpers ──

    async def _embed_dict(
        self, texts: Dict[str, str]
    ) -> Dict[str, List[float]]:
        """Embed a dict of ``{id: text}`` and return ``{id: embedding}``."""
        if not texts:
            return {}

        ids = list(texts.keys())
        text_list = list(texts.values())
        embeddings = await self._batch_embed(text_list)

        return dict(zip(ids, embeddings))

    async def _embed_list(self, texts: List[str]) -> List[List[float]]:
        """Embed a plain list of texts."""
        if not texts:
            return []
        return await self._batch_embed(texts)

    async def _batch_embed(self, texts: List[str]) -> List[List[float]]:
        """Call the embedder in batches of ``self.batch_size``."""
        if len(texts) <= self.batch_size:
            return await self.embedder_client.response(texts)

        batches = [
            texts[i : i + self.batch_size]
            for i in range(0, len(texts), self.batch_size)
        ]
        batch_results = await asyncio.gather(
            *(self.embedder_client.response(b) for b in batches)
        )
        embeddings: List[List[float]] = []
        for result in batch_results:
            embeddings.extend(result)
        return embeddings
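
# Illustrative sketch (hypothetical embedder): a partial embedding run that only
# populates statement_texts, as the module docstring allows. FakeEmbedder stands
# in for the real client, assumed to expose `async response(texts)` returning one
# vector per input text, as _batch_embed above expects.
import asyncio

class FakeEmbedder:
    async def response(self, texts):
        return [[0.0, 0.0, 0.0] for _ in texts]  # dummy fixed-size vectors

async def demo() -> None:
    step = EmbeddingStep(embedder_client=FakeEmbedder(), batch_size=2)
    out = await step.run(EmbeddingStepInput(statement_texts={"s1": "I moved to Berlin."}))
    assert "s1" in out.statement_embeddings and out.chunk_embeddings == {}

asyncio.run(demo())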
@@ -0,0 +1,80 @@
"""EmotionExtractionStep — sidecar step for extracting emotion from statements.

Replaces the legacy ``EmotionExtractionService`` with the unified ExtractionStep
paradigm. Registered via ``@SidecarStepFactory.register`` so the orchestrator
picks it up automatically when ``emotion_enabled`` is ``True``.
"""

import logging
from typing import Any

from app.core.memory.models.emotion_models import EmotionExtraction
from app.core.memory.utils.prompt.prompt_utils import render_emotion_extraction_prompt

from .base import ExtractionStep, StepContext
from ..sidecar_factory import SidecarStepFactory, SidecarTiming
from .schema import EmotionStepInput, EmotionStepOutput

logger = logging.getLogger(__name__)


@SidecarStepFactory.register("emotion_enabled", SidecarTiming.AFTER_STATEMENT)
class EmotionExtractionStep(ExtractionStep[EmotionStepInput, EmotionStepOutput]):
    """Extract emotion type, intensity, and keywords from a statement.

    This is a **sidecar** (non-critical) step — failure returns a neutral
    default without aborting the pipeline.

    The step self-registers with ``SidecarStepFactory`` under the config key
    ``emotion_enabled`` and timing ``AFTER_STATEMENT``.
    """

    def __init__(self, context: StepContext) -> None:
        super().__init__(context)
        # Emotion-specific config flags (may live on a MemoryConfig object
        # attached to context.config or as top-level attributes).
        self.extract_keywords = getattr(self.config, "emotion_extract_keywords", True)
        self.enable_subject = getattr(self.config, "emotion_enable_subject", False)

    # ── Identity ──

    @property
    def name(self) -> str:
        return "emotion_extraction"

    @property
    def is_critical(self) -> bool:
        return False

    # ── Config-driven skip ──

    def should_skip(self) -> bool:
        return not getattr(self.config, "emotion_enabled", False)

    # ── Lifecycle ──

    async def render_prompt(self, input_data: EmotionStepInput) -> str:
        return await render_emotion_extraction_prompt(
            statement=input_data.statement_text,
            extract_keywords=self.extract_keywords,
            enable_subject=self.enable_subject,
            language=self.language,
        )

    async def call_llm(self, prompt: Any) -> Any:
        messages = [{"role": "user", "content": prompt}]
        return await self.llm_client.response_structured(
            messages, EmotionExtraction
        )

    async def parse_response(
        self, raw_response: Any, input_data: EmotionStepInput
    ) -> EmotionStepOutput:
        return EmotionStepOutput(
            emotion_type=getattr(raw_response, "emotion_type", "neutral"),
            emotion_intensity=getattr(raw_response, "emotion_intensity", 0.0),
            emotion_keywords=getattr(raw_response, "emotion_keywords", []),
        )

    def get_default_output(self) -> EmotionStepOutput:
        return EmotionStepOutput()
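
# Illustrative sketch (hypothetical sidecar): adding another hot-pluggable step
# under the same contract. The "toxicity_enabled" flag, the prompt, and the reuse
# of the emotion schemas are assumptions; only the registration pattern mirrors
# the real step above.
@SidecarStepFactory.register("toxicity_enabled", SidecarTiming.AFTER_STATEMENT)
class ToxicityCheckStep(ExtractionStep[EmotionStepInput, EmotionStepOutput]):
    @property
    def name(self) -> str:
        return "toxicity_check"

    @property
    def is_critical(self) -> bool:
        return False  # sidecar: degrade to defaults on failure

    def should_skip(self) -> bool:
        return not getattr(self.config, "toxicity_enabled", False)

    async def render_prompt(self, input_data: EmotionStepInput) -> str:
        return f"Classify the emotional tone of: {input_data.statement_text}"

    async def call_llm(self, prompt: Any) -> Any:
        return await self.llm_client.response_structured(
            [{"role": "user", "content": prompt}], EmotionExtraction
        )

    async def parse_response(self, raw_response: Any, input_data: EmotionStepInput) -> EmotionStepOutput:
        return EmotionStepOutput(emotion_type=getattr(raw_response, "emotion_type", "neutral"))

    def get_default_output(self) -> EmotionStepOutput:
        return EmotionStepOutput()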
@@ -0,0 +1,456 @@
"""
GraphBuildStep — builds Neo4j graph nodes and edges from DialogData.

Responsibilities:
- Walk the DialogData list and build DialogueNode, ChunkNode, StatementNode,
  ExtractedEntityNode, PerceptualNode, and every Edge type
- No LLM calls, no deduplication, no Neo4j writes

Dependencies:
- embedder_client (optional): generates the summary embedding for PerceptualNode
- progress_callback (optional): streams relationship-creation progress

Extracted from ExtractionOrchestrator._create_nodes_and_edges(); the legacy
orchestrator keeps its original method unchanged, so the old and new pipelines
stay fully isolated.
"""
from __future__ import annotations

import logging
from typing import Any, Awaitable, Callable, Dict, List, Optional

from app.core.memory.models.graph_models import (
    ChunkNode,
    DialogueNode,
    EntityEntityEdge,
    ExtractedEntityNode,
    PerceptualEdge,
    PerceptualNode,
    StatementChunkEdge,
    StatementEntityEdge,
    StatementNode,
    AssistantOriginalNode,
    AssistantPrunedNode,
    AssistantPrunedEdge,
    AssistantDialogEdge,
)
from app.core.memory.models.message_models import DialogData, TemporalInfo

logger = logging.getLogger(__name__)


class GraphBuildResult:
    """Output of the graph-build step."""

    __slots__ = (
        "dialogue_nodes",
        "chunk_nodes",
        "statement_nodes",
        "entity_nodes",
        "perceptual_nodes",
        "stmt_chunk_edges",
        "stmt_entity_edges",
        "entity_entity_edges",
        "perceptual_edges",
        "assistant_original_nodes",
        "assistant_pruned_nodes",
        "assistant_pruned_edges",
        "assistant_dialog_edges",
    )

    def __init__(
        self,
        dialogue_nodes: List[DialogueNode],
        chunk_nodes: List[ChunkNode],
        statement_nodes: List[StatementNode],
        entity_nodes: List[ExtractedEntityNode],
        perceptual_nodes: List[PerceptualNode],
        stmt_chunk_edges: List[StatementChunkEdge],
        stmt_entity_edges: List[StatementEntityEdge],
        entity_entity_edges: List[EntityEntityEdge],
        perceptual_edges: List[PerceptualEdge],
        assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None,
        assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None,
        assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None,
        assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None,
    ):
        self.dialogue_nodes = dialogue_nodes
        self.chunk_nodes = chunk_nodes
        self.statement_nodes = statement_nodes
        self.entity_nodes = entity_nodes
        self.perceptual_nodes = perceptual_nodes
        self.stmt_chunk_edges = stmt_chunk_edges
        self.stmt_entity_edges = stmt_entity_edges
        self.entity_entity_edges = entity_entity_edges
        self.perceptual_edges = perceptual_edges
        self.assistant_original_nodes = assistant_original_nodes or []
        self.assistant_pruned_nodes = assistant_pruned_nodes or []
        self.assistant_pruned_edges = assistant_pruned_edges or []
        self.assistant_dialog_edges = assistant_dialog_edges or []


async def build_graph_nodes_and_edges(
    dialog_data_list: List[DialogData],
    embedder_client: Any = None,
    progress_callback: Optional[
        Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
    ] = None,
) -> GraphBuildResult:
    """
    Build the complete set of graph nodes and edges from a DialogData list.

    Args:
        dialog_data_list: DialogData list after extraction and data assignment
        embedder_client: Optional embedding client for PerceptualNode summary embeddings
        progress_callback: Optional progress callback

    Returns:
        GraphBuildResult containing all nodes and edges
    """
    logger.info("Starting node and edge creation")

    dialogue_nodes: List[DialogueNode] = []
    chunk_nodes: List[ChunkNode] = []
    statement_nodes: List[StatementNode] = []
    entity_nodes: List[ExtractedEntityNode] = []
    perceptual_nodes: List[PerceptualNode] = []
    stmt_chunk_edges: List[StatementChunkEdge] = []
    stmt_entity_edges: List[StatementEntityEdge] = []
    entity_entity_edges: List[EntityEntityEdge] = []
    perceptual_edges: List[PerceptualEdge] = []

    entity_id_set: set = set()
    total_dialogs = len(dialog_data_list)
    processed_dialogs = 0

    for dialog_data in dialog_data_list:
        processed_dialogs += 1
        # region TODO(乐力齐): revisit after the refactored pipeline switch is stable in production
        # ── Dialogue node ──
        dialogue_node = DialogueNode(
            id=dialog_data.id,
            name=f"Dialog_{dialog_data.id}",
            ref_id=dialog_data.ref_id,
            end_user_id=dialog_data.end_user_id,
            run_id=dialog_data.run_id,
            content=dialog_data.context.content if dialog_data.context else "",
            dialog_embedding=dialog_data.dialog_embedding if hasattr(dialog_data, "dialog_embedding") else None,
            created_at=dialog_data.created_at,
            metadata=dialog_data.metadata,
            config_id=dialog_data.config_id if hasattr(dialog_data, "config_id") else None,
        )
        dialogue_nodes.append(dialogue_node)

        # ── Chunk nodes ──
        for chunk_idx, chunk in enumerate(dialog_data.chunks):
            chunk_node = ChunkNode(
                id=chunk.id,
                name=f"Chunk_{chunk.id}",
                dialog_id=dialog_data.id,
                end_user_id=dialog_data.end_user_id,
                run_id=dialog_data.run_id,
                content=chunk.content,
                speaker=getattr(chunk, "speaker", None),
                chunk_embedding=chunk.chunk_embedding,
                sequence_number=chunk_idx,
                created_at=dialog_data.created_at,
                metadata=chunk.metadata,
            )
            chunk_nodes.append(chunk_node)

            # ── Perceptual nodes ──
            for p, file_type in chunk.files:
                meta = p.meta_data or {}
                content_meta = meta.get("content", {})

                summary_embedding = None
                if embedder_client and p.summary:
                    try:
                        summary_embedding = (await embedder_client.response([p.summary]))[0]
                    except Exception as emb_err:
                        logger.warning(f"Failed to embed perceptual summary: {emb_err}")

                perceptual = PerceptualNode(
                    name=f"Perceptual_{p.id}",
                    id=str(p.id),
                    end_user_id=str(p.end_user_id),
                    perceptual_type=p.perceptual_type,
                    file_path=p.file_path or "",
                    file_name=p.file_name or "",
                    file_ext=p.file_ext or "",
                    summary=p.summary or "",
                    keywords=content_meta.get("keywords", []),
                    topic=content_meta.get("topic", ""),
                    domain=content_meta.get("domain", ""),
                    created_at=p.created_time.isoformat() if p.created_time else None,
                    file_type=file_type,
                    summary_embedding=summary_embedding,
                )
                perceptual_nodes.append(perceptual)
                perceptual_edges.append(
                    PerceptualEdge(
                        source=perceptual.id,
                        target=chunk.id,
                        end_user_id=dialog_data.end_user_id,
                        run_id=dialog_data.run_id,
                        created_at=dialog_data.created_at,
                    )
                )

            # ── Statement nodes + edges ──
            for statement in chunk.statements:
                statement_node = StatementNode(
                    id=statement.id,
                    name=f"Statement_{statement.id}",
                    chunk_id=chunk.id,
                    stmt_type=getattr(statement, "stmt_type", "general"),
                    temporal_info=getattr(statement, "temporal_info", TemporalInfo.ATEMPORAL),
                    connect_strength=(
                        statement.connect_strength
                        if statement.connect_strength is not None
                        else "Strong"
                    ),
                    end_user_id=dialog_data.end_user_id,
                    run_id=dialog_data.run_id,
                    statement=statement.statement,
                    speaker=getattr(statement, "speaker", None),
                    statement_embedding=statement.statement_embedding,
                    valid_at=(
                        statement.temporal_validity.valid_at
                        if hasattr(statement, "temporal_validity") and statement.temporal_validity
                        else None
                    ),
                    invalid_at=(
                        statement.temporal_validity.invalid_at
                        if hasattr(statement, "temporal_validity") and statement.temporal_validity
                        else None
                    ),
                    created_at=dialog_data.created_at,
                    dialog_at=getattr(statement, "dialog_at", None),
                    config_id=dialog_data.config_id if hasattr(dialog_data, "config_id") else None,
                    emotion_type=getattr(statement, "emotion_type", None),
                    emotion_intensity=getattr(statement, "emotion_intensity", None),
                    emotion_keywords=getattr(statement, "emotion_keywords", None),
                    emotion_subject=getattr(statement, "emotion_subject", None),
                    emotion_target=getattr(statement, "emotion_target", None),
                )
                statement_nodes.append(statement_node)

                stmt_chunk_edges.append(
                    StatementChunkEdge(
                        source=statement.id,
                        target=chunk.id,
                        end_user_id=dialog_data.end_user_id,
                        run_id=dialog_data.run_id,
                        created_at=dialog_data.created_at,
                    )
                )

                # ── Triplets → entity nodes + edges ──
                if not statement.triplet_extraction_info:
                    continue

                triplet_info = statement.triplet_extraction_info
                entity_idx_to_id: Dict[int, str] = {}

                for entity_idx, entity in enumerate(triplet_info.entities):
                    entity_idx_to_id[entity.entity_idx] = entity.id
                    entity_idx_to_id[entity_idx] = entity.id

                    if entity.id not in entity_id_set:
                        entity_connect_strength = getattr(entity, "connect_strength", "Strong")
                        entity_node = ExtractedEntityNode(
                            id=entity.id,
                            name=getattr(entity, "name", f"Entity_{entity.id}"),
                            entity_idx=entity.entity_idx,
                            statement_id=statement.id,
                            entity_type=getattr(entity, "type", "unknown"),
                            type_description=getattr(entity, "type_description", ""),
                            description=getattr(entity, "description", ""),
                            example=getattr(entity, "example", ""),
                            connect_strength=(
                                entity_connect_strength
                                if entity_connect_strength is not None
                                else "Strong"
                            ),
                            aliases=getattr(entity, "aliases", []) or [],
                            name_embedding=getattr(entity, "name_embedding", None),
                            is_explicit_memory=getattr(entity, "is_explicit_memory", False),
                            end_user_id=dialog_data.end_user_id,
                            run_id=dialog_data.run_id,
                            created_at=dialog_data.created_at,
                            config_id=dialog_data.config_id if hasattr(dialog_data, "config_id") else None,
                        )
                        entity_nodes.append(entity_node)
                        entity_id_set.add(entity.id)

                    entity_connect_strength = getattr(entity, "connect_strength", "Strong")
                    stmt_entity_edges.append(
                        StatementEntityEdge(
                            source=statement.id,
                            target=entity.id,
                            connect_strength=(
                                entity_connect_strength
                                if entity_connect_strength is not None
                                else "Strong"
                            ),
                            end_user_id=dialog_data.end_user_id,
                            run_id=dialog_data.run_id,
                            created_at=dialog_data.created_at,
                        )
                    )
                # endregion

                for triplet in triplet_info.triplets:
                    subject_entity_id = entity_idx_to_id.get(triplet.subject_id)
                    object_entity_id = entity_idx_to_id.get(triplet.object_id)

                    if subject_entity_id and object_entity_id:
                        _tv = getattr(statement, "temporal_validity", None)
                        entity_entity_edges.append(
                            EntityEntityEdge(
                                source=subject_entity_id,
                                target=object_entity_id,
                                relation_type=triplet.predicate,
                                relation_type_description=getattr(triplet, "predicate_description", ""),
                                statement=statement.statement,
                                source_statement_id=statement.id,
                                end_user_id=dialog_data.end_user_id,
                                run_id=dialog_data.run_id,
                                created_at=dialog_data.created_at,
                                valid_at=_tv.valid_at if _tv else None,
                                invalid_at=_tv.invalid_at if _tv else None,
                            )
                        )

                        if progress_callback and len(entity_entity_edges) <= 10:
                            relationship_result = {
                                "result_type": "relationship_creation",
                                "relationship_index": len(entity_entity_edges),
                                "source_entity": triplet.subject_name,
                                "relation_type": triplet.predicate,
                                "target_entity": triplet.object_name,
                                "relationship_text": f"{triplet.subject_name} -[{triplet.predicate}]-> {triplet.object_name}",
                                "dialog_progress": f"{processed_dialogs}/{total_dialogs}",
                            }
                            await progress_callback(
                                "creating_nodes_edges_result",
                                f"Creating relationships ({processed_dialogs}/{total_dialogs})",
                                relationship_result,
                            )
                    else:
                        missing_subject = "subject" if not subject_entity_id else ""
                        missing_object = "object" if not object_entity_id else ""
                        missing_both = " and " if (not subject_entity_id and not object_entity_id) else ""
                        logger.debug(
                            f"Skipping triplet - could not resolve {missing_subject}{missing_both}{missing_object} entity ID: "
                            f"subject_id={triplet.subject_id} ({triplet.subject_name}), "
                            f"object_id={triplet.object_id} ({triplet.object_name}), "
                            f"predicate={triplet.predicate}, "
                            f"statement_id={statement.id}, "
                            f"available_indices={sorted(entity_idx_to_id.keys())}"
                        )

    logger.info(
        f"Node and edge creation complete - dialogue nodes: {len(dialogue_nodes)}, "
        f"chunk nodes: {len(chunk_nodes)}, statement nodes: {len(statement_nodes)}, "
        f"entity nodes: {len(entity_nodes)}, statement-chunk edges: {len(stmt_chunk_edges)}, "
        f"statement-entity edges: {len(stmt_entity_edges)}, "
        f"entity-entity edges: {len(entity_entity_edges)}"
    )

    # ── Assistant pruning nodes and edges ──
    assistant_original_nodes: List[AssistantOriginalNode] = []
    assistant_pruned_nodes: List[AssistantPrunedNode] = []
    assistant_pruned_edges: List[AssistantPrunedEdge] = []
    assistant_dialog_edges: List[AssistantDialogEdge] = []

    for dialog_data in dialog_data_list:
        pruning_records = dialog_data.metadata.get("assistant_pruning_records", [])
        for record in pruning_records:
            pair_id = record["pair_id"]
            original_id = f"ao_{pair_id}"
            pruned_id = f"ap_{pair_id}"

            # AssistantOriginal is always created (records the original dialogue)
            original_node = AssistantOriginalNode(
                id=original_id,
                name=f"AssistantOriginal_{pair_id[:8]}",
                end_user_id=dialog_data.end_user_id,
                run_id=dialog_data.run_id,
                created_at=dialog_data.created_at,
                pair_id=pair_id,
                dialog_id=dialog_data.id,
                text=record["original_text"],
            )
            assistant_original_nodes.append(original_node)

            # BELONGS_TO_DIALOG: Original → Dialogue
            assistant_dialog_edges.append(AssistantDialogEdge(
                source=original_id,
                target=dialog_data.id,
                end_user_id=dialog_data.end_user_id,
                run_id=dialog_data.run_id,
                created_at=dialog_data.created_at,
            ))

            # When pruned_text is "NULL", skip creating the AssistantPruned node and PRUNED_TO edge
            if record["pruned_text"] == "NULL":
                continue

            pruned_node = AssistantPrunedNode(
                id=pruned_id,
                name=f"AssistantPruned_{pair_id[:8]}",
                end_user_id=dialog_data.end_user_id,
                run_id=dialog_data.run_id,
                created_at=dialog_data.created_at,
                pair_id=pair_id,
                dialog_id=dialog_data.id,
                text=record["pruned_text"],
                memory_type=record["memory_type"],
            )
            assistant_pruned_nodes.append(pruned_node)

            # PRUNED_TO: Original → Pruned
            assistant_pruned_edges.append(AssistantPrunedEdge(
                source=original_id,
                target=pruned_id,
                end_user_id=dialog_data.end_user_id,
                run_id=dialog_data.run_id,
                created_at=dialog_data.created_at,
                pair_id=pair_id,
            ))

    if assistant_original_nodes:
        logger.info(
            f"Assistant pruning nodes created - "
            f"original nodes: {len(assistant_original_nodes)}, "
            f"pruned nodes: {len(assistant_pruned_nodes)}"
        )

    if progress_callback:
        nodes_edges_stats = {
            "dialogue_nodes_count": len(dialogue_nodes),
            "chunk_nodes_count": len(chunk_nodes),
            "statement_nodes_count": len(statement_nodes),
            "entity_nodes_count": len(entity_nodes),
            "statement_chunk_edges_count": len(stmt_chunk_edges),
            "statement_entity_edges_count": len(stmt_entity_edges),
            "entity_entity_edges_count": len(entity_entity_edges),
        }
        await progress_callback("creating_nodes_edges_complete", "Node and edge creation complete", nodes_edges_stats)

    return GraphBuildResult(
        dialogue_nodes=dialogue_nodes,
        chunk_nodes=chunk_nodes,
        statement_nodes=statement_nodes,
        entity_nodes=entity_nodes,
        perceptual_nodes=perceptual_nodes,
        stmt_chunk_edges=stmt_chunk_edges,
        stmt_entity_edges=stmt_entity_edges,
        entity_entity_edges=entity_entity_edges,
        perceptual_edges=perceptual_edges,
        assistant_original_nodes=assistant_original_nodes,
        assistant_pruned_nodes=assistant_pruned_nodes,
        assistant_pruned_edges=assistant_pruned_edges,
        assistant_dialog_edges=assistant_dialog_edges,
    )
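
# Illustrative sketch (hypothetical data): run the builder on an already-extracted
# DialogData list and inspect the counts. How DialogData is produced upstream is
# assumed, not shown in this diff.
import asyncio

async def demo(dialogs: List[DialogData]) -> None:
    result = await build_graph_nodes_and_edges(dialogs, embedder_client=None)
    print(f"{len(result.statement_nodes)} statements, "
          f"{len(result.entity_entity_edges)} entity-entity edges")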
@@ -0,0 +1,89 @@
"""MetadataExtractionStep — user-entity metadata extraction step.

Extracts structured metadata (core_facts, traits, relations, etc.) from a user
entity's description. It runs as an async Celery task after dedup and
disambiguation finish, and the result is written back to the Neo4j
ExtractedEntity node.

Not registered as an automatic SidecarStepFactory sidecar (it runs
asynchronously after dedup, outside the main extraction flow); the Celery task
instantiates and invokes it directly.
"""

import json
import logging
from typing import Any

from .base import ExtractionStep, StepContext
from .schema import MetadataStepInput, MetadataStepOutput

logger = logging.getLogger(__name__)


class MetadataExtractionStep(ExtractionStep[MetadataStepInput, MetadataStepOutput]):
    """Extract structured metadata from a user entity's description.

    Non-critical step — failure returns empty defaults without interrupting the flow.
    """

    def __init__(self, context: StepContext) -> None:
        super().__init__(context)

    @property
    def name(self) -> str:
        return "metadata_extraction"

    @property
    def is_critical(self) -> bool:
        return False

    @property
    def max_retries(self) -> int:
        return 1

    async def render_prompt(self, input_data: MetadataStepInput) -> str:
        """Render the metadata-extraction prompt with a Jinja2 template."""
        from app.core.memory.utils.prompt.prompt_utils import prompt_env

        template = prompt_env.get_template("extract_user_metadata.jinja2")

        input_json = json.dumps(
            {
                "description": input_data.descriptions,
                "existing_metadata": input_data.existing_metadata,
            },
            ensure_ascii=False,
            indent=2,
        )

        return template.render(
            language=self.language,
            input_json=input_json,
        )

    async def call_llm(self, prompt: Any) -> Any:
        """Call the LLM for structured output."""
        from app.core.memory.models.metadata_models import MetadataExtractionResponse

        messages = [{"role": "user", "content": prompt}]
        return await self.llm_client.response_structured(
            messages, MetadataExtractionResponse
        )

    async def parse_response(
        self, raw_response: Any, input_data: MetadataStepInput
    ) -> MetadataStepOutput:
        """Parse the LLM response into a MetadataStepOutput."""
        if raw_response is None:
            return self.get_default_output()

        return MetadataStepOutput(
            core_facts=getattr(raw_response, "core_facts", []) or [],
            traits=getattr(raw_response, "traits", []) or [],
            relations=getattr(raw_response, "relations", []) or [],
            goals=getattr(raw_response, "goals", []) or [],
            interests=getattr(raw_response, "interests", []) or [],
            beliefs_or_stances=getattr(raw_response, "beliefs_or_stances", []) or [],
            anchors=getattr(raw_response, "anchors", []) or [],
            events=getattr(raw_response, "events", []) or [],
        )

    def get_default_output(self) -> MetadataStepOutput:
        return MetadataStepOutput()
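
# Illustrative sketch (hypothetical Celery glue): direct instantiation as the
# module docstring describes. Assumes the base ExtractionStep.run() drives the
# render_prompt → call_llm → parse_response lifecycle and that the caller has
# already built a StepContext; the Neo4j write-back is only indicated.
async def extract_for_entity(context: StepContext, entity_id: str, descriptions: list) -> MetadataStepOutput:
    step = MetadataExtractionStep(context)
    out = await step.run(MetadataStepInput(entity_id=entity_id, entity_name="", descriptions=descriptions))
    if out.has_any():
        logger.info("Extracted %d core facts for entity %s", len(out.core_facts), entity_id)
    return out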
@@ -0,0 +1,47 @@
"""Schema package for ExtractionStep inputs and outputs.

Re-exports all models for convenient access:
    from .schema import StatementStepInput, EmotionStepOutput, ...
"""

from .extraction_step_schema import (
    EmbeddingStepInput,
    EmbeddingStepOutput,
    EntityItem,
    MessageItem,
    StatementStepInput,
    StatementStepOutput,
    SupportingContext,
    TripletItem,
    TripletStepInput,
    TripletStepOutput,
)
from .sidecar_step_schema import (
    EmotionStepInput,
    EmotionStepOutput,
    MetadataStepInput,
    MetadataStepOutput,
)

__all__ = [
    # Shared
    "MessageItem",
    "SupportingContext",
    # Statement
    "StatementStepInput",
    "StatementStepOutput",
    # Triplet
    "TripletStepInput",
    "TripletStepOutput",
    "EntityItem",
    "TripletItem",
    # Embedding
    "EmbeddingStepInput",
    "EmbeddingStepOutput",
    # Sidecar — Emotion
    "EmotionStepInput",
    "EmotionStepOutput",
    # Sidecar — Metadata
    "MetadataStepInput",
    "MetadataStepOutput",
]
@@ -0,0 +1,123 @@
"""Pydantic models for base extraction pipeline inputs and outputs.

Covers the core (critical) stages: Statement extraction, Triplet extraction,
Embedding generation, and shared types used across stages.

Malformed LLM JSON will raise ``ValidationError`` and trigger stage-level retry.
"""

from typing import Dict, List
from pydantic import BaseModel, Field


# ── Shared types ──


class MessageItem(BaseModel):
    """Single conversation message."""

    role: str  # "User" / "Assistant"
    msg: str


class SupportingContext(BaseModel):
    """Dialogue context window (used for pronoun resolution, etc.)."""

    msgs: List[MessageItem] = Field(default_factory=list)


# ── Statement extraction ──
class StatementStepInput(BaseModel):
    """Input for StatementTemporalExtractionStep."""

    chunk_id: str
    end_user_id: str
    target_content: str
    target_message_date: str
    dialog_at: str = ""  # ISO 8601 timestamp of the source message; used as "now" for relative time resolution
    supporting_context: SupportingContext


class StatementStepOutput(BaseModel):
    """Single extracted statement (including temporal info)."""

    statement_id: str
    statement_text: str
    statement_type: str  # FACT / OPINION / PREDICTION / SUGGESTION
    temporal_type: str  # STATIC / DYNAMIC / ATEMPORAL
    # relevance: str  # RELEVANT / IRRELEVANT
    speaker: str  # "user" / "assistant"
    has_emotional_state: bool = False  # Whether statement reflects user's emotional state
    valid_at: str  # ISO 8601 or "NULL"
    invalid_at: str  # ISO 8601 or "NULL"
    has_unsolved_reference: bool = False  # Whether the statement has unresolved references
    dialog_at: str = ""  # Passed through from input; carried into TripletStepInput


# ── Triplet extraction ──
class TripletStepInput(BaseModel):
    """Input for TripletExtractionStep."""

    statement_id: str
    statement_text: str
    statement_type: str
    temporal_type: str
    supporting_context: SupportingContext
    speaker: str
    dialog_at: str = ""  # ISO 8601 timestamp of the source message; helps LLM ground entity descriptions in time
    valid_at: str
    invalid_at: str
    has_unsolved_reference: bool = False  # From upstream statement extraction


class EntityItem(BaseModel):
    """Single entity extracted during triplet extraction."""

    entity_idx: int
    name: str
    type: str
    type_description: str = ""
    description: str
    is_explicit_memory: bool = False


class TripletItem(BaseModel):
    """Single triplet (subject-predicate-object) relationship."""

    subject_name: str
    subject_id: int
    predicate: str
    predicate_description: str = ""
    object_name: str
    object_id: int


class TripletStepOutput(BaseModel):
    """Output of TripletExtractionStep."""

    entities: List[EntityItem] = Field(default_factory=list)
    triplets: List[TripletItem] = Field(default_factory=list)


# ── Embedding generation ──
class EmbeddingStepInput(BaseModel):
    """Input for EmbeddingStep.

    Each dict maps an ID to the text that should be embedded.
    Fields can be left empty for partial embedding runs.
    """

    statement_texts: Dict[str, str] = Field(default_factory=dict)
    chunk_texts: Dict[str, str] = Field(default_factory=dict)
    dialog_texts: List[str] = Field(default_factory=list)
    entity_names: Dict[str, str] = Field(default_factory=dict)
    entity_descriptions: Dict[str, str] = Field(default_factory=dict)


class EmbeddingStepOutput(BaseModel):
    """Output of EmbeddingStep."""

    statement_embeddings: Dict[str, List[float]] = Field(default_factory=dict)
    chunk_embeddings: Dict[str, List[float]] = Field(default_factory=dict)
    dialog_embeddings: List[List[float]] = Field(default_factory=list)
    entity_embeddings: Dict[str, List[float]] = Field(default_factory=dict)
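
# Illustrative sketch: malformed LLM JSON fails Pydantic validation, which is
# what the module docstring relies on for stage-level retry.
from pydantic import ValidationError

try:
    TripletStepOutput.model_validate({"entities": [{"name": "Berlin"}]})
except ValidationError as err:
    print(err.error_count(), "validation errors")  # entity_idx, type, description missing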
@@ -0,0 +1,62 @@
"""Pydantic models for hot-pluggable sidecar step inputs and outputs.

Sidecar steps are non-critical (is_critical=False) modules registered via
``@SidecarStepFactory.register`` that run concurrently alongside the main
extraction pipeline. Failures degrade gracefully to default outputs.
"""

from typing import List
from pydantic import BaseModel, Field


# ── Emotion extraction (sidecar) ──
class EmotionStepInput(BaseModel):
    """Input for EmotionExtractionStep."""

    statement_id: str
    statement_text: str
    speaker: str


class EmotionStepOutput(BaseModel):
    """Output of EmotionExtractionStep."""

    emotion_type: str = "neutral"
    emotion_intensity: float = 0.0
    emotion_keywords: List[str] = Field(default_factory=list)


# ── Metadata extraction (async post-dedup) ──
class MetadataStepInput(BaseModel):
    """Input for MetadataExtractionStep."""

    entity_id: str
    entity_name: str
    descriptions: List[str] = Field(
        default_factory=list,
        description="Description list for the user entity (possibly split on semicolons)",
    )
    existing_metadata: dict = Field(
        default_factory=dict,
        description="Metadata already stored in Neo4j, used for incremental dedup",
    )


class MetadataStepOutput(BaseModel):
    """Output of MetadataExtractionStep."""

    core_facts: List[str] = Field(default_factory=list)
    traits: List[str] = Field(default_factory=list)
    relations: List[str] = Field(default_factory=list)
    goals: List[str] = Field(default_factory=list)
    interests: List[str] = Field(default_factory=list)
    beliefs_or_stances: List[str] = Field(default_factory=list)
    anchors: List[str] = Field(default_factory=list)
    events: List[str] = Field(default_factory=list)

    def has_any(self) -> bool:
        """Whether any metadata was extracted."""
        return any([
            self.core_facts, self.traits, self.relations, self.goals,
            self.interests, self.beliefs_or_stances, self.anchors, self.events,
        ])
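
# Illustrative sketch: has_any() reports whether any field was populated, so
# callers can skip writing empty results back to Neo4j.
assert MetadataStepOutput().has_any() is False
assert MetadataStepOutput(traits=["early riser"]).has_any() is True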
@@ -0,0 +1,174 @@
"""StatementTemporalExtractionStep — critical step for extracting statements and temporal info from chunks.

Replaces the legacy ``StatementExtractor`` with the unified ExtractionStep paradigm.
Temporal extraction logic (valid_at / invalid_at) is merged into this step,
eliminating the need for a separate ``TemporalExtractor`` call.
"""

import logging
import uuid
from typing import Any, List

from pydantic import AliasChoices, BaseModel, Field, field_validator

from app.core.memory.utils.data.ontology import LABEL_DEFINITIONS
from app.core.memory.utils.prompt.prompt_utils import render_statement_extraction_prompt

from .base import ExtractionStep, StepContext
from .schema import StatementStepInput, StatementStepOutput

logger = logging.getLogger(__name__)


# ── LLM response schemas (internal) ──


class _ExtractedStatement(BaseModel):
    """Raw statement returned by the LLM (before enrichment)."""

    statement: str = Field(
        ...,
        validation_alias=AliasChoices("statement", "statement_text"),
        description="The extracted statement text",
    )
    statement_type: str = Field(..., description="FACT / OPINION / OTHER")
    temporal_type: str = Field(..., description="STATIC / DYNAMIC / ATEMPORAL")
    # relevance: str = Field("RELEVANT", description="RELEVANT / IRRELEVANT")
    has_emotional_state: bool = Field(
        False,
        description="Whether the statement reflects user's emotional state",
    )
    dialog_at: str = Field("", description="ISO 8601 session timestamp, copied verbatim from input")
    valid_at: str = Field("NULL", description="ISO 8601 or NULL")
    invalid_at: str = Field("NULL", description="ISO 8601 or NULL")
    has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references")


class _StatementExtractionResponse(BaseModel):
    """Structured LLM response containing a list of extracted statements."""

    statements: List[_ExtractedStatement] = Field(default_factory=list)

    @field_validator("statements", mode="before")
    @classmethod
    def filter_empty(cls, v: Any) -> Any:
        """Drop empty / malformed dicts that the LLM occasionally produces."""
        if isinstance(v, list):
            return [
                s
                for s in v
                if isinstance(s, dict)
                and (s.get("statement") or s.get("statement_text"))
            ]
        return v


class StatementTemporalExtractionStep(ExtractionStep[StatementStepInput, List[StatementStepOutput]]):
    """Extract atomic statements with temporal info (valid_at / invalid_at) from a dialogue chunk.

    This is a **critical** step — failure aborts the pipeline after retries.

    Config params bound at init (from ``StepContext.config.statement_extraction``):
        * ``definitions`` — label definitions for statement classification
        * ``json_schema`` — JSON schema for the expected LLM output
        * ``granularity`` — extraction granularity level (1-3)
        * ``include_dialogue_context`` — whether to include full dialogue context
    """

    def __init__(self, context: StepContext) -> None:
        super().__init__(context)
        stmt_cfg = getattr(self.config, "statement_extraction", None)
        self.definitions = LABEL_DEFINITIONS
        self.json_schema = _ExtractedStatement.model_json_schema()
        self.granularity = getattr(stmt_cfg, "statement_granularity", None)
        self.include_dialogue_context = getattr(stmt_cfg, "include_dialogue_context", True)
        self.max_dialogue_context_chars = getattr(stmt_cfg, "max_dialogue_context_chars", 2000)

    # ── Identity ──

    @property
    def name(self) -> str:
        return "statement_extraction"

    @property
    def is_critical(self) -> bool:
        return True

    # ── Lifecycle ──

    async def render_prompt(self, input_data: StatementStepInput) -> str:
        # Build optional dialogue context from supporting_context messages
        dialogue_content = None
        if self.include_dialogue_context and input_data.supporting_context.msgs:
            dialogue_content = "\n".join(
                f"{m.role}: {m.msg}" for m in input_data.supporting_context.msgs
            )

        input_json = {
            "chunk_id": input_data.chunk_id,
            "end_user_id": input_data.end_user_id,
            "dialog_at": input_data.dialog_at or "",
            "target_content": input_data.target_content,
            "target_message_date": input_data.target_message_date,
            "supporting_context": {
                "msgs": [
                    {"role": m.role, "msg": m.msg}
                    for m in input_data.supporting_context.msgs
                ]
            },
        }

        return await render_statement_extraction_prompt(
            chunk_content=input_data.target_content,
            definitions=self.definitions,
            json_schema=self.json_schema,
            granularity=self.granularity,
            include_dialogue_context=self.include_dialogue_context,
            dialogue_content=dialogue_content,
            max_dialogue_chars=self.max_dialogue_context_chars,
            language=self.language,
            input_json=input_json,
        )

    async def call_llm(self, prompt: Any) -> Any:
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an expert at extracting and labeling atomic statements "
                    "from conversational text. Return valid JSON conforming to the schema."
                ),
            },
            {"role": "user", "content": prompt},
        ]
        return await self.llm_client.response_structured(
            messages, _StatementExtractionResponse
        )

    async def parse_response(
        self, raw_response: Any, input_data: StatementStepInput
    ) -> List[StatementStepOutput]:
        if not hasattr(raw_response, "statements") or raw_response.statements is None:
            return []

        results: List[StatementStepOutput] = []
        for stmt in raw_response.statements:
            results.append(
                StatementStepOutput(
                    statement_id=uuid.uuid4().hex,
                    statement_text=stmt.statement,
                    statement_type=stmt.statement_type.strip().upper(),
                    temporal_type=stmt.temporal_type.strip().upper(),
                    # relevance=stmt.relevance.strip().upper(),
                    speaker="user",  # default; orchestrator overrides from chunk metadata
                    has_emotional_state=getattr(stmt, "has_emotional_state", False),
                    dialog_at=input_data.dialog_at or "",  # carry through from input
                    valid_at=stmt.valid_at or "NULL",
                    invalid_at=stmt.invalid_at or "NULL",
                    has_unsolved_reference=getattr(stmt, "has_unsolved_reference", False),
                )
            )
        return results

    def get_default_output(self) -> List[StatementStepOutput]:
        return []
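
# Illustrative sketch: the AliasChoices alias and the filter_empty validator
# together tolerate two common LLM quirks — naming the field "statement_text"
# and emitting empty objects.
resp = _StatementExtractionResponse.model_validate({
    "statements": [
        {"statement_text": "User moved to Berlin.", "statement_type": "FACT", "temporal_type": "STATIC"},
        {},  # dropped by filter_empty before field validation
    ]
})
assert len(resp.statements) == 1
assert resp.statements[0].statement == "User moved to Berlin."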
@@ -0,0 +1,138 @@
"""TripletExtractionStep — critical step for extracting entities and triplets.

Replaces the legacy ``TripletExtractor`` with the unified ExtractionStep paradigm.
Raw triplets from the LLM are kept as-is in ``parse_response``; predicate
whitelist filtering against the ontology is not applied in this step.
"""

import logging
from typing import Any

from app.core.memory.models.triplet_models import TripletExtractionResponse
from app.core.memory.utils.data.ontology import PREDICATE_DEFINITIONS
from app.core.memory.utils.prompt.prompt_utils import render_triplet_extraction_prompt

from .base import ExtractionStep, StepContext
from .schema import EntityItem, TripletItem, TripletStepInput, TripletStepOutput

logger = logging.getLogger(__name__)


class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]):
    """Extract knowledge triplets and entities from a single statement.

    This is a **critical** step — failure aborts the pipeline after retries.

    Config params bound at init (from ``StepContext.config``):
        * ``ontology_types`` — predefined ontology types for entity classification
        * ``predicate_instructions`` — predicate definition guidance for the LLM
        * ``json_schema`` — JSON schema for the expected LLM output
    """

    def __init__(
        self,
        context: StepContext,
        ontology_types: Any = None,
    ) -> None:
        super().__init__(context)
        self.ontology_types = ontology_types
        self.predicate_instructions = PREDICATE_DEFINITIONS
        self.json_schema = TripletExtractionResponse.model_json_schema()

    # ── Identity ──

    @property
    def name(self) -> str:
        return "triplet_extraction"

    @property
    def is_critical(self) -> bool:
        return True

    # ── Lifecycle ──

    async def render_prompt(self, input_data: TripletStepInput) -> str:
        # Build chunk_content from supporting_context for pronoun resolution
        chunk_content = "\n".join(
            f"{m.role}: {m.msg}" for m in input_data.supporting_context.msgs
        ) if input_data.supporting_context.msgs else ""

        input_json = {
            "statement_id": input_data.statement_id,
            "statement_text": input_data.statement_text,
            "statement_type": input_data.statement_type,
            "temporal_type": input_data.temporal_type,
            "supporting_context": {
                "msgs": [
                    {"role": m.role, "msg": m.msg}
                    for m in input_data.supporting_context.msgs
                ]
            },
            "speaker": input_data.speaker,
            "dialog_at": input_data.dialog_at or "",
            "valid_at": input_data.valid_at,
            "invalid_at": input_data.invalid_at,
            "has_unsolved_reference": input_data.has_unsolved_reference,
        }

        return await render_triplet_extraction_prompt(
            statement=input_data.statement_text,
            chunk_content=chunk_content,
            json_schema=self.json_schema,
            predicate_instructions=self.predicate_instructions,
            language=self.language,
            ontology_types=self.ontology_types,
            speaker=input_data.speaker,
            input_json=input_json,
            has_unsolved_reference=input_data.has_unsolved_reference,
        )

    async def call_llm(self, prompt: Any) -> Any:
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an expert at extracting knowledge triplets and entities "
                    "from text. Follow the provided instructions carefully and return valid JSON."
                ),
            },
            {"role": "user", "content": prompt},
        ]
        return await self.llm_client.response_structured(
            messages, TripletExtractionResponse
        )

    async def parse_response(
        self, raw_response: Any, input_data: TripletStepInput
    ) -> TripletStepOutput:
        if not hasattr(raw_response, "triplets"):
            return self.get_default_output()

        # Keep raw triplets from LLM output (no predicate whitelist filtering).
        parsed_triplets = [
            TripletItem(
                subject_name=t.subject_name,
                subject_id=t.subject_id,
                predicate=t.predicate,
                predicate_description=getattr(t, "predicate_description", ""),
                object_name=t.object_name,
                object_id=t.object_id,
            )
            for t in raw_response.triplets
        ]

        entities = [
            EntityItem(
                entity_idx=e.entity_idx,
                name=e.name,
                type=e.type,
                type_description=getattr(e, "type_description", ""),
                description=e.description,
                is_explicit_memory=getattr(e, "is_explicit_memory", False),
            )
            for e in (raw_response.entities or [])
        ]

        return TripletStepOutput(entities=entities, triplets=parsed_triplets)

    def get_default_output(self) -> TripletStepOutput:
        return TripletStepOutput(entities=[], triplets=[])
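
# Illustrative sketch (hypothetical glue): chaining the two critical steps — each
# StatementStepOutput maps field-for-field onto a TripletStepInput, with dialog_at
# carried through as the schema comments describe.
def to_triplet_input(s: StatementStepOutput, ctx: SupportingContext) -> TripletStepInput:
    return TripletStepInput(
        statement_id=s.statement_id,
        statement_text=s.statement_text,
        statement_type=s.statement_type,
        temporal_type=s.temporal_type,
        supporting_context=ctx,
        speaker=s.speaker,
        dialog_at=s.dialog_at,
        valid_at=s.valid_at,
        invalid_at=s.invalid_at,
        has_unsolved_reference=s.has_unsolved_reference,
    )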
@@ -483,7 +483,7 @@ class ReflectionEngine:
        result_data['memory_verifies'] = memory_verifies
        result_data['quality_assessments'] = quality_assessments
        conflicts_found = 0  # Initialize as integer 0 instead of empty string
-       REMOVE_KEYS = {"created_at", "expired_at","relationship","predicate","statement_id","id","statement_id","relationship_statement_id"}
+       REMOVE_KEYS = {"created_at","relationship","predicate","statement_id","id","statement_id","relationship_statement_id"}
        # Clean conflict_data, and memory_verify and quality_assessment
        cleaned_conflict_data = []
        for item in conflict_data:

@@ -26,7 +26,6 @@ async def _load_(data: List[Any]) -> List[Dict]:
        "end_user_id",
        "chunk_id",
        "created_at",
-       "expired_at",
        "valid_at",
        "invalid_at",
    ]
@@ -93,7 +92,6 @@ async def get_data(result):
            rel_filtered['run_id'] = value.get('run_id')
            rel_filtered['statement'] = value.get('statement')
            rel_filtered['statement_id'] = value.get('statement_id')
-           rel_filtered['expired_at'] = value.get('expired_at')
            rel_filtered['created_at'] = value.get('created_at')
            filtered_item[key] = value
        elif key == 'entity2' and value is not None:

120  api/app/core/memory/utils/debug/pipeline_snapshot.py  Normal file
@@ -0,0 +1,120 @@
"""Pipeline stage snapshot — dump each extraction stage's output to JSON for comparison.

Usage:
    snapshot = PipelineSnapshot("legacy")  # or "new"
    snapshot.save_stage("1_statements", data)
    snapshot.save_stage("2_triplets", data)
    ...

Output structure:
    logs/memory-output/snapshots/
        legacy_20260422_123456/
            1_statements.json
            2_triplets.json
            3_nodes_edges.json
            4_dedup.json
        new_20260422_123500/
            1_statements.json
            2_triplets.json
            3_nodes_edges.json
            4_dedup.json

Controlled by env var PIPELINE_SNAPSHOT_ENABLED (default: false).
"""

from __future__ import annotations

import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

_ENABLED: Optional[bool] = None


def _is_enabled() -> bool:
    global _ENABLED
    if _ENABLED is None:
        _ENABLED = os.getenv("PIPELINE_SNAPSHOT_ENABLED", "false").lower() == "true"
    return _ENABLED


def _safe_serialize(obj: Any) -> Any:
    """Convert objects to JSON-serializable form."""
    if obj is None:
        return None
    if isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, (list, tuple)):
        return [_safe_serialize(item) for item in obj]
    if isinstance(obj, dict):
        return {str(k): _safe_serialize(v) for k, v in obj.items()}
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    if hasattr(obj, "__dataclass_fields__"):
        from dataclasses import asdict
        return asdict(obj)
    if hasattr(obj, "__dict__"):
        return {k: _safe_serialize(v) for k, v in obj.__dict__.items()
                if not k.startswith("_")}
    return str(obj)


class PipelineSnapshot:
    """Dump each pipeline stage's output to a timestamped directory."""

    def __init__(self, pipeline_name: str):
        """
        Args:
            pipeline_name: "legacy" or "new", used as directory prefix.
        """
        self.enabled = _is_enabled()
        self.pipeline_name = pipeline_name
        self._dir: Optional[Path] = None

        if self.enabled:
            from app.core.config import settings
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            self._dir = Path(settings.MEMORY_OUTPUT_DIR) / "snapshots" / f"{pipeline_name}_{ts}"
            self._dir.mkdir(parents=True, exist_ok=True)
            logger.debug(f"[Snapshot] enabled, output directory: {self._dir}")

    @property
    def directory(self) -> Optional[str]:
        """Absolute path (str) of this snapshot's output directory, or None when disabled."""
        return str(self._dir) if self._dir is not None else None

    def save_stage(self, stage_name: str, data: Any) -> None:
        """Save a stage's output as JSON.

        Args:
            stage_name: e.g. "1_statements", "2_triplets"
            data: Any serializable data (Pydantic models, dicts, lists, dataclasses)
        """
        if not self.enabled or self._dir is None:
            return

        try:
            path = self._dir / f"{stage_name}.json"
            serialized = _safe_serialize(data)
            with open(path, "w", encoding="utf-8") as f:
                json.dump(serialized, f, ensure_ascii=False, indent=2, default=str)
            logger.debug(f"[Snapshot] {stage_name} → {path}")
        except Exception as e:
            logger.warning(f"[Snapshot] failed to save {stage_name}: {e}")

    def save_summary(self, stats: Dict[str, Any]) -> None:
        """Save a summary with pipeline metadata and stats."""
        if not self.enabled or self._dir is None:
            return

        summary = {
            "pipeline": self.pipeline_name,
            "timestamp": datetime.now().isoformat(),
            "stats": stats,
        }
        self.save_stage("0_summary", summary)
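A quick way to exercise the new module — a minimal sketch with illustrative data. Note that PIPELINE_SNAPSHOT_ENABLED has to be set before the first PipelineSnapshot is created, because the module caches the flag in _ENABLED on first use:

import os

# Set the flag before instantiation; _is_enabled() caches it in _ENABLED.
os.environ["PIPELINE_SNAPSHOT_ENABLED"] = "true"

from app.core.memory.utils.debug.pipeline_snapshot import PipelineSnapshot

snapshot = PipelineSnapshot("new")
snapshot.save_stage("1_statements", [{"statement": "用户喜欢打羽毛球", "statement_type": "FACT"}])
snapshot.save_summary({"statements": 1})
print(snapshot.directory)  # e.g. <MEMORY_OUTPUT_DIR>/snapshots/new_20260422_123500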
217  api/app/core/memory/utils/debug/write_snapshot_recorder.py  Normal file
@@ -0,0 +1,217 @@
"""WriteSnapshotRecorder — snapshot recorder for the write pipeline.

Centralizes all of WritePipeline's snapshot serialization logic in this module,
so the pipeline only does orchestration and stays unaware of debug output formats.

Pipeline-side usage:
    recorder = WriteSnapshotRecorder()
    recorder.record_stage_outputs(orchestrator.last_stage_outputs)
    recorder.record_graph_before_dedup(graph)
    recorder.record_dedup_result(dedup_result)
    recorder.record_summary(extraction_result.stats)
"""

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional

from app.core.memory.utils.debug.pipeline_snapshot import PipelineSnapshot

logger = logging.getLogger(__name__)


class WriteSnapshotRecorder:
    """Snapshot recorder for each stage of the write pipeline.

    Holds a PipelineSnapshot instance internally and exposes semantic methods,
    one per observable pipeline stage.

    When PIPELINE_SNAPSHOT_ENABLED=false, every method is a no-op.
    """

    def __init__(self, pipeline_name: str = "new"):
        self._snapshot = PipelineSnapshot(pipeline_name)

    # ── Properties ──

    @property
    def enabled(self) -> bool:
        return self._snapshot.enabled

    @property
    def snapshot_dir(self) -> Optional[str]:
        """Absolute path of the snapshot output directory; None when disabled."""
        return self._snapshot.directory

    @property
    def snapshot(self) -> PipelineSnapshot:
        """Expose the underlying PipelineSnapshot for callers that need to pass it on (e.g. SemanticPruner)."""
        return self._snapshot

    # ── Stages 2-5: per-step outputs of the extraction phase ──

    def record_stage_outputs(self, stage_outputs: Optional[Dict[str, Any]]) -> None:
        """Record the output of each NewExtractionOrchestrator step.

        Corresponds to the stage_outputs serialization logic formerly in
        write_pipeline._extract(), covering the statement / triplet / emotion /
        embedding stages.
        """
        if not stage_outputs:
            return

        self._record_statements(stage_outputs.get("statement_results", {}))
        self._record_triplets(stage_outputs.get("triplet_results", {}))
        self._record_emotions(stage_outputs.get("emotion_results", {}))
        self._record_embeddings(stage_outputs.get("embedding_output"))

    # ── Stage 6: graph building (before dedup) ──

    def record_graph_before_dedup(self, graph: Any) -> None:
        """Record the output of build_graph_nodes_and_edges (before dedup)."""
        self._snapshot.save_stage(
            "6_nodes_edges_before_dedup",
            {
                "dialogue_nodes_count": len(graph.dialogue_nodes),
                "chunk_nodes_count": len(graph.chunk_nodes),
                "statement_nodes_count": len(graph.statement_nodes),
                "entity_nodes": [
                    {
                        "id": e.id,
                        "name": e.name,
                        "entity_type": e.entity_type,
                        "description": e.description,
                    }
                    for e in graph.entity_nodes
                ],
                "entity_entity_edges": [
                    {
                        "source": e.source,
                        "target": e.target,
                        "relation_type": e.relation_type,
                        "statement": e.statement,
                    }
                    for e in graph.entity_entity_edges
                ],
                "stmt_entity_edges_count": len(graph.stmt_entity_edges),
            },
        )

    # ── Stage 7: after dedup ──

    def record_dedup_result(self, dedup_result: Any) -> None:
        """Record entities and relations after two-phase dedup and disambiguation."""
        self._snapshot.save_stage(
            "7_after_dedup",
            {
                "entity_nodes": [
                    {
                        "id": e.id,
                        "name": e.name,
                        "entity_type": e.entity_type,
                        "description": e.description,
                    }
                    for e in dedup_result.entity_nodes
                ],
                "entity_entity_edges": [
                    {
                        "source": e.source,
                        "target": e.target,
                        "relation_type": e.relation_type,
                        "statement": e.statement,
                    }
                    for e in dedup_result.entity_entity_edges
                ],
            },
        )

    # ── Stage 8: after alias merge (async, written by the Celery PostStore task) ──

    @staticmethod
    def save_alias_merge_result(snapshot_dir: str, entity_rows: List[Dict]) -> None:
        """Write the Neo4j entity state after alias merge + node deletion to 8_after_alias_merge.json.

        Called by the Celery post_store_dedup_and_alias_merge task once merging
        and deletion complete; writes directly into the existing snapshot
        directory, so no WriteSnapshotRecorder instance needs to be rebuilt.

        Args:
            snapshot_dir: Absolute path of the snapshot directory created by the main pipeline.
            entity_rows: Entity property rows queried from Neo4j; each contains
                the id / name / entity_type / description / aliases fields.
        """
        import json
        from pathlib import Path

        try:
            path = Path(snapshot_dir) / "8_after_alias_merge.json"
            data = {
                "entity_nodes": [
                    {
                        "id": row.get("id"),
                        "name": row.get("name"),
                        "entity_type": row.get("entity_type"),
                        "description": row.get("description"),
                        "aliases": row.get("aliases", []),
                    }
                    for row in entity_rows
                ],
                "entity_count": len(entity_rows),
            }
            with open(path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2, default=str)
            logger.debug(f"[Snapshot] 8_after_alias_merge → {path}")
        except Exception as e:
            logger.warning(f"[Snapshot] failed to save 8_after_alias_merge: {e}")

    # ── Stage 0: summary ──

    def record_summary(self, stats: Dict[str, int]) -> None:
        """Record the pipeline's final stats summary."""
        self._snapshot.save_summary(stats)

    # ── Internal helpers ──

    def _record_statements(self, stmt_results: Dict) -> None:
        snapshot_data: List[Dict] = []
        for _did, chunk_stmts in stmt_results.items():
            for _cid, stmts in chunk_stmts.items():
                for s in stmts:
                    snapshot_data.append(s.model_dump())
        self._snapshot.save_stage("2_statement_outputs", snapshot_data)

    def _record_triplets(self, triplet_results: Dict) -> None:
        snapshot_data: Dict[str, Any] = {}
        for _did, stmt_triplets in triplet_results.items():
            for stmt_id, t_out in stmt_triplets.items():
                snapshot_data[stmt_id] = t_out.model_dump()
        self._snapshot.save_stage("3_triplet_outputs", snapshot_data)

    def _record_emotions(self, emotion_results: Dict) -> None:
        snapshot_data: Dict[str, Any] = {}
        for stmt_id, emo in emotion_results.items():
            if hasattr(emo, "model_dump"):
                snapshot_data[stmt_id] = emo.model_dump()
        self._snapshot.save_stage("4_emotion_outputs", snapshot_data)

    def _record_embeddings(self, emb_output: Any) -> None:
        if not emb_output or not hasattr(emb_output, "model_dump"):
            return

        emb_data = emb_output.model_dump()

        # Truncate vectors; keep only the first 5 dimensions for debugging
        for key in ("statement_embeddings", "chunk_embeddings", "entity_embeddings"):
            if key in emb_data and isinstance(emb_data[key], dict):
                emb_data[key] = {
                    k: v[:5] if isinstance(v, list) else v
                    for k, v in emb_data[key].items()
                }
        if "dialog_embeddings" in emb_data and isinstance(
            emb_data["dialog_embeddings"], list
        ):
            emb_data["dialog_embeddings"] = [
                v[:5] if isinstance(v, list) else v
                for v in emb_data["dialog_embeddings"]
            ]

        self._snapshot.save_stage("5_embedding_outputs", emb_data)
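How the recorder slots into a pipeline, sketched with stand-in dataclasses — the real graph/dedup objects come from the orchestrator and are not shown in this diff; the field names below follow the serialization code above:

from dataclasses import dataclass, field
from typing import List

from app.core.memory.utils.debug.write_snapshot_recorder import WriteSnapshotRecorder

@dataclass
class _Entity:  # stand-in for the pipeline's entity node type
    id: str
    name: str
    entity_type: str
    description: str

@dataclass
class _DedupResult:  # stand-in for the two-phase dedup output
    entity_nodes: List[_Entity] = field(default_factory=list)
    entity_entity_edges: List = field(default_factory=list)

recorder = WriteSnapshotRecorder(pipeline_name="new")
dedup_result = _DedupResult(entity_nodes=[_Entity("e1", "张三", "人物", "")])
recorder.record_dedup_result(dedup_result)                        # Stage 7
recorder.record_summary({"entities": len(dedup_result.entity_nodes)})

# Later, inside the Celery post_store_dedup_and_alias_merge task,
# write into the same directory without rebuilding the recorder:
if recorder.snapshot_dir:
    WriteSnapshotRecorder.save_alias_merge_result(
        recorder.snapshot_dir,
        entity_rows=[{"id": "e1", "name": "张三", "entity_type": "人物",
                      "description": "", "aliases": ["小张"]}],
    )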
@@ -142,7 +142,7 @@ class ConfigAuditLogger:
        result = "SUCCESS" if success else "FAILED"
        msg = (
            f"{operation.upper()} config_id={config_id} "
-           f"group={end_user_id} result={result}"
+           f"end_user_id={end_user_id} result={result}"
        )
        if duration is not None:
            msg += f" duration={duration:.2f}s"

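For reference, after this rename a successful update produces an audit line shaped like the following (values illustrative):

UPDATE config_id=cfg_123 end_user_id=user_42 result=SUCCESS duration=0.35s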
184  api/app/core/memory/utils/log/bear_logger.py  Normal file
@@ -0,0 +1,184 @@
"""
BearLogger — structured task logging utility.

Provides eye-catching pipeline step progress markers amid the bulk of
intermediate module logs. Built on the standard logging.Logger; does not
modify the existing logging configuration.

Design notes:
- Each step emits a single completion line (no "started" line, to reduce noise)
- Pipeline start/end use heavy ═══ separators, instantly visible in a terminal
- Step completion lines use a ▶ icon with fixed-width alignment: compact and tidy
- Performance warnings are marked with ⚡ and trigger automatically past a threshold
"""
from __future__ import annotations

import logging
import time
import uuid
from contextlib import asynccontextmanager
from contextvars import ContextVar
from typing import Any, Dict, Optional


# ── Context variable (thread/coroutine safe) ──
_trace_id: ContextVar[str] = ContextVar("bear_trace_id", default="")


# ── Default performance thresholds (seconds) ──
DEFAULT_PERF_THRESHOLDS: Dict[str, float] = {
    "预处理": 10,
    "萃取": 60,
    "存储": 30,
    "聚类": 5,
    "摘要": 30,
}


class _StepScope:
    """Step scope; holds per-step state and metadata."""

    def __init__(
        self,
        logger: logging.Logger,
        index: int,
        total: int,
        category: str,
        description: str,
        threshold: Optional[float] = None,
    ):
        self._logger = logger
        self._index = index
        self._total = total
        self._category = category
        self._description = description
        self._threshold = threshold
        self._start_time = 0.0
        self._kv: Dict[str, Any] = {}

    def metadata(self, **kv: Any) -> None:
        """Attach metadata; shown at the end of the completion log line."""
        self._kv.update(kv)

    def _start(self) -> None:
        self._start_time = time.time()

    def _succeed(self) -> None:
        elapsed = time.time() - self._start_time

        # Performance warning
        if self._threshold and elapsed > self._threshold:
            status = f"⚡ {elapsed:.2f}s [SLOW]"
        else:
            status = f"✔ {elapsed:.2f}s"

        # Metadata
        kv_str = ""
        if self._kv:
            kv_str = " " + ", ".join(f"{k}={v}" for k, v in self._kv.items())

        self._logger.info(
            f" ▶ [{self._index}/{self._total}] "
            f"{self._category}:{self._description} "
            f"── {status}{kv_str}"
        )

    def _fail(self, error: Exception) -> None:
        elapsed = time.time() - self._start_time
        self._logger.error(
            f" ✘ [{self._index}/{self._total}] "
            f"{self._category}:{self._description} "
            f"── FAILED {elapsed:.2f}s error={error}"
        )


class BearLogger:
    """Structured task logging utility.

    Usage::

        bear = BearLogger("memory.pipeline")

        async with bear.pipeline("WritePipeline", mode="正式"):
            async with bear.step(1, 5, "预处理", "消息分块") as s:
                result = await preprocess()
                s.metadata(chunks=3)
    """

    def __init__(
        self,
        name: str = "memory.pipeline",
        perf_thresholds: Optional[Dict[str, float]] = None,
    ):
        self._logger = logging.getLogger(name)
        self._thresholds = perf_thresholds or DEFAULT_PERF_THRESHOLDS

    @asynccontextmanager
    async def pipeline(self, name: str, **context_kv: Any):
        """Pipeline-level scope. Start and end use prominent separator lines."""
        trace_id = uuid.uuid4().hex[:8]
        token = _trace_id.set(trace_id)
        start = time.time()

        ctx_parts = [f"{k}={v}" for k, v in context_kv.items()]
        ctx_str = ", ".join(ctx_parts)

        self._logger.info(
            f"{'═' * 60}\n"
            f" 🚀 {name} started {ctx_str}\n"
            f"{'─' * 60}"
        )

        error = None
        try:
            yield self
        except Exception as e:
            error = e
            raise
        finally:
            elapsed = time.time() - start
            if error:
                self._logger.error(
                    f"{'─' * 60}\n"
                    f" ✘ {name} failed ({elapsed:.2f}s) error={error}\n"
                    f"{'═' * 60}"
                )
            else:
                self._logger.info(
                    f"{'─' * 60}\n"
                    f" ✔ {name} completed ({elapsed:.2f}s)\n"
                    f"{'═' * 60}"
                )
            _trace_id.reset(token)

    @asynccontextmanager
    async def step(
        self,
        index: int,
        total: int,
        category: str,
        description: str,
    ):
        """Step-level scope. Logs a single line only on completion (to reduce noise)."""
        scope = _StepScope(
            logger=self._logger,
            index=index,
            total=total,
            category=category,
            description=description,
            threshold=self._thresholds.get(category),
        )
        scope._start()
        try:
            yield scope
        except Exception as e:
            scope._fail(e)
            raise
        else:
            scope._succeed()

    def info(self, message: str, **kv: Any) -> None:
        """Indented info log."""
        suffix = ""
        if kv:
            suffix = " " + ", ".join(f"{k}={v}" for k, v in kv.items())
        self._logger.info(f" │ {message}{suffix}")
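A runnable sketch of the logger in use — note that the step category must match a DEFAULT_PERF_THRESHOLDS key ("预处理" = preprocessing, "存储" = storage) for the slow-step warning to apply; the sleeps stand in for real work:

import asyncio
import logging

from app.core.memory.utils.log.bear_logger import BearLogger

logging.basicConfig(level=logging.INFO)

async def main() -> None:
    bear = BearLogger("memory.pipeline")
    async with bear.pipeline("WritePipeline", mode="demo"):
        async with bear.step(1, 2, "预处理", "消息分块") as s:
            await asyncio.sleep(0.1)   # stand-in for real work
            s.metadata(chunks=3)
        async with bear.step(2, 2, "存储", "写入Neo4j") as s:
            await asyncio.sleep(0.1)

asyncio.run(main())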
@@ -1,3 +1,4 @@
import json
import os
from jinja2 import Environment, FileSystemLoader
from app.core.memory.models.ontology_extraction_models import OntologyTypeList
@@ -46,6 +47,7 @@ async def render_statement_extraction_prompt(
    dialogue_content: str | None = None,
    max_dialogue_chars: int | None = None,
    language: str = "zh",
+   input_json: dict | None = None,
) -> str:
    """
    Renders the statement extraction prompt using the extract_statement.jinja2 template.
@@ -63,7 +65,7 @@ async def render_statement_extraction_prompt(
    Returns:
        Rendered prompt content as string
    """
-   template = prompt_env.get_template("extract_statement.jinja2")
+   template = prompt_env.get_template("extract_statement_temporal.jinja2")
    # Optional clipping of dialogue context
    ctx = None
    if include_dialogue_context and dialogue_content:
@@ -77,6 +79,7 @@ async def render_statement_extraction_prompt(

    rendered_prompt = template.render(
        inputs={"chunk": chunk_content},
+       input_json=json.dumps(input_json, ensure_ascii=False) if input_json else "{}",
        definitions=definitions,
        json_schema=json_schema,
        granularity=granularity,
@@ -87,7 +90,7 @@ async def render_statement_extraction_prompt(
    # Log the rendered result to the prompt log (same structure as the sample log)
    log_prompt_rendering('statement extraction', rendered_prompt)
    # Optional: log template rendering info
-   log_template_rendering('extract_statement.jinja2', {
+   log_template_rendering('extract_statement_temporal.jinja2', {
        'inputs': 'chunk',
        'definitions': 'LABEL_DEFINITIONS',
        'json_schema': 'StatementExtractionResponse.schema',
@@ -97,7 +100,7 @@ async def render_statement_extraction_prompt(
    })

    return rendered_prompt

# TODO: the temporal and statement prompts have been merged; the code below is no longer needed
async def render_temporal_extraction_prompt(
    ref_dates: dict,
    statement: dict,
@@ -198,6 +201,7 @@ def render_entity_dedup_prompt(

# Args:
#     entity_a: Dict of entity A attributes

async def render_triplet_extraction_prompt(
    statement: str,
    chunk_content: str,
@@ -206,6 +210,8 @@ async def render_triplet_extraction_prompt(
    language: str = "zh",
    ontology_types: "OntologyTypeList | None" = None,
    speaker: str = None,
+   input_json: dict = None,
+   has_unsolved_reference: bool = False,
) -> str:
    """
    Renders the triplet extraction prompt using the extract_triplet.jinja2 template.
@@ -218,10 +224,14 @@ async def render_triplet_extraction_prompt(
        language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)
        ontology_types: Optional OntologyTypeList containing predefined ontology types for entity classification
        speaker: Speaker role ("user" or "assistant") for the current statement
+       input_json: Full input JSON for the template
+       has_unsolved_reference: Whether the statement has unresolved references

    Returns:
        Rendered prompt content as string
    """
+   import json

    template = prompt_env.get_template("extract_triplet.jinja2")

    # Prepare ontology type data
@@ -233,8 +243,13 @@ async def render_triplet_extraction_prompt(
    ontology_type_names = ontology_types.get_type_names()
    type_hierarchy_hints = ontology_types.get_type_hierarchy_hints()

+   # Prepare input_json if it was not provided
+   if input_json is None:
+       input_json = {}

    rendered_prompt = template.render(
        statement=statement,
+       statement_text=statement,  # compatible with the statement_text variable in the template
        chunk_content=chunk_content,
        json_schema=json_schema,
        predicate_instructions=predicate_instructions,
@@ -243,6 +258,8 @@ async def render_triplet_extraction_prompt(
        ontology_type_names=ontology_type_names,
        type_hierarchy_hints=type_hierarchy_hints,
        speaker=speaker,
+       input_json=json.dumps(input_json, ensure_ascii=False) if input_json else "{}",
+       has_unsolved_reference=has_unsolved_reference,
    )
    # Log the rendered result to the prompt log (same structure as the sample log)
    log_prompt_rendering('triplet extraction', rendered_prompt)

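A sketch of calling the updated renderer with the two new parameters. Only arguments visible in this diff are shown; the function has further parameters omitted here, and the import path is an assumption based on the file's location:

import asyncio

# Hypothetical module path for the renderer shown in the diff above.
from app.core.memory.utils.prompt.prompt_renderer import render_triplet_extraction_prompt

prompt = asyncio.run(render_triplet_extraction_prompt(
    statement="用户喜欢打羽毛球",
    chunk_content="用户: 我好喜欢打羽毛球呀",
    language="zh",
    speaker="user",
    input_json={"run_id": "demo-1"},   # new in this change
    has_unsolved_reference=False,      # new in this change
))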
@@ -12,24 +12,27 @@ Mode: {{ 'Disambiguation Mode' if disambiguation_mode else 'Deduplication Mode'
===Input===
{% if language == "zh" %}
实体A:

- 名称: "{{ entity_a.name | default('') }}"
- 类型: "{{ entity_a.entity_type | default('') }}"
- 描述: "{{ entity_a.description | default('') }}"
- 别名: {{ entity_a.aliases | default([]) }}
{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #}
{# - 摘要: "{{ entity_a.fact_summary | default('') }}" #}
- 连接强弱: "{{ entity_a.connect_strength | default('') }}"

实体B:

- 名称: "{{ entity_b.name | default('') }}"
- 类型: "{{ entity_b.entity_type | default('') }}"
- 描述: "{{ entity_b.description | default('') }}"
- 别名: {{ entity_b.aliases | default([]) }}
{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #}
{# - 摘要: "{{ entity_b.fact_summary | default('') }}" #}
- 连接强弱: "{{ entity_b.connect_strength | default('') }}"

上下文:

- 同组: {{ same_group | default(false) }}
- 类型一致或未知类型: {{ type_ok | default(false) }}
- 类型相似度(0-1): {{ type_similarity | default(0.0) }}
@@ -38,29 +41,31 @@ Mode: {{ 'Disambiguation Mode' if disambiguation_mode else 'Deduplication Mode'
- 名称包含关系: {{ name_contains | default(false) }}
- 上下文同源(同一语句指向两者): {{ co_occurrence | default(false) }}
- 两者相关的关系陈述(来自实体-实体边):
{% for s in relation_statements %}
- {{ s }}
{% endfor %}
{% else %}
Entity A:
- Name: "{{ entity_a.name | default('') }}"
- Type: "{{ entity_a.entity_type | default('') }}"
- Description: "{{ entity_a.description | default('') }}"
- Aliases: {{ entity_a.aliases | default([]) }}
{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #}
{# - Summary: "{{ entity_a.fact_summary | default('') }}" #}
- Connection Strength: "{{ entity_a.connect_strength | default('') }}"

Entity B:

- Name: "{{ entity_b.name | default('') }}"
- Type: "{{ entity_b.entity_type | default('') }}"
- Description: "{{ entity_b.description | default('') }}"
- Aliases: {{ entity_b.aliases | default([]) }}
{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #}
{# - Summary: "{{ entity_b.fact_summary | default('') }}" #}
- Connection Strength: "{{ entity_b.connect_strength | default('') }}"

Context:

- Same Group: {{ same_group | default(false) }}
- Type Consistent or Unknown: {{ type_ok | default(false) }}
- Type Similarity (0-1): {{ type_similarity | default(0.0) }}
@@ -69,14 +74,15 @@ Context:
- Name Contains Relationship: {{ name_contains | default(false) }}
- Context Co-occurrence (same statement refers to both): {{ co_occurrence | default(false) }}
- Related Relationship Statements (from entity-entity edges):
{% for s in relation_statements %}
- {{ s }}
{% endfor %}
{% endif %}

===Guidelines===
{% if language == "zh" %}
{% if disambiguation_mode %}

- 这是"同名但类型不同"的消歧场景。请判断两者是否指向同一真实世界实体。
- 综合名称文本/向量相似度、别名、描述、摘要与上下文关系(同源与关系陈述)进行判断。
- **别名处理(高优先级)**:
@@ -93,7 +99,7 @@ Context:
* 建议类型必须与上下文和实体描述一致
- 规范实体优先级:连接强度(strong/both)更高者;其余相同则保留描述/摘要更丰富者;再相同时保留实体A(canonical_idx=0)。
- **注意**:别名(aliases)已在三元组提取阶段获取,合并时会自动整合,无需在此阶段提取。
{% else %}
- 若实体类型相同或任一为UNKNOWN/空,可放行作为候选;若类型明显冲突(如人 vs 物品),除非别名与描述高度一致,否则判定不同实体。
- **别名匹配优先(最高优先级)**:
* 如果实体A的名称与实体B的某个别名完全匹配,应视为高置信度匹配
@@ -107,9 +113,9 @@ Context:
- 若需要合并,选择"保留的规范实体"(canonical_idx)为更合适的一个:
- 优先保留连接强度更强(strong/both)者;其余相同则保留描述/摘要更丰富者;再相同时保留实体A(canonical_idx=0)。
- **注意**:别名(aliases)已在三元组提取阶段获取,合并时会自动整合,无需在此阶段提取。
{% endif %}
{% else %}
{% if disambiguation_mode %}
- This is a disambiguation scenario for "same name but different types". Please determine whether they refer to the same real-world entity.
- Make judgments based on name text/vector similarity, aliases, descriptions, summaries, and contextual relationships (co-occurrence and relationship statements).
- **Alias Handling (High Priority)**:
@@ -126,7 +132,7 @@ Context:
* Suggested type must be consistent with context and entity description
- Canonical entity priority: higher connection strength (strong/both); if equal, retain the one with richer description/summary; if still equal, retain Entity A (canonical_idx=0).
- **Note**: Aliases are already obtained during triplet extraction and will be automatically integrated during merging; no need to extract at this stage.
{% else %}
- If entity types are the same or either is UNKNOWN/empty, can proceed as candidates; if types clearly conflict (e.g., person vs. item), unless aliases and descriptions are highly consistent, determine as different entities.
- **Alias Matching Priority (Highest Priority)**:
* If Entity A's name exactly matches any of Entity B's aliases, it should be considered a high-confidence match
@@ -140,8 +146,8 @@ Context:
- If merging is needed, select the "canonical entity to retain" (canonical_idx) as the more appropriate one:
- Prioritize retaining the one with stronger connection strength (strong/both); if equal, retain the one with richer description/summary; if still equal, retain Entity A (canonical_idx=0).
- **Note**: Aliases are already obtained during triplet extraction and will be automatically integrated during merging; no need to extract at this stage.
{% endif %}
{% endif %}

**Output format**
{% if language == "zh" %}
@@ -157,64 +163,69 @@ Context:
}

**字段说明**:

- should_merge: 是否应该合并这两个实体(true/false)
- canonical_idx: 规范实体的索引,0表示实体A,1表示实体B
- confidence: 决策的置信度,范围0.0-1.0
- block_pair: 是否阻断该对在其他模糊/启发式合并中出现(true/false)
- suggested_type: 建议的统一类型(字符串或null)
- reason: 决策理由的简短说明
{% else %}
返回JSON格式,必须包含以下字段:
{
  "same_entity": boolean,
  "canonical_idx": 0 or 1,
  "confidence": float (0.0-1.0),
  "reason": "string"
}

**字段说明**:

- same_entity: 两个实体是否指向同一真实世界实体(true/false)
- canonical_idx: 规范实体的索引,0表示实体A,1表示实体B
- confidence: 决策的置信度,范围0.0-1.0
- reason: 决策理由的简短说明
{% endif %}
{% else %}
{% if disambiguation_mode %}
Return JSON format with the following required fields:
{
  "should_merge": boolean,
  "canonical_idx": 0 or 1,
  "confidence": float (0.0-1.0),
  "block_pair": boolean,
  "suggested_type": "string or null",
  "reason": "string"
}

**Field Descriptions**:

- should_merge: Whether these two entities should be merged (true/false)
- canonical_idx: Index of the canonical entity, 0 for Entity A, 1 for Entity B
- confidence: Confidence level of the decision, range 0.0-1.0
- block_pair: Whether to block this pair in other fuzzy/heuristic merges (true/false)
- suggested_type: Suggested unified type (string or null)
- reason: Brief explanation of the decision
{% else %}
Return JSON format with the following required fields:
{
  "same_entity": boolean,
  "canonical_idx": 0 or 1,
  "confidence": float (0.0-1.0),
  "reason": "string"
}

**Field Descriptions**:

- same_entity: Whether the two entities refer to the same real-world entity (true/false)
- canonical_idx: Index of the canonical entity, 0 for Entity A, 1 for Entity B
- confidence: Confidence level of the decision, range 0.0-1.0
- reason: Brief explanation of the decision
{% endif %}
{% endif %}

**CRITICAL JSON FORMATTING REQUIREMENTS:**

1. Use only standard ASCII double quotes (") for JSON structure - never use Chinese quotation marks ("") or other Unicode quotes
2. Ensure all JSON strings are properly closed and comma-separated
3. Do not include line breaks within JSON string values
@@ -225,4 +236,4 @@ Return JSON format with the following required fields:
{% else %}
The output language should always be the same as the input language.
{% endif %}
{{ json_schema }}

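Since the template demands strict JSON, a downstream parser might validate the two response shapes like this — a sketch under our own model names (the repo's actual schema classes are not shown in this diff); the repo already uses Pydantic v2 (model_dump), so model_validate_json is available:

from typing import Optional
from pydantic import BaseModel

class DisambiguationDecision(BaseModel):
    """Disambiguation Mode response, per the field descriptions above."""
    should_merge: bool
    canonical_idx: int           # 0 = Entity A, 1 = Entity B
    confidence: float            # 0.0-1.0
    block_pair: bool
    suggested_type: Optional[str] = None
    reason: str

class DedupDecision(BaseModel):
    """Deduplication Mode response."""
    same_entity: bool
    canonical_idx: int
    confidence: float
    reason: str

raw = '{"same_entity": true, "canonical_idx": 0, "confidence": 0.92, "reason": "alias match"}'
decision = DedupDecision.model_validate_json(raw)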
@@ -1,199 +0,0 @@
{#
Dialogue-level extraction and relevance template (used to speed up pruning)
Inputs: pruning_scene, ontology_class_infos, dialog_text, language
- ontology_class_infos: List[{class_name: str, class_description: str}]
Output: strict JSON (no extra text), with fields:
- is_related: bool, whether the dialogue is relevant to the selected scene
- times: [string], time-related text extracted from the dialogue (dates, times, durations, validity periods, etc.)
- ids: [string], numbers/IDs/order numbers/application numbers/accounts, etc.
- amounts: [string], money/fee/price-related text (with units or currency symbols)
- contacts: [string], contact info (phone/mobile/email/WeChat/QQ, etc.)
- addresses: [string], address/location-related text
- keywords: [string], other important keywords worth preserving (terms strongly related to the scene)
- preserve_keywords: [string], emotion/interest/hobby/personal-preference words or phrase fragments that must be preserved
- scene_unrelated_snippets: [string], message fragments unrelated to the scene (see marking rules below)

Requirements:
- Output only the JSON above with exactly these keys; no explanations, prefixes, or suffixes; no comments.
- times/ids/amounts/contacts/addresses/keywords/preserve_keywords contain only original-text fragments or simple normalized strings.
- Output only the keys above; avoid extra explanation or fields.
#}

{# ── Resolve the scene instruction ── #}
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
{% if language == 'en' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %}
{% else %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %}
{% endif %}
{% else %}
{% if language == 'en' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
{% endif %}
{% endif %}

{% if language == "zh" %}
你是一个对话内容分析助手。请对下方对话全文进行一次性分析,完成两项任务:
1. 判断对话是否与指定场景相关;
2. 从对话中抽取所有需要保留的重要信息片段。

场景说明:{{ instruction }}

{% if ontology_class_infos and ontology_class_infos | length > 0 %}
【本场景实体类型定义】
以下实体类型定义了本场景中哪些内容是重要的。
凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段:

{% for info in ontology_class_infos %}
- {{ info.class_name }}:{{ info.class_description }}
{% endfor %}

重要提示:只要对话中出现与上述任意实体类型相关的内容,即判定为相关(is_related=true)。
{% endif %}

---
【必须保留的内容(不可删除)】
以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段:
- 时间信息:日期、时间点、时间段、有效期 → times 字段
- 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段
- 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段)
- 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段
- 地址信息:地点、地址、位置 → addresses 字段
- 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段)
- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
- **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了")→ preserve_keywords 字段
- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段

【场景无关内容标记】
请从对话中识别出与当前场景({{ pruning_scene }})**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。
判断标准:
- 与场景实体类型完全无关
- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联)
- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等)
注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。

**重要:scene_unrelated_snippets 必须认真填写,不能为空数组。**
如果对话中存在与场景无关的内容,必须将其原文片段提取出来。

示例(场景=在线教育):
- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets
- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets
- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets

示例(场景=情感陪伴):
- "我最近心情很差,跟室友闹矛盾了" → 与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets
- "期末考试12月25日,3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets
- "我上次高数作业87分,这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets
- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets

【可以删除的内容】
以下类型的内容属于低价值信息,可以在剪枝时删除:
- 纯寒暄问候:如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语
- 纯表情/符号:如"[微笑]"、"😊"、"哈哈"等
- 重复确认:如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复
- 无意义填充:如"啊"、"呢"、"嘛"等语气词单独成句

**注意:即使消息很短,只要包含情绪、兴趣、爱好、个人观点等有价值信息,就必须保留,不得删除。**
例如:
- "我好开心呀" → 包含情绪(开心),必须保留,preserve_keywords 中加入"开心"
- "好喜欢打羽毛球呀" → 包含兴趣爱好(喜欢打羽毛球),必须保留,preserve_keywords 中加入"喜欢打羽毛球"
- "我好难过" → 包含情绪(难过),必须保留,preserve_keywords 中加入"难过"
- "太好啦!看到你开心,我也跟着心情亮起来" → 包含情绪,必须保留,preserve_keywords 中加入"开心"

---
对话全文:
"""
{{ dialog_text }}
"""

只输出严格 JSON(键固定、顺序不限):
{
  "is_related": <true 或 false>,
  "times": [<string>...],
  "ids": [<string>...],
  "amounts": [<string>...],
  "contacts": [<string>...],
  "addresses": [<string>...],
  "keywords": [<string>...],
  "preserve_keywords": [<string>...],
  "scene_unrelated_snippets": [<string>...]
}
{% else %}
You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
1. Determine whether the dialogue is relevant to the specified scene;
2. Extract all important information fragments that must be preserved.

Scenario Description: {{ instruction }}

{% if ontology_class_infos and ontology_class_infos | length > 0 %}
[Scene Entity Type Definitions]
The following entity types define what content is important in this scene.
Content related to ANY of these types must be preserved and extracted into the keywords field:

{% for info in ontology_class_infos %}
- {{ info.class_name }}: {{ info.class_description }}
{% endfor %}

Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true).
{% endif %}

---
[MUST PRESERVE (cannot be deleted)]
The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
- Time information: dates, time points, durations, expiry dates → times field
- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here)
- Contact information: phone numbers, emails, WeChat, QQ → contacts field
- Address information: locations, addresses, places → addresses field
- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here)
- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field
- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords

[Scene-Unrelated Content Marking]
Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field.
Criteria:
- Completely unrelated to the scene's entity types
- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated)
- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene)
Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully.

[CAN BE DELETED]
The following types of content are low-value and can be removed during pruning:
- Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content
- Pure emojis/symbols: e.g., "[smile]", "😊", "haha"
- Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information
- Meaningless fillers: standalone interjections like "ah", "well", "hmm"

**Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.**
Examples:
- "I'm so happy!" → contains emotion (happy), must preserve; add "happy" to preserve_keywords
- "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords
- "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords

---
Full Dialogue:
"""
{{ dialog_text }}
"""

Output strict JSON only (fixed keys, order doesn't matter):
{
  "is_related": <true or false>,
  "times": [<string>...],
  "ids": [<string>...],
  "amounts": [<string>...],
  "contacts": [<string>...],
  "addresses": [<string>...],
  "keywords": [<string>...],
  "preserve_keywords": [<string>...],
  "scene_unrelated_snippets": [<string>...]
}
{% endif %}
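The deleted template's strict-JSON contract can be captured in a small model for reference — a sketch; the model name is ours and the repo's actual parser may differ:

from typing import List
from pydantic import BaseModel, Field

class PruningRelevanceOutput(BaseModel):
    """Mirrors the strict JSON keys required by the removed dialogue-level template."""
    is_related: bool
    times: List[str] = Field(default_factory=list)
    ids: List[str] = Field(default_factory=list)
    amounts: List[str] = Field(default_factory=list)
    contacts: List[str] = Field(default_factory=list)
    addresses: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    preserve_keywords: List[str] = Field(default_factory=list)
    scene_unrelated_snippets: List[str] = Field(default_factory=list)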
354  api/app/core/memory/utils/prompt/prompts/extract_pruning.jinja2  Normal file
@@ -0,0 +1,354 @@
{% if language == "zh" %}
你是一个面向记忆存储的 Assistant 辅助信息压缩器。

任务:

- 输入是一个 JSON,对话放在 `msgs` 数组里。
- 你只处理 `Assistant.msg`。
- `User.msg` 只用于理解上下文,不允许出现在输出里,也不允许被复述成用户摘要。
- 你的输出必须包含两个字段:
  1. `assistant_memory_hint`
  2. `assistant_memory_type`

目标:

- 把较长的 `Assistant.msg` 压缩成一条更短、便于检索的辅助摘要。
- 保留建议、推荐、提醒、说明、提问、附和、重复等核心动作。
- 删除冗长解释、寒暄、礼貌套话和低价值铺垫,但不要漏掉真正有用的信息。

硬约束:

- 不得输出或复述 `User.msg`。
- 不得捏造新事实、新建议、新步骤、新材料或新限制。
- 不得改变 `Assistant` 原始语义和立场。
- 可以压缩、合并、重写 `Assistant.msg`,但必须忠于原内容。
- `assistant_memory_hint` 必须是简短的完整句,尽量包含清晰主谓宾,不要只写零散词组。
- 如果 `assistant_memory_hint` 里出现"室友""老师""朋友""同事""这件事"这类泛称,而上下文中存在清晰、稳定、唯一的指代对象,则优先改写成那个清晰指代对象。
- 只有在当前两条消息里无法稳定落到唯一对象时,才保留泛称或模糊表达。
- 如果对象本身已经足够清晰,例如"数据库作业""鸡胸肉沙拉""李教授",则不要为了"更具体"而做不必要的过度展开。
- `assistant_memory_type` 只能从以下枚举中选择:
  `comfort | suggestion | recommendation | warning | instruction | question | agreement | repetition | other`
- 如果 `Assistant.msg` 同时包含多个动作,`assistant_memory_hint` 可以保留多个动作,但 `assistant_memory_type` 只标记其中最主要、最值得检索的主动作。
- 不再输出 `NULL`。即使内容价值较低,也要尽量压成一条最短的辅助摘要。
- 如果 `Assistant.msg` 含有提问、追问或反问,`assistant_memory_hint` 必须保留提问的具体内容,不能只写"询问了用户"。
- 如果提问里给出了明确选项、候选分支或对比项,`assistant_memory_hint` 应尽量保留这些选项,而不是只保留上位概括。
- `question` 只在"提问/追问/反问"是这条消息的主推进动作时使用;如果消息里同时有建议和提问,但建议明显更核心,则类型标为 `suggestion`,并在 hint 里按需保留提问内容。
- 对 `question` 类型,优先保留:
  1. 问题的核心主题
  2. 明确给出的选项或分支
  3. 必要的限定条件
- 对 `question` 类型,不要只保留寒暄式前缀,例如"听起来不错""如果方便的话";应保留真正要用户回答的部分。
- 只输出严格 JSON,不要输出解释。

压缩原则:

- 优先保留具体建议、推荐、提醒、操作步骤、风险提示和问题内容。
- 对纯附和内容,压成极短摘要,例如"附和了用户对某事的看法。"
- 对明显重复用户内容的回复,压成极短摘要,例如"重复了用户关于某事的说法。"
- 对泛泛回应、空泛鼓励、礼貌性延展,压成最短可理解摘要,并标为 `other`。
- 如果上下文里能确定人名、关系对象或具体事物,优先在摘要里写出明确对象,不要无必要地保留"室友""那个老师""这件事"这类泛称。
- 如果原文里的对象已经明确且自然,就直接保留该对象,不要改写成更绕或更长的表达。
- 如果问题中存在"是 A、B 还是 C"这类显式选项,优先保留 A、B、C,而不是只写成"询问用户偏好"。
- 如果原文既有建议又有提问,允许在 hint 里同时保留;但 type 只标主动作。若提问是核心推进动作,则 type 标为 `question`;若建议更核心,则 type 标为 `suggestion`。
- 优先使用显式主语来写结果,例如:
  `安慰了用户……`
  `建议用户……`
  `推荐用户……`
  `提醒用户……`
  `询问用户……`
  `附和了用户……`
  `重复了用户……`

类型判断补充:

- `question`:主动作是向用户提问、追问、澄清、确认选项或收集偏好。
- `suggestion`:主动作是给用户建议;即使末尾顺带问一句,也仍以建议为主。
- `recommendation`:主动作是推荐某个方案、菜谱、产品或选择。
- `warning`:主动作是提醒风险、限制、禁忌或后果。
- `instruction`:主动作是说明操作顺序、步骤或执行流程。
- `comfort`:主动作是安慰、理解、支持用户情绪。
- `agreement`:主动作是附和、认同用户说法。
- `repetition`:主动作是重复、转述用户已有内容,没有新增有效信息。
- `other`:不适合归入以上类型,但仍值得压成一条短摘要。

Few-shot 示例 1
输入:
{
  "msgs": [
    {
      "role": "User",
      "msg": "我室友小雯这学期一直在准备毕业论文,这两周都在改答辩PPT。她下周三答辩,我有点担心她会紧张。"
    },
    {
      "role": "Assistant",
      "msg": "听起来你很关心小雯,也希望她答辩顺利。她现在紧张其实很正常,很多人在答辩前都会这样。"
    }
  ]
}
输出:
{
  "assistant_memory_hint": "安慰了用户对室友小雯答辩状态的担忧。",
  "assistant_memory_type": "comfort"
}

Few-shot 示例 2
输入:
{
  "msgs": [
    {
      "role": "User",
      "msg": "我最近总失眠,已经两周了,想先自己调一调。"
    },
    {
      "role": "Assistant",
      "msg": "如果你想先自己调整,可以先减少咖啡因摄入,尤其下午和晚上尽量不要再喝咖啡或浓茶,同时把睡前刷手机的时间压缩一些,尽量固定上床时间,先连续观察几天。"
    }
  ]
}
输出:
{
  "assistant_memory_hint": "建议用户减少咖啡因摄入、减少睡前刷手机时间并固定上床时间。",
  "assistant_memory_type": "suggestion"
}

Few-shot 示例 3
输入:
{
  "msgs": [
    {
      "role": "User",
      "msg": "我晚上想做个简单点的减脂餐,最好二十分钟左右能搞定。"
    },
    {
      "role": "Assistant",
      "msg": "你可以做一个鸡胸肉沙拉碗,主要用鸡胸肉、生菜、黄瓜和圣女果。鸡胸肉简单煎熟切块后和蔬菜拌在一起,调味尽量用橄榄油加一点醋,不要放太多沙拉酱。"
    }
  ]
}
输出:
{
  "assistant_memory_hint": "推荐用户做鸡胸肉沙拉碗,并提醒用户调味时少放沙拉酱。",
  "assistant_memory_type": "suggestion"
}

Few-shot 示例 4
输入:
{
  "msgs": [
    {
      "role": "User",
      "msg": "听起来不错!"
    },
    {
      "role": "Assistant",
      "msg": "听起来不错!你最喜欢吃什么类型的沙拉呢?是蔬菜沙拉、水果沙拉还是其他的?如果有任何特定的食材是你最喜欢的,也可以告诉我哦。"
    }
  ]
}
输出:
{
  "assistant_memory_hint": "询问用户更喜欢蔬菜沙拉、水果沙拉还是其他类型的沙拉,以及是否有偏好的食材。",
  "assistant_memory_type": "question"
}

Few-shot 示例 5
输入:
{
  "msgs": [
    {
      "role": "User",
      "msg": "我最近总失眠,白天特别困,想先自己调一调。"
    },
    {
      "role": "Assistant",
      "msg": "你可以先减少下午和晚上的咖啡因摄入,睡前也尽量少看手机。如果方便的话,我还想了解一下,你通常晚上大概几点上床、几点真正睡着?"
    }
  ]
}
输出:
{
  "assistant_memory_hint": "建议用户减少下午和晚上的咖啡因摄入并减少睡前看手机,同时询问用户通常几点上床和几点入睡。",
  "assistant_memory_type": "suggestion"
}
{% else %}
You are an Assistant-side memory compression module designed for memory storage.

Task:

- The input is a JSON object, and the dialogue is stored in the `msgs` array.
- You only process `Assistant.msg`.
- `User.msg` is context only. It must not appear in the output, and it must not be rewritten into a user summary.
- Your output must contain exactly two fields:
  1. `assistant_memory_hint`
  2. `assistant_memory_type`

Goal:

- Compress a long `Assistant.msg` into a shorter retrieval-friendly assistant summary.
- Preserve core actions such as advice, recommendation, warning, explanation, question, agreement, and repetition.
- Remove verbose explanation, small talk, politeness padding, and low-value lead-in, but do not drop truly useful information.

Hard constraints:

- Do not output or restate `User.msg`.
- Do not invent new facts, advice, steps, ingredients, or constraints.
- Do not change the original meaning or stance of `Assistant.msg`.
- You may compress, merge, or rewrite `Assistant.msg`, but you must stay faithful to the original content.
- `assistant_memory_hint` must be a short complete sentence, ideally with a clear subject, predicate, and object, not a loose fragment.
- If `assistant_memory_hint` contains generic labels such as "roommate", "teacher", "friend", "coworker", or "this matter", and the context provides a clear, stable, unique referent, prefer the explicit referent.
- Only keep generic or vague wording when the current two-message context cannot resolve it stably to a unique referent.
- If the object is already naturally clear, such as "database homework", "chicken salad", or "Professor Li", do not over-expand it just to sound more specific.
- `assistant_memory_type` must be chosen only from:
  `comfort | suggestion | recommendation | warning | instruction | question | agreement | repetition | other`
- If `Assistant.msg` contains multiple actions, `assistant_memory_hint` may keep multiple actions, but `assistant_memory_type` must label only the most important and most retrieval-worthy primary action.
- Do not output `NULL`. Even if the content is low-value, compress it into the shortest useful assistant-side summary.
- If `Assistant.msg` contains a question, follow-up question, or counter-question, `assistant_memory_hint` must preserve the actual question content and must not reduce it to "asked the user".
- If the question contains explicit options, candidate branches, or comparisons, `assistant_memory_hint` should preserve those options instead of collapsing them into a generic abstraction.
- Use `question` only when asking, follow-up asking, or counter-questioning is the main forward-driving action of the message. If the message contains both advice and a question, but advice is clearly more central, use `suggestion` and keep the question content in the hint when needed.
- For `question`, prioritize:
  1. the core topic of the question
  2. the explicit options or branches
  3. the necessary constraints
- For `question`, do not keep only social softeners such as "that sounds nice" or "if that's convenient"; keep the actual part that requires an answer.
- Return strict JSON only. Do not output explanations.

Compression principles:

- Prioritize concrete advice, recommendations, warnings, operational steps, risk reminders, and question content.
- Compress pure agreement into a very short summary, such as "Agreed with the user's view on something."
- Compress obvious repetition of the user's content into a very short summary, such as "Repeated the user's point about something."
- Compress generic responses, vague encouragement, and polite extension into the shortest understandable summary and label them `other`.
- If the context makes a person, relation, or concrete object identifiable, prefer the explicit object in the summary and avoid unnecessary generic terms like "roommate", "that teacher", or "this matter".
- If the object in the original message is already clear and natural, keep it directly rather than rewriting it into a longer or more awkward form.
- If the question contains explicit choices such as "A, B, or C", preserve A, B, and C rather than reducing it to "asked about the user's preference".
- If the original message contains both advice and a question, both may remain in the hint, but the type should mark only the primary action. If the question is the main forward-driving action, use `question`; if the advice is more central, use `suggestion`.
- Prefer explicit leading verbs in the result, for example:
  `Comforted the user...`
  `Suggested that the user...`
  `Recommended that the user...`
  `Warned the user...`
  `Asked the user...`
  `Agreed with the user...`
  `Repeated the user's point...`

Type notes:

- `question`: the primary action is asking, following up, clarifying, confirming options, or collecting preferences.
- `suggestion`: the primary action is giving advice, even if a question appears at the end.
- `recommendation`: the primary action is recommending a plan, dish, product, or choice.
- `warning`: the primary action is warning about a risk, restriction, taboo, or consequence.
- `instruction`: the primary action is explaining an operation order, concrete steps, or an execution flow.
- `comfort`: the primary action is comforting, understanding, or emotionally supporting the user.
- `agreement`: the primary action is agreeing with or affirming the user's statement.
- `repetition`: the primary action is repeating or rephrasing content the user already said, without adding meaningful new information.
- `other`: does not fit the types above, but still deserves a short summary.

English few-shot example 1
Input:
{
  "msgs": [
    {
      "role": "User",
      "msg": "My roommate Xiaowen has been preparing her thesis all semester, and she has spent the last two weeks revising her defense slides. She defends next Wednesday, and I'm a little worried she'll be nervous."
    },
    {
      "role": "Assistant",
      "msg": "It sounds like you really care about Xiaowen and want her defense to go well. Feeling nervous before a defense is actually very normal, and many people feel that way."
    }
  ]
}
Output:
{
  "assistant_memory_hint": "Comforted the user about roommate Xiaowen's defense state.",
  "assistant_memory_type": "comfort"
}

English few-shot example 2
Input:
{
  "msgs": [
    {
      "role": "User",
      "msg": "I've had insomnia for the past two weeks and want to try adjusting it myself first."
    },
    {
      "role": "Assistant",
      "msg": "If you want to adjust it yourself first, you can start by reducing caffeine intake, especially in the afternoon and evening, cutting down screen time before bed, and keeping a consistent bedtime for a few days."
    }
  ]
}
Output:
{
  "assistant_memory_hint": "Suggested that the user reduce caffeine intake, reduce screen time before bed, and keep a consistent bedtime.",
  "assistant_memory_type": "suggestion"
}

English few-shot example 3
Input:
{
  "msgs": [
    {
      "role": "User",
      "msg": "I want to make a simple low-fat dinner tonight that takes about twenty minutes."
    },
    {
      "role": "Assistant",
      "msg": "You could make a chicken salad bowl with chicken breast, lettuce, cucumber, and cherry tomatoes. After cooking and slicing the chicken, mix it with the vegetables, and keep the dressing light without adding too much salad dressing."
    }
  ]
}
Output:
{
  "assistant_memory_hint": "Recommended that the user make a chicken salad bowl and use less salad dressing.",
  "assistant_memory_type": "suggestion"
}

English few-shot example 4
Input:
{
  "msgs": [
    {
      "role": "User",
      "msg": "That sounds good!"
    },
    {
      "role": "Assistant",
      "msg": "That sounds good! What kind of salad do you like most? Vegetable salad, fruit salad, or something else? If you have any favorite ingredients, you can tell me too."
    }
  ]
}
Output:
{
  "assistant_memory_hint": "Asked what kind of salad the user prefers, whether they prefer vegetable salad, fruit salad, or something else, and whether they have any favorite ingredients.",
  "assistant_memory_type": "question"
}

English few-shot example 5
Input:
{
  "msgs": [
    {
      "role": "User",
      "msg": "I've been having insomnia lately, I feel especially tired during the day, and I want to adjust it myself first."
    },
    {
      "role": "Assistant",
      "msg": "You can first reduce caffeine intake in the afternoon and evening and also try to look at your phone less before bed. If it's convenient, I'd also like to know what time you usually get into bed and what time you actually fall asleep."
    }
  ]
}
Output:
{
  "assistant_memory_hint": "Suggested that the user reduce afternoon and evening caffeine intake and reduce phone use before bed, while also asking when the user usually gets into bed and falls asleep.",
  "assistant_memory_type": "suggestion"
}
{% endif %}

现在处理下面这个输入。
输入:{{ dialog_text }}

只输出严格 JSON:
{
  "assistant_memory_hint": "<string>",
  "assistant_memory_type": "comfort | suggestion | recommendation | warning | instruction | question | agreement | repetition | other"
}
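A sketch of rendering the new template and checking the response against its contract — the template directory path, dialogue content, and LLM call are assumptions for illustration:

import json
from jinja2 import Environment, FileSystemLoader

# Assumed on-disk location, matching the path shown in this diff.
env = Environment(loader=FileSystemLoader("api/app/core/memory/utils/prompt/prompts"))
template = env.get_template("extract_pruning.jinja2")

prompt = template.render(
    language="zh",
    dialog_text=json.dumps({"msgs": [
        {"role": "User", "msg": "我最近总失眠,想先自己调一调。"},
        {"role": "Assistant", "msg": "可以先减少咖啡因摄入,睡前少看手机。"},
    ]}, ensure_ascii=False),
)

# llm_response = call_llm(prompt)  # hypothetical LLM call; canned reply below
llm_response = '{"assistant_memory_hint": "建议用户减少咖啡因摄入并减少睡前看手机。", "assistant_memory_type": "suggestion"}'
result = json.loads(llm_response)
assert result["assistant_memory_type"] in {
    "comfort", "suggestion", "recommendation", "warning", "instruction",
    "question", "agreement", "repetition", "other",
}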
@@ -1,393 +0,0 @@
|
||||
{% macro tidy(name) -%}
|
||||
{{ name.replace('_', ' ')}}
|
||||
{%- endmacro %}
|
||||
|
||||
|
||||
===Tasks===
|
||||
|
||||
{% if language == "zh" %}
|
||||
你的任务是根据详细的提取指南,从提供的对话片段中识别和提取陈述句。
|
||||
每个陈述句必须按照下面提到的标准进行标记。
|
||||
{% else %}
|
||||
Your task is to identify and extract declarative statements from the provided conversational chunk based on the detailed extraction guidelines.
|
||||
Each statement must be labeled as per the criteria mentioned below.
|
||||
{% endif %}
|
||||
|
||||
===Inputs===
|
||||
{% if inputs %}
|
||||
{% for key, val in inputs.items() %}
|
||||
- {{ key }}: {{val}}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
|
||||
===Extraction Instructions===
|
||||
{% if language == "zh" %}
|
||||
{% if granularity %}
|
||||
{% if granularity == 3 %}
|
||||
原子化和清晰:构建陈述句以清楚地显示单一的主谓宾关系。最好有多个较小的陈述句,而不是一个复杂的陈述句。
|
||||
上下文独立:陈述句必须在不需要阅读整个对话的情况下可以理解。
|
||||
{% elif granularity == 2 %}
|
||||
在句子级别提取陈述句。每个陈述句应对应一个单一、完整的思想(通常是来源中的一个完整句子),但要重新表述以获得最大的清晰度,删除对话填充词(例如,"嗯"、"像"、感叹词)。
|
||||
{% elif granularity == 1 %}
|
||||
仅提取精华句子,并将片段总结为多个独立的陈述句,每个陈述句关注事实陈述、用户偏好、关系和显著的时间上下文。
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
上下文解析要求:
|
||||
- 将指示代词("那个"、"这个"、"那些"、"这些")解析为其具体指代对象
|
||||
- 如果陈述句包含无法从对话上下文中解析的模糊引用,则:
|
||||
a) 扩展陈述句以包含对话早期的缺失上下文
|
||||
b) 标记陈述句为需要额外上下文
|
||||
c) 如果陈述句在没有上下文的情况下变得无意义,则跳过提取
|
||||
|
||||
对话上下文和共指消解:
|
||||
- 将每个陈述句归属于说出它的参与者。
|
||||
- **对于用户的发言:必须使用"用户"作为主语**,禁止将"用户"或"我"替换为用户的真实姓名或别名。例如,用户说"我叫张三"应提取为"用户叫张三",而不是"张三叫张三"。
|
||||
- 对于 AI 助手的发言:使用"助手"或"AI助手"作为主语。
|
||||
- 将所有代词解析为对话上下文中的具体人物或实体,但"我"必须解析为"用户"。
|
||||
- 识别并将抽象引用解析为其具体名称(如果提到)。
|
||||
- 将缩写和首字母缩略词扩展为其完整形式。
|
||||
{% else %}
|
||||
{% if granularity %}
|
||||
{% if granularity == 3 %}
|
||||
Atomic & Clear: Structure statements to clearly show a single subject-predicate-object relationship. It is better to have multiple smaller statements than one complex one.
|
||||
Context-Independent: Statements must be understandable without needing to read the entire conversation.
|
||||
{% elif granularity == 2 %}
|
||||
Extract statements at the sentence level. Each statement should correspond to a single, complete thought (typically a full sentence from the source) but be rephrased for maximum clarity, removing conversational filler (e.g., 'um,' 'like,' interjections).
|
||||
{% elif granularity == 1 %}
|
||||
Extract only essence sentences and summarize the chunk into multiple, standalone statements, each focusing on factual statements, user preferences, relationships, and salient temporal context.
|
||||
{% endif %}
{% endif %}

Context Resolution Requirements:
- Resolve demonstrative pronouns ("that," "this," "those") to their specific referents
- If a statement contains vague references that cannot be resolved from the conversation context, either:
  a) Expand the statement to include the missing context from earlier in the conversation
  b) Mark the statement as requiring additional context
  c) Skip extraction if the statement becomes meaningless without context

Conversational Context & Co-reference Resolution:
- Attribute every statement to the participant who uttered it.
- **For user's statements: always use "用户" (User) as the subject**. Do NOT replace "用户" or "I" with the user's real name or alias. For example, if the user says "I'm John", extract as "用户 is John", not "John is John".
- For AI assistant's statements: use "助手" or "AI助手" as the subject.
- Resolve all pronouns to the specific person or entity from the conversation's context, but "I"/"我" must always resolve to "用户".
- Identify and resolve abstract references to their specific names if mentioned.
- Expand abbreviations and acronyms to their full form.
{% endif %}

{% if include_dialogue_context %}
{% if language == "zh" %}
===完整对话上下文===
以下是完整的对话上下文,以帮助您理解引用、代词和对话流程:
{% else %}
===Full Dialogue Context===
The following is the complete dialogue context to help you understand references, pronouns, and conversational flow:
{% endif %}

{{ dialogue_context }}

{% if language == "zh" %}
===对话上下文结束===
{% else %}
===End of Dialogue Context===
{% endif %}
{% endif %}

{% if language == "zh" %}
过滤和格式化:

- 仅提取陈述句。
不要提取问题、命令、问候语或对话填充词。
时间精度:

包括任何明确的日期、时间或定量限定符。
如果一个句子既描述了事件的开始(静态)又描述了其持续性质(动态),则将两者提取为单独的陈述句。
{% else %}
Filtering and Formatting:

- Extract only declarative statements.
DO NOT extract questions, commands, greetings, or conversational filler.
Temporal Precision:

Include any explicit dates, times, or quantitative qualifiers.
If a sentence describes both the start of an event (static) and its ongoing nature (dynamic), extract both as separate statements.
{% endif %}

{%- if definitions %}
{%- for section_key, section_dict in definitions.items() %}
==== {{ tidy(section_key) | upper }} {% if language == "zh" %}定义和指导{% else %}DEFINITIONS & GUIDANCE{% endif %} ====
{%- for category, details in section_dict.items() %}
{{ loop.index }}. {{ category }}
- {% if language == "zh" %}定义{% else %}Definition{% endif %}: {{ details.get("definition", "") }}
{% endfor -%}
{% endfor -%}
{% endif -%}

===Examples===
{% if language == "zh" %}
示例 1: 英文对话
示例片段: """
日期: 2024年3月15日
参与者:
- Sarah Chen (用户)
- 助手 (AI)

用户: "我最近一直在尝试水彩画,画了一些花朵。"
AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合剂混合而成。你觉得怎么样?"
用户: "我认为色彩组合可以改进,但我真的很喜欢玫瑰和百合。"
"""

示例输出: {
"statements": [
{
"statement": "用户最近一直在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "水彩颜料通常由颜料与阿拉伯树胶等粘合剂混合而成。",
"statement_type": "FACT",
"temporal_type": "ATEMPORAL",
"relevance": "IRRELEVANT"
},
{
"statement": "用户认为她的水彩画中的色彩组合可以改进。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "用户真的很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
}
]
}

示例 2: 中文对话示例
示例片段: """
日期: 2024年3月15日
参与者:
- 张曼婷 (用户)
- 小助手 (AI助手)

用户: "我最近在尝试水彩画,画了一些花朵。"
AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。你觉得怎么样?"
用户: "我觉得色彩搭配还有提升的空间,不过我很喜欢玫瑰和百合这两种花。"
"""

示例输出: {
"statements": [
{
"statement": "用户最近在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。",
"statement_type": "FACT",
"temporal_type": "ATEMPORAL",
"relevance": "IRRELEVANT"
},
{
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "用户很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
}
]
}
{% else %}
Example 1: English Conversation
Example Chunk: """
Date: March 15, 2024
Participants:
- Sarah Chen (User)
- Assistant (AI)

User: "I've been trying watercolor painting recently and painted some flowers."
AI: "Watercolor painting is very interesting! Watercolor paints are typically made from pigments mixed with binders like gum arabic. How do you like it?"
User: "I think the color combinations could use some improvement, but I really like roses and lilies."
"""

Example Output: {
"statements": [
{
"statement": "用户 has been trying watercolor painting recently.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "用户 painted some flowers.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "Watercolor paints are typically made from pigments mixed with binders like gum arabic.",
"statement_type": "FACT",
"temporal_type": "ATEMPORAL",
"relevance": "IRRELEVANT"
},
{
"statement": "用户 thinks the color combinations in her watercolor paintings could use some improvement.",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "用户 really likes roses and lilies.",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
}
]
}

Example 2: Chinese Conversation (中文对话示例)
Example Chunk: """
日期: 2024年3月15日
参与者:
- 张曼婷 (用户)
- 小助手 (AI助手)

用户: "我最近在尝试水彩画,画了一些花朵。"
AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。你觉得怎么样?"
用户: "我觉得色彩搭配还有提升的空间,不过我很喜欢玫瑰和百合这两种花。"
"""

Example Output: {
"statements": [
{
"statement": "用户最近在尝试水彩画。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "用户画了一些花朵。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"relevance": "RELEVANT"
},
{
"statement": "水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。",
"statement_type": "FACT",
"temporal_type": "ATEMPORAL",
"relevance": "IRRELEVANT"
},
{
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
"statement_type": "OPINION",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
},
{
"statement": "用户很喜欢玫瑰和百合。",
"statement_type": "FACT",
"temporal_type": "STATIC",
"relevance": "RELEVANT"
}
]
}
{% endif %}
===End of Examples===

{% if language == "zh" %}
===反思过程===

提取陈述句后,执行以下自我审查步骤:

**步骤 1: 归属检查**
- 确认每个陈述句都正确归属于正确的说话者
- 验证说话者名称在整个过程中使用一致
- 检查 AI 助手陈述句是否正确归属

**步骤 2: 完整性审查**
- 确保没有遗漏重要的陈述句
- 检查时间信息是否保留

**步骤 3: 分类验证**
- 审查 statement_type 分类(FACT/OPINION/PREDICTION/SUGGESTION)
- 验证 temporal_type 分配(STATIC/DYNAMIC/ATEMPORAL)
- 确保分类与提供的定义一致

**步骤 4: 最终质量检查**
- 删除任何问题、命令或对话填充词
- 验证 JSON 格式合规性
- 确认输出语言与输入语言匹配
{% else %}
===Reflection Process===

After extracting statements, perform the following self-review steps:

**Step 1: Attribution Check**
- Confirm every statement is properly attributed to the correct speaker
- Verify speaker names are used consistently throughout
- Check that AI assistant statements are properly attributed

**Step 2: Completeness Review**
- Ensure no important declarative statements were missed
- Check that temporal information is preserved

**Step 3: Classification Validation**
- Review statement_type classifications (FACT/OPINION/PREDICTION/SUGGESTION)
- Verify temporal_type assignments (STATIC/DYNAMIC/ATEMPORAL)
- Ensure classifications align with the provided definitions

**Step 4: Final Quality Check**
- Remove any questions, commands, or conversational filler
- Verify JSON format compliance
- Confirm output language matches input language
{% endif %}

**Output format**
**CRITICAL JSON FORMATTING REQUIREMENTS:**
1. Use only standard ASCII double quotes (") for JSON structure - never use Chinese quotation marks (“”) or other Unicode quotes
2. If the extracted statement text contains quotation marks, escape them properly using backslashes (\")
3. Ensure all JSON strings are properly closed and comma-separated
4. Do not include line breaks within JSON string values
5. Example of proper escaping: "statement": "John said: \"I really like this book.\""

**LANGUAGE REQUIREMENT:**
{% if language == "zh" %}
- 输出语言应始终与输入语言匹配
- 如果输入是中文,则用中文提取陈述句
- 如果输入是英文,则用英文提取陈述句
- 保留原始语言,不要翻译
{% else %}
- The output language should ALWAYS match the input language
- If input is in English, extract statements in English
- If input is in Chinese, extract statements in Chinese
- Preserve the original language and do not translate
{% endif %}

{% if language == "zh" %}
仅返回与以下架构匹配的 JSON 对象数组中提取的标记陈述句列表:
{% else %}
Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below:
{% endif %}
{{ json_schema }}
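The deleted template above labels each statement with three enumerations. As a minimal sketch of how a caller could check a completion against those labels (illustrative only; the helper name and the use of assertions are assumptions, and the reflection step above also allows PREDICTION and SUGGESTION types):

```python
import json

# Label sets taken from the template above.
STATEMENT_TYPES = {"FACT", "OPINION", "PREDICTION", "SUGGESTION"}
TEMPORAL_TYPES = {"STATIC", "DYNAMIC", "ATEMPORAL"}
RELEVANCE = {"RELEVANT", "IRRELEVANT"}

def validate_statements(raw: str) -> list[dict]:
    """Reject completions whose labels fall outside the template's vocabulary."""
    statements = json.loads(raw)["statements"]
    for s in statements:
        assert isinstance(s["statement"], str) and s["statement"]
        assert s["statement_type"] in STATEMENT_TYPES
        assert s["temporal_type"] in TEMPORAL_TYPES
        assert s["relevance"] in RELEVANCE
    return statements
```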
@@ -0,0 +1,712 @@
{% macro render_input() -%}
{{ input_json }}
{%- endmacro %}

=== Tasks ===

{% if language == "zh" %}
你的任务是从提供的目标文本中识别并提取陈述句,并为每条陈述句标注以下信息:

- statement_id
- statement_text
- statement_type
- temporal_type
- has_emotional_state
- has_unsolved_reference
- dialog_at
- valid_at
- invalid_at

每条输出都应是一个结构化的候选记忆陈述句。
{% else %}
Your task is to identify and extract declarative statements from the provided target text, and annotate each extracted statement with:

- statement_id
- statement_text
- statement_type
- temporal_type
- has_emotional_state
- has_unsolved_reference
- dialog_at
- valid_at
- invalid_at

Each output item should be a structured candidate memory statement.
{% endif %}

=== Inputs ===
{% if language == "zh" %}

- chunk_id: chunk 唯一 ID
- end_user_id: 终端用户 ID
- dialog_at: 会话时间,必须是 ISO 8601 时间点
- target_content: 当前要处理的对话片段文本,也是唯一允许被抽取的目标文本
- target_message_date: 目标文本对应的时间,可作为辅助时间背景;当与 dialog_at 同时存在时,优先使用 dialog_at 解析相对时间表达
- supporting_context: 完整对话上下文,仅用于辅助理解 target_content,不能单独贡献新的可抽取事实
- supporting_context.msgs: 按顺序提供的上下文消息,可包含 User 和 Assistant
{% else %}
- chunk_id: unique chunk identifier
- end_user_id: end-user identifier
- dialog_at: session time, which must be an ISO 8601 timestamp
- target_content: the current dialogue fragment to process, and the only text span that may be extracted from
- target_message_date: the time associated with the target content, which may serve as supporting temporal context; when both exist, prefer `dialog_at` for resolving relative expressions
- supporting_context: full dialogue context used only to help interpret target_content; it must not independently contribute new extractable facts
- supporting_context.msgs: ordered contextual messages, which may include User and Assistant messages
{% endif %}

=== Scope ===
{% if language == "zh" %}

- 只从 `target_content` 中提取陈述句。
- `supporting_context.msgs` 只用于解释 `target_content` 中的代词、省略、主体、时间和语义背景。
- 不要从 `supporting_context.msgs` 中单独提取任何陈述句。
- 如果某条信息没有出现在 `target_content` 中,即使它出现在 `supporting_context.msgs` 中,也不能把它作为独立 statement 输出。
- 如果 Assistant 在 `supporting_context.msgs` 中提供了总结、猜测、解释或改写,这些内容只能作为理解辅助,不能被当作事实直接提取。
- 每条输出的 statement 都必须能够在 `target_content` 中找到直接对应的表达依据。
{% else %}
- Extract statements only from `target_content`.
- `supporting_context.msgs` is used only to interpret references, ellipsis, subjects, temporal expressions, and semantic background in `target_content`.
- Do not extract any standalone statement from `supporting_context.msgs`.
- If a piece of information does not appear in `target_content`, it must not be output as an independent statement even if it appears in `supporting_context.msgs`.
- If the Assistant in `supporting_context.msgs` provides a summary, guess, interpretation, or rephrasing, treat it only as interpretive support and never as a direct factual source for extraction.
- Every output statement must be directly grounded in wording from `target_content`.
{% endif %}

=== Extraction Rules ===
{% if language == "zh" %}
拆分规则:

- 以“一个完整意思”为单位提取陈述句,通常对应一个完整句子或一个自然语义片段。
- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清晰的重要信息时,才拆成多条。
- 宁可多提取,也不要漏掉 `target_content` 中能独立成立、且语义稳定的 statement。
- 但不要为了提高覆盖率而引入原文没有的信息,或输出语义不成立的 statement。

用户主语归一化:

- 如果陈述句的主语是用户本人,无论上下文中给出的用户名称、昵称、别名或真实姓名是什么,提取后的陈述句统一使用“用户”作为主语,不要使用用户的具体名字或别名。
- 这是硬规则;如果用户主语没有统一成“用户”,则该 statement 视为不合格。

共指消解:

- 先完成最终的 `statement_text` 改写,再判断 `has_unsolved_reference`。
- `has_unsolved_reference` 必须基于最终输出的 `statement_text` 判断,而不是基于原始 `target_content` 里是否出现过代词来判断。
- 如果最终 `statement_text` 已经把引用改写成具体实体名,例如“助理恭喜用户”“小李点了一杯美式咖啡”,则 `has_unsolved_reference` 必须是 `false`。
- 如果可以解析到具体实体名,优先输出具体实体名,并将 `has_unsolved_reference` 设为 `false`。
- 如果不能解析到具体实体名,但可以解析到最小必要描述,则输出该最小必要描述,并将 `has_unsolved_reference` 设为 `true`。
- 如果既不能解析到具体实体名,也不能稳定解析到最小必要描述,则保留最小必要原始表达,并将 `has_unsolved_reference` 设为 `true`。
- 对涉及用户与其他人的共同活动,优先写成“用户和谁……”的形式,而不是保留“我们”“他们”这类未展开表达。

清晰指代与模糊指代:

- 只有当当前 `supporting_context` 足以将引用稳定映射到具体实体名时,才算 fully resolved。
- `张三`、`老张`(且上下文中明确就是张三)、`李教授`、`王老师` 属于清晰指代。
- `用户的朋友`、`用户的同事`、`某位老师`、`一位面试官` 这类最小必要描述允许输出,但仍然算 unresolved。
- `朋友`、`前天那个人`、`那个`、`这个`、`那些`、`那两个`、`对方`、`他/她`(且无唯一可解对象)属于模糊指代。

过滤:

- 仅提取陈述句。
- 不要提取问题、命令、问候语或对话填充词。

statement_type:

- `FACT`:用户陈述的事实、状态、关系、经历、行为、事件或计划等现实描述。
- `OPINION`:主观评价、态度、判断、感受、看法,例如“我觉得”“我担心”。
- `OTHER`:不应归入 `FACT` 或 `OPINION` 的其他陈述;“我希望……”默认标为 `OTHER`。
- 不要因为句子带有主观色彩就自动判为 `OPINION`;只有在其核心是个人判断、态度、感受或评价时才标为 `OPINION`。

时间规则:

- 仅使用目标文本中明确陈述或可由 `dialog_at` / `target_message_date` 直接解析的时间信息;不要使用外部知识补时间。
- 优先使用 `dialog_at` 作为“现在”来解释相对时间,例如“昨天”“上周五”“下个月”;只有在 `dialog_at` 缺失时才退回 `target_message_date`。
- 如果相对时间可以稳定落到更具体的中文时间表达,就应直接改写进 `statement_text`,而不要保留原始模糊表达。
- 可稳定具体化的示例包括:
  - “昨天” -> “2026年4月29日”
  - “前天晚上” -> “2026年4月28日晚上”
  - “上周三” -> “2026年4月22日”
  - “上周” -> “2026年4月20日至2026年4月26日”
  - “上周末” -> “2026年4月25日至2026年4月26日”
  - “上个月” -> “2026年3月”
  - “下周” -> “2026年5月4日至2026年5月10日”
- 对开放区间时间表达,也要做相对时间消解并改写进 `statement_text`。
- 常见开放过去区间表达包括:`最近`、`近来`、`这段时间`、`这些天`、`截至现在`、`更早之前`。
- 常见开放未来区间表达包括:`即将`、`接下来`、`不久后`、`很快`、`未来一段时间`。
- 这类表达无法稳定落到封闭日期区间时,可以改写为开放区间表达,例如:
  - “最近” -> “截至2026年4月1日之前的最近一段时间”
  - “近来” -> “截至2026年4月1日之前的近来一段时间”
  - “这段时间” -> “截至2026年4月1日之前的这段时间”
  - “即将” -> “在2026年4月1日之后即将发生”
  - “接下来” -> “在2026年4月1日之后接下来的一段时间”
  - “很快” -> “在2026年4月1日之后不久”
- 如果相对时间不能稳定落到具体日期或日期区间,就保留其最小可信粗粒度,但仍尽量做相对时间消解;例如“去年冬天”可改写为“2025年冬天”,而不是保留“去年冬天”。
- 对节假日类表达,能稳定映射到具体日期或日期区间时应具体化;例如“五一”通常可改写为具体日期,“清明节”通常也可改写为具体日期或短区间;“春节前后”这类边界不稳的表达仍保留较粗粒度。
- `valid_at` 表示陈述开始成立或生效的时间。
- `invalid_at` 表示陈述结束或不再成立的时间;如果仍在持续,填 `"NULL"`。
- `dialog_at` 表示当前会话时间,每条 statement 都必须原样复制输入中的 `dialog_at`。
- 时间格式优先使用 ISO 8601。
- 对于只有日期没有时分秒的时间,默认使用整天边界,便于后续检索。
- 如果没有明确时间,不要编造时间。
- 对于点状事件(例如某天发生的一次考试、一次见面、一次提交),`valid_at` 和 `invalid_at` 都应填写为该事件的起止边界;不要只填 `valid_at`。

情感状态判断:

- `has_emotional_state` 只用于判断当前 statement 是否反映了用户的情感状态。
- 如果根据当前 statement 和 supporting_context,可以判断用户当前存在某种情感状态,则输出 `true`。
- 该字段不是情绪分类字段,不要求输出具体情绪类型。
- 明确情绪表达例如“开心”“难过”“紧张”“有压力”通常应标为 `true`。
- 即使没有明确情绪词,只要语义足以表明用户当前具有情感状态,也可以标为 `true`,例如“我很好”。
- 如果只是客观事实、动作描述或安排,且无法从当前上下文稳定判断用户情感状态,则输出 `false`。

temporal_type:

- `STATIC`:相对稳定、持续性的状态、身份、属性、长期偏好、长期关系、长期职业或长期居住状态;若带起始时间,可填 `valid_at`,`invalid_at` 必须为 `"NULL"`。
- `DYNAMIC`:有明确时间范围、阶段性持续、可结束或已结束的事件、活动、计划、任务或临时状态。
- `ATEMPORAL`:普遍事实、定义、常识、百科知识、数学事实或无具体时间边界的泛化陈述;`valid_at` 和 `invalid_at` 都必须为 `"NULL"`。
- 不要因为句子里出现时间词就机械地标为 `DYNAMIC`。

改写边界:

- 允许为解决代词、省略和时间歧义做最小必要改写。
- 不要引入原文未明确表达的新事实、额外推断或风格化概括。
{% else %}
Splitting rules:
- Extract statements at the level of one complete thought, usually one full sentence or one natural semantic unit.
- Preserve sentence-level structure by default; split only when a sentence contains two or more independent and important pieces of information that become clearly easier to understand when separated.
- Prefer higher recall: do not miss independently valid and semantically stable statements in `target_content`.
- But do not increase recall by inventing unsupported facts or emitting semantically unstable statements.

User-subject normalization:

- If the subject of a statement is the user, always use “the user” as the subject in the extracted statement, regardless of whether the context provides the user’s real name, nickname, alias, or other identifier.
- This is a hard rule. If a user-subject statement does not use “the user,” treat it as invalid.
- Keep “the user” as the main retrieval anchor in English rewrites, including object position when possible.
- For English reflexive self-expressions, preserve retrieval consistency without creating unnatural strings. Use these preferred rewrites:
  - “myself” in ordinary object position -> “the user”
  - “be myself” -> “be who the user is”
  - “embrace myself” -> “embrace who the user is”
  - “accept myself” -> “accept who the user is”
  - “express myself” -> “express the user’s thoughts” only if needed for grammaticality; otherwise keep the smallest rewrite anchored on “the user”
- Do not rewrite fixed self-expressions into forms such as “embrace the user” or “be the user” when a more natural anchored template is available.

Coreference resolution:

- If you can resolve to a concrete named entity, output that name and set `has_unsolved_reference` to `false`.
- If you cannot resolve to a concrete named entity but can resolve to a minimal grounded description, output that description and set `has_unsolved_reference` to `true`.
- If you cannot even resolve to a stable minimal grounded description, keep the minimal original expression and set `has_unsolved_reference` to `true`.
- For shared activities involving the user and others, prefer forms like “the user and X...” rather than unresolved expressions like “we” or “they”.

Clear vs unresolved reference:

- First produce the final rewritten `statement_text`, then decide `has_unsolved_reference`.
- `has_unsolved_reference` must be judged from the final `statement_text`, not from whether the original `target_content` once contained a pronoun.
- If the final `statement_text` already resolves the reference to a concrete named entity, such as “The assistant congratulates the user” or “Xiao Li ordered an Americano,” then `has_unsolved_reference` must be `false`.
- A reference is fully resolved only if the current `supporting_context` can map it to a concrete named entity.
- `Zhang San`, `Old Zhang` when clearly resolved to Zhang San, `Professor Li`, and `Teacher Wang` are clear references.
- `the user's friend`, `the user's coworker`, `a teacher`, and `an interviewer` are allowed outputs but still count as unresolved.
- `friend`, `that person from the other day`, `that one`, `this one`, `those`, `the two of them`, `the other party`, and `he/she` without a unique referent are unresolved.

Filtering:

- Extract only declarative statements.
- Do not extract questions, commands, greetings, or conversational filler.

statement_type:

- `FACT`: user-stated facts, states, relationships, experiences, behaviors, events, or plans.
- `OPINION`: subjective judgments, attitudes, feelings, evaluations, or viewpoints, such as “I think...” or “I worry...”.
- `OTHER`: statements that should not be categorized as `FACT` or `OPINION`; statements like “I hope...” default to `OTHER`.
- Do not classify a statement as `OPINION` merely because it sounds subjective; use `OPINION` only when its core content is a personal judgment, attitude, feeling, or evaluation.

Temporal rules:

- Use only temporal information explicitly stated in the target text or directly resolvable from `dialog_at` / `target_message_date`; do not add dates from external knowledge.
- Prefer `dialog_at` as “now” when interpreting relative expressions such as “yesterday,” “last Friday,” or “next month”; only fall back to `target_message_date` when `dialog_at` is unavailable.
- If a relative time can be stably grounded to a more concrete time expression in the output language, rewrite it directly into `statement_text` rather than keeping the vague source phrase.
- Examples of stable concretization:
  - “yesterday” -> “April 29, 2026”
  - “the night before last” -> “the evening of April 28, 2026”
  - “last Wednesday” -> “April 22, 2026”
  - “last week” -> “April 20 to April 26, 2026”
  - “last weekend” -> “April 25 to April 26, 2026”
  - “last month” -> “March 2026”
  - “next week” -> “May 4 to May 10, 2026”
- Open-interval temporal expressions should also be resolved and rewritten inside `statement_text`.
- Common open past-interval expressions include: `recently`, `lately`, `these days`, `over this period`, `as of now`, and `earlier`.
- Common open future-interval expressions include: `upcoming`, `coming up`, `soon`, `before long`, and `in the near future`.
- When they cannot be stably converted into a closed date range, rewrite them as open intervals, for example:
  - `recently` -> `recently before April 1, 2026`
  - `lately` -> `lately before April 1, 2026`
  - `these days` -> `during the period leading up to April 1, 2026`
  - `upcoming` -> `upcoming after April 1, 2026`
  - `coming up` -> `coming up after April 1, 2026`
  - `soon` -> `soon after April 1, 2026`
- If the relative time cannot be stably grounded to an exact date or date range, keep the smallest trustworthy coarse granularity but still resolve the relative reference as much as possible; for example, “last winter” may become “winter 2025” rather than remaining “last winter”.
- For holiday expressions, concretize them when they can be stably mapped to specific dates or short date ranges; for example, Labor Day or Qingming Festival usually can be grounded, while expressions such as “around Spring Festival” should stay at a coarser granularity.
- `valid_at` means when the statement became valid or started to hold.
- `invalid_at` means when the statement ended or stopped being valid; use `"NULL"` if it is still ongoing.
- `dialog_at` is the session timestamp, and every statement must copy the input `dialog_at` verbatim.
- Prefer ISO 8601 for time values.
- When only a date can be resolved, default to full-day boundaries for retrieval use.
- If no explicit time is available, do not invent one.
- For point-in-time events such as a single exam, a meeting, or a submission on one day, populate both `valid_at` and `invalid_at`; do not fill only `valid_at`.

Emotional-state detection:

- `has_emotional_state` is used only to judge whether the current statement reflects the user's emotional state.
- If the current statement plus supporting context is sufficient to infer that the user currently has some emotional state, output `true`.
- This field is not an emotion category field. Do not infer or output a specific emotion label here.
- Explicit emotion wording such as “happy”, “sad”, “nervous”, or “under pressure” should usually be marked `true`.
- Statements without explicit emotion words may still be `true` if the user's emotional state is reasonably inferable, such as “I am fine.”
- If the statement is only an objective fact or action description and the user's emotional state cannot be stably inferred from the current context, output `false`.

temporal_type:

- `STATIC`: relatively stable, ongoing states, identities, attributes, long-term preferences, long-term relationships, occupations, or residence states.
- `DYNAMIC`: events, activities, plans, tasks, or temporary states with a bounded or potentially bounded time span.
- `ATEMPORAL`: general facts, definitions, common knowledge, encyclopedic knowledge, mathematical facts, or generalized statements without meaningful temporal boundaries; both `valid_at` and `invalid_at` must be `"NULL"`.
- Do not classify a statement as `DYNAMIC` merely because it contains a time word.

Rewrite boundary:

- Minimal rewriting is allowed only to resolve reference, ellipsis, and temporal ambiguity.
- For resolvable relative time expressions, rewrite them into grounded time expressions directly inside `statement_text`, using the output language.
- Do not keep both the vague source phrase and the grounded phrase together; output only the rewritten concrete form.
- Do not fake precision for time expressions that cannot be grounded reliably from `dialog_at`.
- In English, you may use a slightly more natural anchored paraphrase for reflexive user-self expressions when a literal replacement would be awkward, as long as the rewritten form still keeps “the user” as the retrieval anchor and does not change the meaning.
- Do not introduce unsupported facts, extra inference, or stylistic summarization.
{% endif %}

=== Examples ===
{% if language == "zh" %}
示例 1:
示例输入: {
"chunk_id": "chunk_a1b2c3d4",
"end_user_id": "eu_12345678",
"dialog_at": "2023-09-04T18:00:00Z",
"target_content": "老李这学期要求还是一如既往地严,不过他讲课确实清晰透彻,而且每节课的结构都特别清楚。就是气场实在太吓人了,我每次被他点名都有点发怵。",
"target_message_date": "2023-09-04T18:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "今天是九月第一周的星期一,上了本学期第一节数据库课。作为班长,我帮李教授发了教学大纲。老李宣布的期末项目考核标准特别严,看了一眼大纲上的作业量,我感觉这学期恐怕要脱层皮。不过老李讲课确实清晰透彻,就是气场实在太吓人了。"
},
{
"role": "Assistant",
"msg": "听起来你对这门课既佩服又有点压力,李教授应该是很有气场的老师。"
}
]
}
}

示例输出: {
"statements": [
{
"statement_id": "stmt_e5f6g7h8",
"statement_text": "李教授这学期要求很严。",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_i9j0k1l2",
"statement_text": "李教授讲课清晰透彻。",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "NULL",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_m1n2o3p4",
"statement_text": "用户每次被李教授点名都有点发怵。",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
}
]
}

示例 2:
示例输入: {
"chunk_id": "chunk_b2c3d4e5",
"end_user_id": "eu_12345678",
"dialog_at": "2026-04-01T00:00:00Z",
"target_content": "我最近在学Python,每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。",
"target_message_date": "2026-04-01T00:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "我最近在学Python。"
},
{
"role": "Assistant",
"msg": "Python 是一个很实用的语言。"
}
]
}
}

示例输出: {
"statements": [
{
"statement_id": "stmt_m3n4o5p6",
"statement_text": "用户截至2026年4月1日之前的最近一段时间在学Python。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_q7r8s9t0",
"statement_text": "用户截至2026年4月1日之前的最近一段时间每晚都会练一个小时Python。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_u1v2w3x4",
"statement_text": "用户计划在2026年3月30日至2026年4月5日先复习Python的基础语法和函数部分。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
}
]
}

示例 3:
示例输入: {
"chunk_id": "chunk_c3d4e5f6",
"end_user_id": "eu_12345678",
"dialog_at": "2026-04-01T00:00:00Z",
"target_content": "去年冬天老师布置的那两个项目我一直觉得有点难,而且我昨晚看了半天还是没太搞明白。要是这周末再弄不出来,我可能就得去问助教了。",
"target_message_date": "2026-04-01T00:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "去年冬天老师布置的那两个项目我一直觉得有点难,而且我昨晚看了半天还是没太搞明白。要是这周末再弄不出来,我可能就得去问助教了。"
},
{
"role": "Assistant",
"msg": "听起来你卡在老师去年冬天布置的那两个项目上了,如果这周末还没进展,再去问助教也可以。"
}
]
}
}

示例输出: {
"statements": [
{
"statement_id": "stmt_y5z6a7b8",
"statement_text": "用户觉得2025年冬天老师布置的那两个项目有点难。",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_c9d0e1f2",
"statement_text": "用户2026年3月31日晚上看了半天那两个项目还是没太搞明白。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-03-31T00:00:00",
"invalid_at": "2026-03-31T23:59:59"
},
{
"statement_id": "stmt_g3h4i5j6",
"statement_text": "如果到2026年4月4日至2026年4月5日还弄不出来,用户可能会去问助教。",
"statement_type": "OTHER",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
}
]
}
{% else %}
Example 1:
Example Input: {
"chunk_id": "chunk_a1b2c3d4",
"end_user_id": "eu_12345678",
"dialog_at": "2023-09-04T18:00:00Z",
"target_content": "Old Li is just as strict as ever this semester, but he really explains things clearly and the structure of every class is extremely clear. His presence is honestly kind of intimidating, and I get nervous every time he calls on me.",
"target_message_date": "2023-09-04T18:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "Today was the Monday of the first week of September, and I had the first database class of the semester. As class monitor, I helped Professor Li distribute the syllabus. Professor Li said the grading criteria for the final project would be very strict. Old Li is just as strict as ever this semester, but he really explains things clearly and the structure of every class is extremely clear. His presence is honestly kind of intimidating."
},
{
"role": "Assistant",
"msg": "It sounds like you admire the teaching but also feel pressured by Professor Li."
}
]
}
}

Example Output: {
"statements": [
{
"statement_id": "stmt_e5f6g7h8",
"statement_text": "Professor Li is very strict this semester.",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_i9j0k1l2",
"statement_text": "Professor Li explains things clearly.",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "NULL",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_m1n2o3p4",
"statement_text": "The user gets nervous every time Professor Li calls on the user.",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": false,
"dialog_at": "2023-09-04T18:00:00Z",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
}
]
}

Example 2:
Example Input: {
"chunk_id": "chunk_b2c3d4e5",
"end_user_id": "eu_12345678",
"dialog_at": "2026-04-01T00:00:00Z",
"target_content": "I've been learning Python recently, and I practice for an hour every night. This week I also plan to review basic syntax and functions first.",
"target_message_date": "2026-04-01T00:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "I've been learning Python recently."
},
{
"role": "Assistant",
"msg": "Python is a very practical language."
}
]
}
}

Example Output: {
"statements": [
{
"statement_id": "stmt_m3n4o5p6",
"statement_text": "The user has been learning Python recently before April 1, 2026.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_q7r8s9t0",
"statement_text": "The user has been practicing Python for an hour every night recently before April 1, 2026.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_u1v2w3x4",
"statement_text": "The user plans to review Python basic syntax and functions first during 2026-03-30 to 2026-04-05.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
}
]
}

Example 3:
Example Input: {
"chunk_id": "chunk_c3d4e5f6",
"end_user_id": "eu_12345678",
"dialog_at": "2026-04-01T00:00:00Z",
"target_content": "The two projects the teacher assigned last winter seem difficult to me, and even after looking at them for a long time last night I still didn't really understand them. If I still can't finish them by this weekend, I may have to ask the TA.",
"target_message_date": "2026-04-01T00:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "The two projects the teacher assigned last winter seem difficult to me, and even after looking at them for a long time last night I still didn't really understand them. If I still can't finish them by this weekend, I may have to ask the TA."
},
{
"role": "Assistant",
"msg": "It sounds like you're stuck on the two projects assigned last winter, and asking the TA would make sense if there is still no progress by this weekend."
}
]
}
}

Example Output: {
"statements": [
{
"statement_id": "stmt_y5z6a7b8",
"statement_text": "The user thinks the two projects assigned in winter 2025 are difficult.",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_c9d0e1f2",
"statement_text": "The user spent a long time on the evening of 2026-03-31 looking at those two projects but still did not really understand them.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-03-31T00:00:00",
"invalid_at": "2026-03-31T23:59:59"
},
{
"statement_id": "stmt_g3h4i5j6",
"statement_text": "If the user still cannot finish them by 2026-04-04 to 2026-04-05, the user may ask the TA.",
"statement_type": "OTHER",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"dialog_at": "2026-04-01T00:00:00Z",
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
}
]
}
{% endif %}
=== End of Examples ===

{% if language == "zh" %}
最终输出前检查:

- 是否只保留 `target_content` 中可直接支持的陈述句
- 如果主语是用户,是否统一写“用户”
- 非用户主体是否尽量写成具体名称;若无法做到,是否已正确标记 `has_unsolved_reference = true`
- 如果最终 `statement_text` 已经落到具体实体名,`has_unsolved_reference` 是否已经改为 `false`
- 如果 `statement_text` 中出现可由 `dialog_at` 稳定解析的相对时间,是否已经改写成更具体的日期、月份或日期区间表达
- 如果 `statement_text` 中出现“最近”“近来”“即将”“接下来”“很快”这类开放区间时间词,是否已经改写为带 `dialog_at` 锚点的开放区间表达
- statement_type 是否合法,且没有把一般事实机械标成 `OPINION`
- `has_emotional_state` 是否仅用于判断是否存在情感状态,而没有被当作情绪分类字段
- temporal_type 是否与 valid_at / invalid_at 一致
- 输出是否严格符合 JSON schema
{% else %}
Final checks before output:
- Keep only statements directly supported by `target_content`
- If the subject is the user, render it as “the user”
- Render non-user subjects as concrete names when possible; otherwise mark `has_unsolved_reference = true`
- If the final `statement_text` already resolves the reference to a concrete named entity, ensure `has_unsolved_reference = false`
- If `statement_text` contains relative time expressions that can be stably resolved from `dialog_at`, rewrite them into more concrete date, month, or date-range expressions
- If `statement_text` contains open-interval temporal words such as `recently`, `lately`, `upcoming`, `coming up`, or `soon`, rewrite them into open interval expressions anchored on `dialog_at`
- Ensure statement_type is valid and do not mechanically label ordinary facts as `OPINION`
- Ensure `has_emotional_state` is used only for emotional-state presence detection, not emotion classification
- Ensure temporal_type is consistent with valid_at and invalid_at
- Ensure the output strictly matches the JSON schema
{% endif %}

**Output format**
**CRITICAL JSON FORMATTING REQUIREMENTS:**

1. Use only standard ASCII double quotes (") for JSON structure.
2. Escape internal quotation marks inside string values using backslashes (\").
3. Ensure all JSON strings are properly closed and comma-separated.
4. Do not include line breaks within JSON string values.
5. Return only the JSON object. Do not add explanations before or after it.

**ISO 8601 HARD CONSTRAINT:**

- `dialog_at` must be ISO 8601.
- `target_message_date` must be ISO 8601.
- `valid_at` and `invalid_at` must be ISO 8601, or `"NULL"` when no time is available.
- Do not output non-ISO values such as `2026/04/01`, `2026-04-01 00:00:00`, `yesterday evening`, or `下周三`.
- When only a date is known, still output an ISO 8601 datetime boundary.

**LANGUAGE REQUIREMENT:**
{% if language == "zh" %}

- 输出语言应始终与输入语言匹配。
- 如果输入是中文,则用中文提取陈述句。
- 如果输入是英文,则用英文提取陈述句。
- 保留原始语言,不要翻译。
{% else %}
- The output language must always match the input language.
- If the input is in Chinese, extract statements in Chinese.
- If the input is in English, extract statements in English.
- Preserve the original language and do not translate.
{% endif %}

Now process the following input: {{ render_input() }}

Return only a JSON object matching the schema below:
{
"statements": [
{
"statement_id": "string",
"statement_text": "string",
"statement_type": "FACT | OPINION | OTHER",
"temporal_type": "STATIC | DYNAMIC | ATEMPORAL",
"has_emotional_state": "boolean",
"has_unsolved_reference": "boolean",
"dialog_at": "string",
"valid_at": "string | NULL",
"invalid_at": "string | NULL"
}
]
}
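The temporal rules above require grounding relative expressions against `dialog_at` and using full-day boundaries for point-in-time events. A minimal sketch of that grounding step, consistent with Example 3 above; the helper name and its day-offset interface are illustrative assumptions, not repository code:

```python
from datetime import datetime, timedelta

def ground_relative_day(dialog_at: str, days_back: int) -> tuple[str, str]:
    """Ground a day-level relative expression (e.g. 'yesterday' = 1 day back)
    against dialog_at, returning full-day ISO 8601 boundaries as the rules
    above prescribe for point-in-time events."""
    now = datetime.fromisoformat(dialog_at.replace("Z", "+00:00"))
    day = (now - timedelta(days=days_back)).date()
    return f"{day.isoformat()}T00:00:00", f"{day.isoformat()}T23:59:59"

# "yesterday" relative to dialog_at = "2026-04-01T00:00:00Z"
# -> ("2026-03-31T00:00:00", "2026-03-31T23:59:59"),
# matching the valid_at / invalid_at pair in Example 3 above.
```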
@@ -1,126 +0,0 @@

{% macro tidy(name) -%}
{{ name.replace('_', ' ')}}
{%- endmacro %}
{#
This prompt (template) is adapted from [getzep/graphiti]
Licensed under the Apache License, Version 2.0

Original work:
https://github.com/getzep/graphiti/blob/main/graphiti_core/prompts/extract_edge_dates.py

Modifications made by Ke Sun on 2025-09-01
See the LICENSE file for the full Apache 2.0 license text.
#}
# Task

{% if language == "zh" %}
从提供的陈述句中提取时间信息(日期和时间范围)。确定所描述的关系或事件何时生效以及何时结束(如果适用)。
{% else %}
Extract temporal information (dates and time ranges) from the provided statement. Determine when the relationship or event described became valid and when it ended (if applicable).
{% endif %}

# {% if language == "zh" %}输入数据{% else %}Input Data{% endif %}
{% if inputs %}
{% for key, val in inputs.items() %}
- {{ key }}: {{val}}
{% endfor %}
{% endif %}

# {% if language == "zh" %}时间字段{% else %}Temporal Fields{% endif %}

{% if language == "zh" %}
- **valid_at**: 关系/事件开始或成为真实的时间(ISO 8601 格式)
- **invalid_at**: 关系/事件结束或停止为真的时间(ISO 8601 格式,如果正在进行则为 null)
{% else %}
- **valid_at**: When the relationship/event started or became true (ISO 8601 format)
- **invalid_at**: When the relationship/event ended or stopped being true (ISO 8601 format, or null if ongoing)
{% endif %}

# {% if language == "zh" %}提取规则{% else %}Extraction Rules{% endif %}

## {% if language == "zh" %}核心原则{% else %}Core Principles{% endif %}
{% if language == "zh" %}
1. **仅使用明确陈述的时间信息** - 不要从外部知识推断日期
2. **解释相对时间时,使用参考/发布日期作为“现在”**
3. **仅在日期与关系的有效性相关时设置日期** - 忽略偶然的时间提及
4. **对于时间点事件**,仅设置 `valid_at`
{% else %}
1. **Only use explicitly stated temporal information** - do not infer dates from external knowledge
2. **Use the reference/publication date as "now"** when interpreting relative times
3. **Set dates only if they relate to the validity of the relationship** - ignore incidental time mentions
4. **For point-in-time events**, set only `valid_at`
{% endif %}

## {% if language == "zh" %}日期格式要求{% else %}Date Format Requirements{% endif %}
{% if language == "zh" %}
- 使用 ISO 8601: `YYYY-MM-DDTHH:MM:SS.SSSSSSZ`
- 如果未指定时间,使用 `00:00:00`(午夜)
- 如果仅提及年份,根据情况使用 `YYYY-01-01`(开始)或 `YYYY-12-31`(结束)
- 如果仅提及月份,使用月份的第一天或最后一天
- 始终包含时区(如果未指定,使用 `Z` 表示 UTC)
- 根据参考日期将相对时间("两周前"、"去年")转换为绝对日期
{% else %}
- Use ISO 8601: `YYYY-MM-DDTHH:MM:SS.SSSSSSZ`
- If no time is specified, use `00:00:00` (midnight)
- If only a year is mentioned, use `YYYY-01-01` (start) or `YYYY-12-31` (end) as appropriate
- If only a month is mentioned, use the first or last day of the month
- Always include a timezone (use `Z` for UTC if unspecified)
- Convert relative times ("two weeks ago", "last year") to absolute dates based on the reference date
{% endif %}

## {% if language == "zh" %}陈述句类型规则{% else %}Statement Type Rules{% endif %}

{{ inputs.get("statement_type") | upper }} {% if language == "zh" %}陈述句指导{% else %}Statement Guidance{% endif %}:
{% for key, guide in statement_guide.items() %}
- {{ tidy(key) | capitalize }}: {{ guide }}
{% endfor %}

**{% if language == "zh" %}特殊情况{% else %}Special Cases{% endif %}:**
{% if language == "zh" %}
- **意见陈述句**: 仅设置 `valid_at`(意见表达的时间)
- **预测陈述句**: 如果明确提及,将 `invalid_at` 设置为预测窗口的结束
{% else %}
- **Opinion statements**: Set only `valid_at` (when the opinion was expressed)
- **Prediction statements**: Set `invalid_at` to the end of the prediction window if explicitly mentioned
{% endif %}

## {% if language == "zh" %}时间类型规则{% else %}Temporal Type Rules{% endif %}

{{ inputs.get("temporal_type") | upper }} {% if language == "zh" %}时间类型指导{% else %}Temporal Type Guidance{% endif %}:
{% for key, guide in temporal_guide.items() %}
- {{ tidy(key) | capitalize }}: {{ guide }}
{% endfor %}

{% if inputs.get('quarter') and inputs.get('publication_date') %}
## {% if language == "zh" %}季度参考{% else %}Quarter Reference{% endif %}
{% if language == "zh" %}
假设 {{ inputs.quarter }} 在 {{ inputs.publication_date }} 结束。从此基线计算任何季度引用(Q1、Q2 等)的日期。
{% else %}
Assume {{ inputs.quarter }} ends on {{ inputs.publication_date }}. Calculate dates for any quarter references (Q1, Q2, etc.) from this baseline.
{% endif %}
{% endif %}

# {% if language == "zh" %}输出要求{% else %}Output Requirements{% endif %}

## {% if language == "zh" %}JSON 格式化(关键){% else %}JSON Formatting (CRITICAL){% endif %}
{% if language == "zh" %}
1. **仅使用标准 ASCII 双引号** (") - 永远不要使用中文引号(“”)或其他 Unicode 变体
2. 使用反斜杠转义内部引号: `\"`
3. JSON 字符串值中不要有换行符
4. 正确关闭并用逗号分隔所有字段
{% else %}
1. Use **only standard ASCII double quotes** (") - never use Chinese quotes (“”) or other Unicode variants
2. Escape internal quotes with a backslash: `\"`
3. No line breaks within JSON string values
4. Properly close and comma-separate all fields
{% endif %}

## {% if language == "zh" %}语言{% else %}Language{% endif %}
{% if language == "zh" %}
输出语言必须与输入语言匹配。
{% else %}
Output language must match input language.
{% endif %}

{{ json_schema }}
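The Date Format Requirements above expand year-only and month-only mentions into explicit boundaries. A minimal sketch of that expansion, assuming a small standalone helper (the function name and signature are illustrative, not code from this repository):

```python
import calendar

def expand_partial_date(year: int, month: int | None, is_start: bool) -> str:
    """Expand a year-only or month-only mention into the ISO 8601 boundary
    the rules above prescribe: YYYY-01-01 / YYYY-12-31 for years, the first /
    last day for months, midnight time, and 'Z' when no timezone is given."""
    if month is None:
        m, d = (1, 1) if is_start else (12, 31)
    else:
        m = month
        # monthrange returns (first weekday, number of days in the month).
        d = 1 if is_start else calendar.monthrange(year, month)[1]
    return f"{year:04d}-{m:02d}-{d:02d}T00:00:00.000000Z"

# expand_partial_date(2024, None, is_start=True) -> "2024-01-01T00:00:00.000000Z"
# expand_partial_date(2024, 2, is_start=False)   -> "2024-02-29T00:00:00.000000Z"
```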
File diff suppressed because it is too large
@@ -1,140 +1,616 @@
===Task===
Extract user metadata changes from the following conversation statements spoken by the user.

{% if language == "zh" %}
**"三度原则"判断标准:**
- 复用度:该信息是否会被多个功能模块使用?
- 约束度:该信息是否会影响系统行为?
- 时效性:该信息是长期稳定的还是临时的?仅提取长期稳定信息。
你是一个用户画像 metadata 增量提取助手。你的任务是根据输入的用户 `description` 列表,提取值得长期保留、适合挂在“用户节点”下的新增 metadata。

**提取规则:**
- **只提取关于"用户本人"的画像信息**,忽略用户提到的第三方人物(如朋友、同事、家人)的信息
- 仅提取文本中明确提到的信息,不要推测
- **输出语言必须与输入文本的语言一致**(输入中文则输出中文值,输入英文则输出英文值)
你会同时收到:

**增量模式(重要):**
你只需要输出**本次对话引起的变更操作**,不要输出完整的元数据。每个变更是一个对象,包含:
- `field_path`:字段路径,用点号分隔(如 `profile.role`、`profile.expertise`)
- `action`:操作类型
  * `set`:新增或修改一个字段的值
  * `remove`:移除一个字段的值
- `value`:字段的新值(`action="set"` 时必填,`action="remove"` 时填要移除的元素值)
  * 所有字段均为列表类型,每个元素一条变更记录
- `description`: 一组待分析的描述字符串
- `existing_metadata`: 用户当前已经存在的 metadata

**判断规则:**
- 用户提到新信息 → `action="set"`,填入新值
- 用户明确否定已有信息(如"我不再做老师了"、"我已经不学Python了")→ `action="remove"`,`value` 填要移除的元素值
- 如果本次对话没有任何可提取的变更,返回空的 `metadata_changes` 数组 `[]`
- **不要为未被提及的字段生成任何变更操作**
你的目标不是重建完整 metadata,而是只输出“新增内容”:

{% if existing_metadata %}
**已有元数据(仅供参考,用于判断是否需要变更):**
请对比已有数据和用户最新发言,只输出差异部分的变更操作。
- 如果用户说的信息和已有数据一致,不需要输出变更
- 如果用户否定了已有数据中的某个值,输出 `remove` 操作
- 如果用户提到了新信息,输出 `set` 操作
{% endif %}
- 只能输出从 `description` 中能够支持的新增 metadata
- 不要重复输出已经出现在 `existing_metadata` 里的内容
- 不允许修改、重写、删除或纠正已有 metadata
- 所有字段一律输出为字符串数组
{% else %}
You are an assistant for incremental user metadata extraction. Your task is to extract durable, user-node-level new metadata from the input `description` list.

**字段说明:**
- profile.role:用户的职业或角色(列表),如 教师、医生、后端工程师,一个人可以有多个角色
- profile.domain:用户所在领域(列表),如 教育、医疗、软件开发,一个人可以涉及多个领域
- profile.expertise:用户擅长的技能或工具(列表),如 Python、心理咨询、高中物理
- profile.interests:用户主动表达兴趣的话题或领域标签(列表)
You will receive:

**用户别名变更(增量模式):**
- **aliases_to_add**:本次新发现的用户别名,包括:
  * 用户主动自我介绍:如"我叫张三"、"我的名字是XX"、"我的网名是XX"
  * 他人对用户的称呼:如"同事叫我陈哥"、"大家叫我小张"、"领导叫我老陈"
  * 只提取原文中逐字出现的名字,严禁推测或创造
  * 禁止提取:用户给 AI 取的名字、第三方人物自身的名字、"用户"/"我" 等占位词
  * 如果没有新别名,返回空数组 `[]`
- **aliases_to_remove**:用户明确否认的别名,包括:
  * 用户说"我不叫XX了"、"别叫我XX"、"我改名了,不叫XX" → 将 XX 放入此数组
  * **严格限制**:只将用户原文中**逐字提到**的被否认名字放入,不要推断关联的其他别名
  * 如果没有要移除的别名,返回空数组 `[]`
{% if existing_aliases %}
- 已有别名:{{ existing_aliases | tojson }}(仅供参考,不需要在输出中重复)
{% endif %}
{% else %}
**"Three-Degree Principle" criteria:**
- Reusability: Will this information be used by multiple functional modules?
- Constraint: Will this information affect system behavior?
- Timeliness: Is this information long-term stable or temporary? Only extract long-term stable information.
- `description`: a list of descriptions to analyze
- `existing_metadata`: the user's existing metadata

**Extraction rules:**
- **Only extract profile information about the user themselves**, ignore information about third parties (friends, colleagues, family) mentioned by the user
- Only extract information explicitly mentioned in the text, do not speculate
- **Output language must match the input text language**
Your goal is not to rebuild the full metadata. You must output only new metadata:

**Incremental mode (important):**
You should only output **the change operations caused by this conversation**, not the complete metadata. Each change is an object containing:
- `field_path`: Field path separated by dots (e.g. `profile.role`, `profile.expertise`)
- `action`: Operation type
  * `set`: Add or update a field value
  * `remove`: Remove a field value
- `value`: The new value for the field (required when `action="set"`, for `action="remove"` fill in the element value to remove)
  * All fields are list types, one change record per element
- Output only metadata supported by `description`
- Do not repeat anything already present in `existing_metadata`
- Do not modify, rewrite, delete, or correct existing metadata
- Every field must be an array of strings
{% endif %}
|
||||
**Decision rules:**
|
||||
- User mentions new information → `action="set"`, fill in the new value
|
||||
- User explicitly negates existing info (e.g. "I'm no longer a teacher", "I stopped learning Python") → `action="remove"`, `value` is the element to remove
|
||||
- If this conversation has no extractable changes, return an empty `metadata_changes` array `[]`
|
||||
- **Do NOT generate any change operations for fields not mentioned in the conversation**
|
||||
===Inputs===
{% if language == "zh" %}
输入 JSON 包含以下字段:

{% if existing_metadata %}
**Existing metadata (for reference only, to determine if changes are needed):**
Compare existing data with the user's latest statements, and only output change operations for the differences.
- If the user's statement matches existing data, no change is needed
- If the user negates a value in existing data, output a `remove` operation
- If the user mentions new information, output a `set` operation
{% endif %}
- `description`: 字符串数组,表示关于用户的一组描述
- `existing_metadata`: 现有 metadata 对象,字段固定为:
  - `aliases`
  - `core_facts`
  - `traits`
  - `relations`
  - `goals`
  - `interests`
  - `beliefs_or_stances`
  - `anchors`
  - `events`
{% else %}
The input JSON contains:
- `description`: an array of strings describing the user
- `existing_metadata`: an existing metadata object with these fixed fields:
  - `aliases`
  - `core_facts`
  - `traits`
  - `relations`
  - `goals`
  - `interests`
  - `beliefs_or_stances`
  - `anchors`
  - `events`
{% endif %}

**Field descriptions:**
- profile.role: User's occupation or role (list), e.g. teacher, doctor, software engineer. A person can have multiple roles
- profile.domain: User's domain (list), e.g. education, healthcare, software development. A person can span multiple domains
- profile.expertise: User's skills or tools (list), e.g. Python, counseling, physics
- profile.interests: Topics or domain tags the user actively expressed interest in (list)
Input JSON:

**User alias changes (incremental mode):**
- **aliases_to_add**: Newly discovered user aliases from this conversation, including:
  * User self-introductions: e.g. "I'm John", "My name is XX", "My username is XX"
  * How others address the user: e.g. "My colleagues call me Johnny", "People call me Mike"
  * Only extract names that appear VERBATIM in the text — never infer or fabricate
  * Do NOT extract: names the user gives to the AI, third-party people's own names, placeholder words like "User"/"I"
  * If no new aliases, return empty array `[]`
- **aliases_to_remove**: Aliases the user explicitly denies, including:
  * User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array
  * **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases
  * If no aliases to remove, return empty array `[]`
{% if existing_aliases %}
- Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output)
{% endif %}
{% endif %}

===User Statements===
{% for stmt in statements %}
- {{ stmt }}
{% endfor %}

{% if existing_metadata %}
===Existing User Metadata===
```json
{{ existing_metadata | tojson }}
{{ input_json | default("{}") }}
```
===Field Definitions===
{% if language == "zh" %}

- `aliases`
  - 用户的别名、昵称、称呼、英文名、稳定使用的另一个名字
- `core_facts`
  - 用户相对稳定的基础事实,如身份、年龄、国籍、所在地、关系状态、家庭状态、长期背景
- `traits`
  - 用户相对稳定的人格特质、风格、气质、行为倾向
- `relations`
  - 用户与他人/群体/宠物/重要对象之间值得长期记忆的关系
  - 保持字符串格式,可包含多个片段,常见格式如 `对象 | 关系/身份 | 补充信息`
- `goals`
  - 用户明确、稳定、值得长期保留的人生目标、长期计划、持续追求
- `interests`
  - 用户稳定的兴趣、偏好、长期爱好
- `beliefs_or_stances`
  - 用户稳定的信念、价值立场、政治/宗教/社会议题立场
- `anchors`
  - 对用户有长期意义的物品、收藏、纪念物、象征物
  - 保持字符串格式,可包含多个片段,常见格式如 `对象 | 来源/关联 | 意义`
- `events`
  - 对用户画像有长期价值的个人经历、事件、里程碑
  - 保持字符串格式,可包含多个片段,常见格式如 `事件 | 时间 | 补充说明`
{% else %}
- `aliases`
  - aliases, nicknames, stable alternative names, English names, or regular forms of address
- `core_facts`
  - stable basic facts such as identity, age, nationality, residence, relationship status, family status, or long-term background
- `traits`
  - stable personality traits, style, temperament, or behavioral tendencies
- `relations`
  - durable relationships between the user and people/groups/pets/important entities
  - keep string format; common pattern: `entity | relation/identity | extra info`
- `goals`
  - explicit, stable, long-term goals or ongoing pursuits worth remembering
- `interests`
  - stable interests, preferences, or hobbies
- `beliefs_or_stances`
  - stable beliefs, values, political/religious/social stances
- `anchors`
  - personally meaningful objects, collections, keepsakes, or symbols
  - keep string format; common pattern: `object | source/association | meaning`
- `events`
  - durable personal experiences, milestones, or events worth preserving
  - keep string format; common pattern: `event | time | extra note`
{% endif %}

===Core Principles===
{% if language == "zh" %}

1. 只提取新增内容

   - 如果某条信息已经在 `existing_metadata` 中出现,不能再次输出
   - 即使 `description` 只是换了一种说法表达已有信息,也不要重复输出
   - 如果只是对已有信息做轻微改写、近义改写、语序调整,也视为重复

2. 不修改已有内容

   - 不要纠正已有 metadata 的措辞
   - 不要补全已有 metadata 的结构
   - 不要把已有 metadata 中的短字符串改写成更长版本后再输出
   - 不要因为 `description` 出现了更精确表达,就把已有内容“升级后重新输出”

3. 只保留对用户画像有长期价值的信息

   - 优先提取稳定身份、长期偏好、重要关系、重大目标、长期立场、重要锚点、关键事件
   - 不要提取纯闲聊、瞬时感受、一次性很弱的细节
   - 短暂情绪通常不单独提取,除非它是某个重要事件说明的一部分

4. 所有字段都必须是字符串数组

   - 不允许输出对象数组
   - 不允许输出嵌套结构
   - 不允许把 `events` 拆成 event/time/note 对象
   - 不允许把 `relations` 拆成 subject/relation/object 对象

5. 可以保留多段信息在一个字符串里

   - `relations`、`anchors`、`events` 可以使用 `|` 连接多个片段
   - 只有在确实有助于保留结构时才这样做
   - 不必强行补满固定片段数,宁可简洁准确

6. 证据边界

   - 只能依据 `description` 提取新增 metadata
   - `existing_metadata` 只用于去重和分类参考,不是新增内容来源
   - 不要从常识、推测或世界知识补充额外信息
{% else %}

1. Extract only new content

   - If something already appears in `existing_metadata`, do not output it again
   - If a description merely paraphrases existing metadata, do not output it
   - Minor wording changes, synonym swaps, or reordered phrasing still count as duplicates

2. Do not modify existing content

   - Do not correct wording in existing metadata
   - Do not expand existing metadata and re-output it
   - Do not upgrade an existing item into a more detailed version and emit it as new

3. Keep only durable user-profile information

   - Prioritize stable identity, long-term preferences, important relationships, major goals, durable stances, meaningful anchors, and key events
   - Exclude casual chatter, fleeting states, and weak one-off details
   - Temporary emotions should usually not be extracted unless they are part of an important event description

4. Every field must be an array of strings

   - No object arrays
   - No nested structure
   - Do not split `events` into event/time/note objects
   - Do not split `relations` into structured triples

5. Multi-part strings are allowed

   - `relations`, `anchors`, and `events` may use `|` to join parts
   - Do this only when it helps preserve useful structure
   - Do not force a fixed number of parts

6. Evidence boundary

   - Extract new metadata only from `description`
   - Use `existing_metadata` only for deduplication and category reference
   - Do not add unsupported information from world knowledge or inference beyond the text
{% endif %}

===Deduplication Rules===
{% if language == "zh" %}

- 先理解 `description` 想表达的含义,再与 `existing_metadata` 做语义去重
- 若以下任一情况成立,则视为“已存在”,不要输出:
  - 完全相同
  - 近义表达
  - 更长或更短但语义相同
  - 只是把已有多段字符串拆开或重新组合
  - 只是把已有事件/关系中的时间或备注略作改写
- 去重标准以“是否新增了值得保留的新事实”为准,而不是字面是否完全一致

去重示例:

- 已有 `single`,新描述说 `not in a relationship`,不要输出
- 已有 `from Sweden`,新描述说 `originally from Sweden`,不要输出
- 已有 `art`,新描述说 `likes art a lot`,通常不要输出
- 已有 `Oscar | pet guinea pig`,新描述说 `her guinea pig Oscar`,不要输出
{% else %}
- First understand the meaning of the description, then deduplicate semantically against `existing_metadata`
- Treat an item as already existing if any of these holds:
  - exact match
  - close paraphrase
  - longer or shorter wording with the same meaning
  - just a split or recombination of an existing multi-part string
  - a lightly reworded time/note variant of an existing event or relation
- The test is whether the item adds a genuinely new durable fact, not whether the wording is different
{% endif %}

===Extraction Guidance By Field===
{% if language == "zh" %}
`aliases`

- 只收稳定名字,不收临时调侃
- 职业、身份、评价词不算 alias

`core_facts`

- 放稳定基础事实
- 不要放短暂状态、一次性动作、弱情绪

`traits`

- 只收相对稳定的人格或行为风格
- 不要因为一次行为就推断 trait

`relations`

- 只保留长期关系、有记忆价值的关系
- 可以写成 `对象 | 关系/身份 | 补充信息`
- 不要收纯一次性互动

`goals`

- 只收长期目标
- 不要把一时愿望、泛化口号、普通期待当作 goal

`interests`

- 只收稳定兴趣
- 短期尝试一次某事,通常不算 interest

`beliefs_or_stances`

- 收稳定信念、价值观、政治/宗教/社会议题立场
- 不要收普通瞬时意见

`anchors`

- 收具有象征意义、纪念意义、长期陪伴意义的对象
- 可写来源与意义

`events`

- 只收对用户画像有长期价值的事件或里程碑
- 优先保留时间信息和事件意义
- 普通日常小事通常不收,除非它明显揭示重要关系、目标推进或身份背景
{% else %}
`aliases`

- only stable names, not playful one-off labels
- occupations, identities, and evaluations are not aliases

`core_facts`

- keep stable background facts
- exclude temporary states, one-off actions, and weak emotions

`traits`

- only relatively stable traits or behavioral style
- do not infer a trait from one isolated action

`relations`

- keep durable, memory-worthy relationships
- may use `entity | relation/identity | extra info`
- exclude one-off interactions

`goals`

- only long-term goals
- do not treat temporary wishes or generic aspirations as goals

`interests`

- only stable interests
- a one-time attempt usually does not qualify

`beliefs_or_stances`

- keep stable beliefs, values, or social/political/religious stances
- exclude ordinary fleeting opinions

`anchors`

- keep symbolic, commemorative, or personally meaningful objects
- source and meaning may be included

`events`

- keep only events or milestones with durable profile value
- preserve time and significance when useful
- exclude ordinary daily trivia unless it clearly advances an important goal, relationship, or identity arc
{% endif %}

===Output Hard Constraints===
{% if language == "zh" %}

- 只输出新增 metadata,不要输出完整 metadata
- 结果必须包含全部 9 个字段
- 每个字段都必须是数组
- 即使某字段没有新增内容,也必须输出空数组
- 每个数组元素必须是字符串
- 不要输出 `null`
- 不要输出解释文字
- 不要输出 markdown code fence
- 不要输出字段之外的任何额外键
- 如果没有任何新增 metadata,也必须返回所有字段都为空数组的 JSON
{% else %}
- Output only new metadata, not the full metadata
- The result must include all 9 fields
- Every field must be an array
- Use empty arrays when there is no new content
- Every array element must be a string
- Do not output `null`
- Do not output explanation text
- Do not wrap the result in markdown code fences
- Do not output any keys beyond the required fields
- If there is no new metadata, still return the full JSON shape with empty arrays
{% endif %}
===Examples===
{% if language == "zh" %}
示例 1
Input:

- description:
  - "She recently started volunteering for a trans youth hotline."
- existing_metadata:
  - goals: ["pursue counseling / mental health work for transgender people"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": [
    "started volunteering for a trans youth hotline"
  ]
}

示例 2
Input:

- description:
  - "She is originally from Sweden."
  - "She is not dating anyone right now."
- existing_metadata:
  - core_facts: ["from Sweden", "single"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": []
}

示例 3
Input:

- description:
  - "Her sister Mia encouraged her to apply."
- existing_metadata:
  - relations: ["grandma | grandmother | from Sweden"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [
    "Mia | sister"
  ],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": []
}

示例 4
Input:

- description:
  - "She keeps a journal from her first year after moving."
- existing_metadata:
  - anchors: []

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [
    "journal | from first year after moving"
  ],
  "events": []
}

示例 5
Input:

- description:
  - "Last month she attended a workshop on trauma-informed care and felt it clarified her future direction."
- existing_metadata:
  - goals: ["pursue counseling / mental health work for transgender people"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": [
    "attended workshop on trauma-informed care | last month | clarified future direction"
  ]
}
{% else %}
Example 1
Input:

- description:
  - "She recently started volunteering for a trans youth hotline."
- existing_metadata:
  - goals: ["pursue counseling / mental health work for transgender people"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": [
    "started volunteering for a trans youth hotline"
  ]
}

Example 2
Input:

- description:
  - "She is originally from Sweden."
  - "She is not dating anyone right now."
- existing_metadata:
  - core_facts: ["from Sweden", "single"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": []
}

Example 3
Input:

- description:
  - "Her sister Mia encouraged her to apply."
- existing_metadata:
  - relations: ["grandma | grandmother | from Sweden"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [
    "Mia | sister"
  ],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": []
}

Example 4
Input:

- description:
  - "She keeps a journal from her first year after moving."
- existing_metadata:
  - anchors: []

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [
    "journal | from first year after moving"
  ],
  "events": []
}

Example 5
Input:

- description:
  - "Last month she attended a workshop on trauma-informed care and felt it clarified her future direction."
- existing_metadata:
  - goals: ["pursue counseling / mental health work for transgender people"]

Output:
{
  "aliases": [],
  "core_facts": [],
  "traits": [],
  "relations": [],
  "goals": [],
  "interests": [],
  "beliefs_or_stances": [],
  "anchors": [],
  "events": [
    "attended workshop on trauma-informed care | last month | clarified future direction"
  ]
}
{% endif %}
===Output Format===
Return a JSON object with the following structure:
{% if language == "zh" %}
输出必须是严格可解析的 JSON 对象,结构固定如下:
{% else %}
Return a strict JSON object with this exact structure:
{% endif %}

```json
{
  "metadata_changes": [
    {"field_path": "profile.role", "action": "set", "value": "后端工程师"},
    {"field_path": "profile.expertise", "action": "set", "value": "Python"},
    {"field_path": "profile.expertise", "action": "remove", "value": "Java"}
  ],
  "aliases_to_add": [],
  "aliases_to_remove": [],
  "aliases": ["string"],
  "core_facts": ["string"],
  "traits": ["string"],
  "relations": ["string"],
  "goals": ["string"],
  "interests": ["string"],
  "beliefs_or_stances": ["string"],
  "anchors": ["string"],
  "events": ["string"]
}
```

{{ json_schema }}
{% if language == "zh" %}
JSON 要求:

- 使用标准 ASCII 双引号 `"`
- 不要使用中文引号
- 不要在 JSON 外输出任何文字
- 字符串内如果包含双引号,必须转义为 `\"`
- 不要遗漏字段
- 不要输出尾逗号
{% else %}
JSON requirements:
- Use standard ASCII double quotes `"`
- No smart quotes
- Output JSON only
- Escape internal quotes as `\"`
- Do not omit any field
- Do not emit trailing commas
{% endif %}
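Since downstream code has to parse this output, a minimal validation sketch may help; it assumes the raw model response is available as a string `raw` and that the nine array fields listed above are required (the helper name is illustrative, not part of the diff):

```python
import json

REQUIRED_FIELDS = [
    "aliases", "core_facts", "traits", "relations", "goals",
    "interests", "beliefs_or_stances", "anchors", "events",
]

def parse_metadata_response(raw: str) -> dict:
    """Parse and check the incremental-metadata JSON (illustrative helper)."""
    data = json.loads(raw)  # raises ValueError on malformed JSON
    for field in REQUIRED_FIELDS:
        value = data.get(field)
        # Every field must be present and be an array of strings, never null
        if not isinstance(value, list) or not all(isinstance(v, str) for v in value):
            raise ValueError(f"field {field!r} must be an array of strings")
    return data
```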
@@ -57,10 +57,8 @@ class SensitiveDataFilter:
        (re.compile(r'\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+'), "[TOKEN]"),
        # Partial JWT token match (header and payload only, no signature)
        (re.compile(r'\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]*)?'), "[TOKEN]"),
        # UUID-format tokens or IDs
        (re.compile(r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b', re.IGNORECASE), "[UUID]"),
        # API key format (32+ alphanumeric characters)
        (re.compile(r'\b[A-Za-z0-9]{32,}\b'), "[API_KEY]"),
        # API key format (64+ alphanumeric characters, to avoid masking ordinary business fields)
        (re.compile(r'\b[A-Za-z0-9]{64,}\b'), "[API_KEY]"),
    ]

    # Replacement text
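A quick check of why the threshold moved from 32 to 64: a 40-character alphanumeric business ID matches the old pattern but not the new one. A small sketch using only the two regexes shown above:

```python
import re

OLD_KEY = re.compile(r'\b[A-Za-z0-9]{32,}\b')  # previous threshold: 32+
NEW_KEY = re.compile(r'\b[A-Za-z0-9]{64,}\b')  # new threshold: 64+

business_id = "A" * 40  # e.g. an ordinary 40-char record ID
print(bool(OLD_KEY.search(business_id)))  # True  -> would have been masked as [API_KEY]
print(bool(NEW_KEY.search(business_id)))  # False -> now left intact
```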
@@ -2,8 +2,7 @@ import os
from contextlib import contextmanager
from typing import Generator
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, sessionmaker, declarative_base
from app.core.config import settings

SQLALCHEMY_DATABASE_URL = f"postgresql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
@@ -1,5 +1,9 @@
import os
import subprocess

# Must be applied before importing any module that uses the DashScope SDK
import app.plugins.dashscope_patch  # noqa: F401

from app.repositories.neo4j.create_indexes import create_all_indexes
from contextlib import asynccontextmanager
73 api/app/plugins/dashscope_patch.py Normal file
@@ -0,0 +1,73 @@
"""
DashScope SDK patch: fixes a __getattr__ bug that violates Python's attribute-access protocol.

Background
----------
DictMixin (the base class of all response classes in the DashScope SDK) implements
__getattr__ as:

    def __getattr__(self, attr):
        return self[attr]

When the attribute/key is missing, it raises KeyError. Per the Python data model,
__getattr__ must raise AttributeError; otherwise built-ins such as hasattr() and
getattr(obj, name, default) break.

Actual impact
-------------
When constructing an HTTPError, the requests library calls hasattr(response, "request")
(see requests/exceptions.py:22). When a DashScope response object participates in that
exception path, hasattr crashes with KeyError, masking the real HTTP error
(e.g. 429 rate limiting, timeouts).

The exception then surfaces as KeyError('request'), which is highly misleading and
prevents the project's existing 429 auto-retry logic from catching the real
rate-limit error.

Reference
---------
DashScope SDK issue #114:
https://github.com/dashscope/dashscope-sdk-python/issues/114

Fix
---
Monkey-patch DictMixin.__getattr__ to convert KeyError into AttributeError so it
follows Python semantics. The patch is applied to the base class, so all derived
response types (DashScopeAPIResponse, GenerationResponse,
MultiModalConversationResponse, etc.) benefit at once.

Usage
-----
Import this module at the very top of the application entry points
(main.py / celery_worker.py), before any DashScope call happens:

    import app.plugins.dashscope_patch  # noqa: F401
"""

import logging

logger = logging.getLogger(__name__)

try:
    from dashscope.api_entities.dashscope_response import DictMixin

    # Guard against double application (e.g. when both main and the celery worker import this)
    if not getattr(DictMixin, "_redbear_getattr_patched", False):
        _orig_getattr = DictMixin.__getattr__

        def _safe_getattr(self, attr):
            """__getattr__ with Python semantics: a missing key raises AttributeError, not KeyError."""
            try:
                return _orig_getattr(self, attr)
            except KeyError:
                # `from None` suppresses the KeyError chain, so tracebacks avoid the
                # misleading "During handling of the above exception..." stack
                raise AttributeError(attr) from None

        DictMixin.__getattr__ = _safe_getattr
        DictMixin._redbear_getattr_patched = True  # type: ignore[attr-defined]
        logger.info(
            "DashScope SDK patch applied: DictMixin.__getattr__ raises AttributeError on missing keys"
        )
except ImportError:
    # Skip when the DashScope SDK is not installed; other providers are unaffected
    logger.debug("dashscope not installed, skipping DashScope SDK patch")
except Exception as e:
    # A failed patch must not block application startup
    logger.warning(f"Failed to apply DashScope SDK patch, continuing startup: {e}")
@@ -2,7 +2,7 @@
"""
End-user info repository layer
"""
import uuid
from typing import List, Optional
from typing import Dict, List, Optional
from sqlalchemy.orm import Session

from app.models.end_user_info_model import EndUserInfo
@@ -69,3 +69,110 @@ class EndUserInfoRepository:
        self.db.commit()
        logger.info(f"Deleted all info records for user: end_user_id={end_user_id}, count={count}")
        return count

    def update_aliases_and_metadata(
        self,
        end_user_id: uuid.UUID,
        new_aliases: Optional[List[str]] = None,
        new_metadata: Optional[dict] = None,
    ) -> Optional[EndUserInfo]:
        """Incrementally update the user's alias list and metadata.

        - aliases: merge new_aliases into the existing list (deduplicated, case-insensitive), never overwrite
        - meta_data: merge each field list of new_metadata into the existing meta_data (deduplicated), never overwrite
        - other_name: if currently empty and aliases is non-empty, use aliases[0] as other_name

        Args:
            end_user_id: end-user ID
            new_aliases: aliases newly added in this round
            new_metadata: the extracted_metadata dict produced by this extraction

        Returns:
            The updated EndUserInfo, or None if the record does not exist
        """
        end_user_info = self.get_by_end_user_id(end_user_id)
        if not end_user_info:
            logger.warning(f"[EndUserInfo] record not found, skipping update: end_user_id={end_user_id}")
            return None

        changed = False

        # ── Merge aliases (deduplicated, case-insensitive) ──
        if new_aliases:
            existing = list(end_user_info.aliases or [])
            existing_lower = {a.lower() for a in existing}
            for alias in new_aliases:
                alias = alias.strip()
                if alias and alias.lower() not in existing_lower:
                    existing.append(alias)
                    existing_lower.add(alias.lower())
            end_user_info.aliases = existing
            changed = True

        # ── Sync other_name: use aliases[0] when other_name is currently empty ──
        if end_user_info.aliases and not (end_user_info.other_name or "").strip():
            end_user_info.other_name = end_user_info.aliases[0]
            changed = True

        # ── Merge meta_data (deduplicated append per field list) ──
        if new_metadata:
            existing_meta = dict(end_user_info.meta_data or {})
            for field, values in new_metadata.items():
                if not isinstance(values, list):
                    continue
                existing_list = list(existing_meta.get(field) or [])
                existing_set = {str(v).lower() for v in existing_list}
                for v in values:
                    if str(v).lower() not in existing_set:
                        existing_list.append(v)
                        existing_set.add(str(v).lower())
                existing_meta[field] = existing_list
            end_user_info.meta_data = existing_meta
            changed = True

        if changed:
            self.db.commit()
            self.db.refresh(end_user_info)
            logger.info(
                f"[EndUserInfo] update complete: end_user_id={end_user_id}, "
                f"aliases_count={len(end_user_info.aliases or [])}"
            )
        return end_user_info

    def remove_aliases(
        self,
        end_user_id: uuid.UUID,
        aliases_to_remove: List[str],
    ) -> Optional["EndUserInfo"]:
        """Remove the given aliases from the user's alias list (case-insensitive).

        Args:
            end_user_id: end-user ID
            aliases_to_remove: aliases to remove

        Returns:
            The updated EndUserInfo, or None if the record does not exist
        """
        if not aliases_to_remove:
            return self.get_by_end_user_id(end_user_id)

        end_user_info = self.get_by_end_user_id(end_user_id)
        if not end_user_info:
            logger.warning(f"[EndUserInfo] record not found, skipping alias removal: end_user_id={end_user_id}")
            return None

        remove_lower = {a.strip().lower() for a in aliases_to_remove if a.strip()}
        existing = list(end_user_info.aliases or [])
        new_aliases = [a for a in existing if a.lower() not in remove_lower]

        if len(new_aliases) == len(existing):
            return end_user_info

        end_user_info.aliases = new_aliases
        self.db.commit()
        self.db.refresh(end_user_info)
        logger.info(
            f"[EndUserInfo] alias removal complete: end_user_id={end_user_id}, "
            f"removed={aliases_to_remove}, remaining={new_aliases}"
        )
        return end_user_info
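A minimal usage sketch of these two methods; the session factory, import path, and IDs are illustrative assumptions, not part of the diff:

```python
import uuid

from app.database import SessionLocal  # hypothetical session factory
from app.repositories.end_user_info_repository import EndUserInfoRepository

end_user_id = uuid.UUID("00000000-0000-0000-0000-000000000001")  # placeholder ID

with SessionLocal() as db:
    repo = EndUserInfoRepository(db)
    # Merge in newly extracted aliases and metadata (append-only, deduplicated)
    repo.update_aliases_and_metadata(
        end_user_id,
        new_aliases=["Johnny"],
        new_metadata={"interests": ["art"]},
    )
    # Drop an alias the user explicitly denied
    repo.remove_aliases(end_user_id, ["Mike"])
```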
@@ -37,7 +37,6 @@ async def add_chunk_statement_edges(chunks: List[Chunk], connector: Neo4jConnect
                "apply_id": getattr(stmt, 'apply_id', None),
                "run_id": getattr(stmt, 'run_id', None) or getattr(chunk, 'run_id', None),
                "created_at": getattr(stmt, 'created_at', None),
                "expired_at": getattr(stmt, 'expired_at', None),
                # "created_at": getattr(statement, 'created_at', None),
                # "expired_at": None  # Set to None or appropriate default
            }
@@ -87,7 +86,6 @@ async def add_memory_summary_statement_edges(summaries: List[MemorySummaryNode],
                "end_user_id": s.end_user_id,
                "run_id": s.run_id,
                "created_at": s.created_at.isoformat() if s.created_at else None,
                "expired_at": s.expired_at.isoformat() if s.expired_at else None,
            })

    if not edges:

@@ -42,7 +42,6 @@ async def add_dialogue_nodes(dialogues: List[DialogueNode], connector: Neo4jConn
            "ref_id": dialogue.ref_id,
            "name": dialogue.name,
            "created_at": dialogue.created_at.isoformat() if dialogue.created_at else None,
            "expired_at": dialogue.expired_at.isoformat() if dialogue.expired_at else None,
            "content": dialogue.content,
            "dialog_embedding": dialogue.dialog_embedding
        })
@@ -87,7 +86,6 @@ async def add_statement_nodes(statements: List[StatementNode], connector: Neo4jC
            "chunk_id": statement.chunk_id,
            # "created_at": statement.created_at.isoformat(),
            "created_at": statement.created_at.isoformat() if statement.created_at else None,
            "expired_at": statement.expired_at.isoformat() if statement.expired_at else None,
            "stmt_type": statement.stmt_type,
            "temporal_info": statement.temporal_info.value,
            "statement": statement.statement,
@@ -115,7 +113,8 @@ async def add_statement_nodes(statements: List[StatementNode], connector: Neo4jC
            "activation_value": statement.activation_value,
            "access_history": statement.access_history if statement.access_history else [],
            "last_access_time": statement.last_access_time,
            "access_count": statement.access_count
            "access_count": statement.access_count,
            "dialog_at": statement.dialog_at.isoformat() if statement.dialog_at else None,
        }
        flattened_statements.append(flattened_statement)

@@ -159,7 +158,6 @@ async def add_chunk_nodes(chunks: List[ChunkNode], connector: Neo4jConnector) ->
            "end_user_id": chunk.end_user_id,
            "run_id": chunk.run_id,
            "created_at": chunk.created_at.isoformat() if chunk.created_at else None,
            "expired_at": chunk.expired_at.isoformat() if chunk.expired_at else None,
            "dialog_id": chunk.dialog_id,
            "content": chunk.content,
            "chunk_embedding": chunk.chunk_embedding if chunk.chunk_embedding else None,
@@ -211,7 +209,6 @@ async def add_memory_summary_nodes(
            "end_user_id": s.end_user_id,
            "run_id": s.run_id,
            "created_at": s.created_at.isoformat() if s.created_at else None,
            "expired_at": s.expired_at.isoformat() if s.expired_at else None,
            "dialog_id": s.dialog_id,
            "chunk_ids": s.chunk_ids,
            "content": s.content,

@@ -17,10 +17,9 @@ async def create_fulltext_indexes():
        # CREATE FULLTEXT INDEX dialoguesFulltext IF NOT EXISTS FOR (d:Dialogue) ON EACH [d.content]
        # OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
        # """)
        # Create Entities index
        # Create Entities index (name + description + aliases)
        await connector.execute_query("""
            CREATE FULLTEXT INDEX entitiesFulltext IF NOT EXISTS
            FOR (e:ExtractedEntity) ON EACH [e.name, e.description, e.aliases]
            CREATE FULLTEXT INDEX entitiesFulltext IF NOT EXISTS FOR (e:ExtractedEntity) ON EACH [e.name, e.description, e.aliases]
            OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
        """)

@@ -47,6 +46,12 @@ async def create_fulltext_indexes():
            OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
        """)

        # Create the AssistantPruned pruned-text fulltext index
        await connector.execute_query("""
            CREATE FULLTEXT INDEX assistantPrunedFulltext IF NOT EXISTS FOR (p:AssistantPruned) ON EACH [p.text]
            OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
        """)

    finally:
        await connector.close()

@@ -136,6 +141,17 @@ async def create_vector_indexes():
                `vector.similarity_function`: 'cosine'
            }}
        """)

        # AssistantPruned text embedding index (optional, for semantic search on pruned hints)
        await connector.execute_query("""
            CREATE VECTOR INDEX assistant_pruned_embedding_index IF NOT EXISTS
            FOR (p:AssistantPruned)
            ON p.text_embedding
            OPTIONS {indexConfig: {
                `vector.dimensions`: 1024,
                `vector.similarity_function`: 'cosine'
            }}
        """)
    finally:
        await connector.close()

@@ -180,6 +196,22 @@ async def create_unique_constraints():
            """
        )

        # AssistantOriginal.id unique
        await connector.execute_query(
            """
            CREATE CONSTRAINT assistant_original_id_unique IF NOT EXISTS
            FOR (o:AssistantOriginal) REQUIRE o.id IS UNIQUE
            """
        )

        # AssistantPruned.id unique
        await connector.execute_query(
            """
            CREATE CONSTRAINT assistant_pruned_id_unique IF NOT EXISTS
            FOR (p:AssistantPruned) REQUIRE p.id IS UNIQUE
            """
        )

    finally:
        await connector.close()


@@ -8,7 +8,6 @@ DIALOGUE_NODE_SAVE = """
    n.run_id = dialogue.run_id,
    n.ref_id = dialogue.ref_id,
    n.created_at = dialogue.created_at,
    n.expired_at = dialogue.expired_at,
    n.content = dialogue.content,
    n.dialog_embedding = dialogue.dialog_embedding
RETURN n.id AS uuid
@@ -32,20 +31,29 @@ SET s += {
    emotion_keywords: statement.emotion_keywords,
    temporal_info: statement.temporal_info,
    created_at: statement.created_at,
    expired_at: statement.expired_at,
    valid_at: statement.valid_at,
    invalid_at: statement.invalid_at,
    valid_at: coalesce(statement.valid_at, ""),
    invalid_at: coalesce(statement.invalid_at, ""),
    statement_embedding: statement.statement_embedding,
    relevence_info: statement.relevence_info,
    importance_score: statement.importance_score,
    activation_value: statement.activation_value,
    access_history: statement.access_history,
    last_access_time: statement.last_access_time,
    access_count: statement.access_count
    access_count: statement.access_count,
    dialog_at: statement.dialog_at
}
RETURN s.id AS uuid
"""

STATEMENT_EMOTION_UPDATE = """
UNWIND $items AS item
MATCH (s:Statement {id: item.statement_id})
SET s.emotion_type = item.emotion_type,
    s.emotion_intensity = item.emotion_intensity,
    s.emotion_keywords = item.emotion_keywords
RETURN s.id AS uuid
"""
CHUNK_NODE_SAVE = """
UNWIND $chunks AS chunk
MERGE (c:Chunk {id: chunk.id})
@@ -55,7 +63,6 @@ SET c += {
    end_user_id: chunk.end_user_id,
    run_id: chunk.run_id,
    created_at: chunk.created_at,
    expired_at: chunk.expired_at,
    dialog_id: chunk.dialog_id,
    content: chunk.content,
    speaker: chunk.speaker,
@@ -78,11 +85,9 @@ SET e.name = CASE WHEN entity.name IS NOT NULL AND entity.name <> '' THEN entity
    e.created_at = CASE
        WHEN entity.created_at IS NOT NULL AND (e.created_at IS NULL OR entity.created_at < e.created_at)
        THEN entity.created_at ELSE e.created_at END,
    e.expired_at = CASE
        WHEN entity.expired_at IS NOT NULL AND (e.expired_at IS NULL OR entity.expired_at > e.expired_at)
        THEN entity.expired_at ELSE e.expired_at END,
    e.entity_idx = CASE WHEN e.entity_idx IS NULL OR e.entity_idx = 0 THEN entity.entity_idx ELSE e.entity_idx END,
    e.entity_type = CASE WHEN entity.entity_type IS NOT NULL AND entity.entity_type <> '' THEN entity.entity_type ELSE e.entity_type END,
    e.type_description = CASE WHEN entity.type_description IS NOT NULL AND entity.type_description <> '' THEN entity.type_description ELSE coalesce(e.type_description, '') END,
    e.description = CASE
        WHEN entity.description IS NOT NULL AND entity.description <> ''
        AND (e.description IS NULL OR size(e.description) = 0 OR size(entity.description) > size(e.description))
@@ -129,6 +134,65 @@ SET e.name = CASE WHEN entity.name IS NOT NULL AND entity.name <> '' THEN entity
RETURN e.id AS uuid
"""

# ── Incremental metadata write-back: append LLM-extracted metadata to the user's entity node ──
ENTITY_METADATA_UPDATE = """
MATCH (e:ExtractedEntity {id: $entity_id})
SET e.core_facts = CASE
        WHEN $core_facts IS NOT NULL AND size($core_facts) > 0
        THEN reduce(acc = coalesce(e.core_facts, []), item IN $core_facts |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.core_facts, []) END,
    e.traits = CASE
        WHEN $traits IS NOT NULL AND size($traits) > 0
        THEN reduce(acc = coalesce(e.traits, []), item IN $traits |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.traits, []) END,
    e.relations = CASE
        WHEN $relations IS NOT NULL AND size($relations) > 0
        THEN reduce(acc = coalesce(e.relations, []), item IN $relations |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.relations, []) END,
    e.goals = CASE
        WHEN $goals IS NOT NULL AND size($goals) > 0
        THEN reduce(acc = coalesce(e.goals, []), item IN $goals |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.goals, []) END,
    e.interests = CASE
        WHEN $interests IS NOT NULL AND size($interests) > 0
        THEN reduce(acc = coalesce(e.interests, []), item IN $interests |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.interests, []) END,
    e.beliefs_or_stances = CASE
        WHEN $beliefs_or_stances IS NOT NULL AND size($beliefs_or_stances) > 0
        THEN reduce(acc = coalesce(e.beliefs_or_stances, []), item IN $beliefs_or_stances |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.beliefs_or_stances, []) END,
    e.anchors = CASE
        WHEN $anchors IS NOT NULL AND size($anchors) > 0
        THEN reduce(acc = coalesce(e.anchors, []), item IN $anchors |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.anchors, []) END,
    e.events = CASE
        WHEN $events IS NOT NULL AND size($events) > 0
        THEN reduce(acc = coalesce(e.events, []), item IN $events |
            CASE WHEN item IN acc THEN acc ELSE acc + item END)
        ELSE coalesce(e.events, []) END
RETURN e.id AS uuid
"""

# ── Query the user entity's existing metadata (for dedup during incremental extraction) ──
ENTITY_METADATA_QUERY = """
MATCH (e:ExtractedEntity {id: $entity_id})
RETURN e.core_facts AS core_facts,
       e.traits AS traits,
       e.relations AS relations,
       e.goals AS goals,
       e.interests AS interests,
       e.beliefs_or_stances AS beliefs_or_stances,
       e.anchors AS anchors,
       e.events AS events
"""
# Add back ENTITY_RELATIONSHIP_SAVE to be used by graph_saver.save_entities_and_relationships
ENTITY_RELATIONSHIP_SAVE = """
UNWIND $relationships AS rel
@@ -138,18 +202,68 @@ MATCH (object:ExtractedEntity {id: rel.target_id, end_user_id: rel.end_user_id})
// Avoid duplicate edges across runs for the same endpoints
MERGE (subject)-[r:EXTRACTED_RELATIONSHIP]->(object)
SET r.predicate = rel.predicate,
    r.predicate_description = rel.predicate_description,
    r.statement_id = rel.statement_id,
    r.value = rel.value,
    r.statement = rel.statement,
    r.valid_at = rel.valid_at,
    r.invalid_at = rel.invalid_at,
    r.valid_at = coalesce(rel.valid_at, ""),
    r.invalid_at = coalesce(rel.invalid_at, ""),
    r.created_at = rel.created_at,
    r.expired_at = rel.expired_at,
    r.run_id = rel.run_id,
    r.end_user_id = rel.end_user_id
RETURN elementId(r) AS uuid
"""

# In Neo4j 5 and later, the id() function is deprecated; use elementId() instead

# Save weak-relation entities with e.is_weak = true; the e.relations aggregate field is no longer maintained
WEAK_ENTITY_NODE_SAVE = """
UNWIND $weak_entities AS entity
MERGE (e:ExtractedEntity {id: entity.id, run_id: entity.run_id})
SET e += {
    name: entity.name,
    end_user_id: entity.end_user_id,
    run_id: entity.run_id,
    description: entity.description,
    chunk_id: entity.chunk_id,
    dialog_id: entity.dialog_id
}
// Independent weak flag: marks weak relations only; no relations aggregate field is maintained
SET e.is_weak = true
RETURN e.id AS id
"""

# Create/update entity nodes for the subject and object of strong-relation triples; only set e.is_strong = true, without maintaining e.relations
SAVE_STRONG_TRIPLE_ENTITIES = """
UNWIND $items AS item
MERGE (s:ExtractedEntity {id: item.source_id, run_id: item.run_id})
SET s += {name: item.subject, end_user_id: item.end_user_id, run_id: item.run_id}
// Independent strong flag
SET s.is_strong = true
MERGE (o:ExtractedEntity {id: item.target_id, run_id: item.run_id})
SET o += {name: item.object, end_user_id: item.end_user_id, run_id: item.run_id}
// Independent strong flag
SET o.is_strong = true
"""


DIALOGUE_STATEMENT_EDGE_SAVE = """
UNWIND $dialogue_statement_edges AS edge
// Match the Dialogue by uuid or ref_id, so links survive inconsistent source IDs
MATCH (dialogue:Dialogue)
WHERE dialogue.uuid = edge.source OR dialogue.ref_id = edge.source
MATCH (statement:Statement {id: edge.target})
// Deduplicate by endpoints only; relationship properties remain updatable
MERGE (dialogue)-[e:MENTIONS]->(statement)
SET e.uuid = edge.id,
    e.end_user_id = edge.end_user_id,
    e.created_at = edge.created_at
RETURN e.uuid AS uuid
"""

# In Neo4j 5 and later, the id() function is deprecated; use elementId() instead


CHUNK_STATEMENT_EDGE_SAVE = """
UNWIND $chunk_statement_edges AS edge
MATCH (statement:Statement {id: edge.source, run_id: edge.run_id})
@@ -157,8 +271,7 @@ CHUNK_STATEMENT_EDGE_SAVE = """
MERGE (chunk)-[e:CONTAINS {id: edge.id}]->(statement)
SET e.end_user_id = edge.end_user_id,
    e.run_id = edge.run_id,
    e.created_at = edge.created_at,
    e.expired_at = edge.expired_at
    e.created_at = edge.created_at
RETURN e.id AS uuid
"""

@@ -173,11 +286,89 @@ MERGE (statement)-[r:REFERENCES_ENTITY]->(entity)
SET r.end_user_id = rel.end_user_id,
    r.run_id = rel.run_id,
    r.created_at = rel.created_at,
    r.expired_at = rel.expired_at,
    r.connect_strength = rel.connect_strength
RETURN elementId(r) AS uuid
"""

ENTITY_EMBEDDING_SEARCH = """
CALL db.index.vector.queryNodes('entity_embedding_index', $limit * 100, $embedding)
YIELD node AS e, score
WHERE e.name_embedding IS NOT NULL
  AND ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
RETURN e.id AS id,
       e.name AS name,
       e.end_user_id AS end_user_id,
       e.entity_type AS entity_type,
       COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value,
       COALESCE(e.importance_score, 0.5) AS importance_score,
       e.last_access_time AS last_access_time,
       COALESCE(e.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""
# Embedding-based search: cosine similarity on Statement.statement_embedding
STATEMENT_EMBEDDING_SEARCH = """
CALL db.index.vector.queryNodes('statement_embedding_index', $limit * 100, $embedding)
YIELD node AS s, score
WHERE s.statement_embedding IS NOT NULL
  AND ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
RETURN s.id AS id,
       s.statement AS statement,
       s.end_user_id AS end_user_id,
       s.chunk_id AS chunk_id,
       s.created_at AS created_at,
       s.valid_at AS valid_at,
       s.invalid_at AS invalid_at,
       COALESCE(s.activation_value, s.importance_score, 0.5) AS activation_value,
       COALESCE(s.importance_score, 0.5) AS importance_score,
       s.last_access_time AS last_access_time,
       COALESCE(s.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""

# Embedding-based search: cosine similarity on Chunk.chunk_embedding
CHUNK_EMBEDDING_SEARCH = """
CALL db.index.vector.queryNodes('chunk_embedding_index', $limit * 100, $embedding)
YIELD node AS c, score
WHERE c.chunk_embedding IS NOT NULL
  AND ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
RETURN c.id AS chunk_id,
       c.end_user_id AS end_user_id,
       c.content AS content,
       c.dialog_id AS dialog_id,
       COALESCE(c.activation_value, 0.5) AS activation_value,
       c.last_access_time AS last_access_time,
       COALESCE(c.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""
SEARCH_STATEMENTS_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score
WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
RETURN s.id AS id,
       s.statement AS statement,
       s.end_user_id AS end_user_id,
       s.chunk_id AS chunk_id,
       s.created_at AS created_at,
       s.valid_at AS valid_at,
       s.invalid_at AS invalid_at,
       c.id AS chunk_id_from_rel,
       collect(DISTINCT e.id) AS entity_ids,
       COALESCE(s.activation_value, s.importance_score, 0.5) AS activation_value,
       COALESCE(s.importance_score, 0.5) AS importance_score,
       s.last_access_time AS last_access_time,
       COALESCE(s.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""
# Find entities whose name contains the given string
SEARCH_ENTITIES_BY_NAME = """
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
@@ -189,7 +380,6 @@ RETURN e.id AS id,
       e.end_user_id AS end_user_id,
       e.entity_type AS entity_type,
       e.created_at AS created_at,
       e.expired_at AS expired_at,
       e.entity_idx AS entity_idx,
       e.statement_id AS statement_id,
       e.description AS description,
@@ -209,6 +399,72 @@ ORDER BY score DESC
LIMIT $limit
"""

SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
WITH e, score
WITH collect({entity: e, score: score}) AS fulltextResults

OPTIONAL MATCH (ae:ExtractedEntity)
WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id)
  AND ae.aliases IS NOT NULL
  AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query))
WITH fulltextResults, collect(ae) AS aliasEntities

UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score:
    CASE
        WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0
        WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9
        ELSE 0.8
    END
}]) AS row
WITH row.entity AS e, row.score AS score
WITH DISTINCT e, MAX(score) AS score
OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
RETURN e.id AS id,
       e.name AS name,
       e.end_user_id AS end_user_id,
       e.entity_type AS entity_type,
       e.created_at AS created_at,
       e.entity_idx AS entity_idx,
       e.statement_id AS statement_id,
       e.description AS description,
       e.aliases AS aliases,
       e.name_embedding AS name_embedding,
       e.connect_strength AS connect_strength,
       collect(DISTINCT s.id) AS statement_ids,
       collect(DISTINCT c.id) AS chunk_ids,
       COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value,
       COALESCE(e.importance_score, 0.5) AS importance_score,
       e.last_access_time AS last_access_time,
       COALESCE(e.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""


SEARCH_CHUNKS_BY_CONTENT = """
CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
RETURN c.id AS chunk_id,
       c.end_user_id AS end_user_id,
       c.content AS content,
       c.dialog_id AS dialog_id,
       c.sequence_number AS sequence_number,
       collect(DISTINCT s.id) AS statement_ids,
       collect(DISTINCT e.id) AS entity_ids,
       COALESCE(c.activation_value, 0.5) AS activation_value,
       c.last_access_time AS last_access_time,
       COALESCE(c.access_count, 0) AS access_count,
       score
ORDER BY score DESC
LIMIT $limit
"""

# The queries below supported second-layer dedup/disambiguation lookups against the database; the current plan no longer uses them

# # Within the same group_id, look up by exact name or alias (plus optional matching type)
@@ -262,8 +518,7 @@ WHERE ($end_user_id IS NULL OR d.end_user_id = $end_user_id)
RETURN d.id AS dialog_id,
       d.end_user_id AS end_user_id,
       d.content AS content,
       d.created_at AS created_at,
       d.expired_at AS expired_at
       d.created_at AS created_at
ORDER BY d.created_at DESC
LIMIT $limit
"""
@@ -277,7 +532,6 @@ RETURN c.id AS chunk_id,
       c.content AS content,
       c.dialog_id AS dialog_id,
       c.created_at AS created_at,
       c.expired_at AS expired_at,
       c.sequence_number AS sequence_number
ORDER BY c.created_at DESC
LIMIT $limit
@@ -490,7 +744,6 @@ SET m += {
    end_user_id: summary.end_user_id,
    run_id: summary.run_id,
    created_at: summary.created_at,
    expired_at: summary.expired_at,
    dialog_id: summary.dialog_id,
    chunk_ids: summary.chunk_ids,
    content: summary.content,
@@ -514,8 +767,7 @@ MATCH (c)-[:CONTAINS]->(s:Statement {run_id: e.run_id})
MERGE (ms)-[r:DERIVED_FROM_STATEMENT]->(s)
SET r.end_user_id = e.end_user_id,
    r.run_id = e.run_id,
    r.created_at = e.created_at,
    r.expired_at = e.expired_at
    r.created_at = e.created_at
RETURN elementId(r) AS uuid
"""

@@ -544,8 +796,7 @@ FOREACH (rel IN CASE WHEN r IS NOT NULL THEN [r] ELSE [] END |
        user_id: rel.user_id,
        apply_id: rel.apply_id,
        run_id: rel.run_id,
        created_at: rel.created_at,
        expired_at: rel.expired_at
        created_at: rel.created_at
    }]->(target)
)

@@ -566,8 +817,7 @@ FOREACH (rel IN CASE WHEN r IS NOT NULL THEN [r] ELSE [] END |
        user_id: rel.user_id,
        apply_id: rel.apply_id,
        run_id: rel.run_id,
        created_at: rel.created_at,
        expired_at: rel.expired_at
        created_at: rel.created_at
    }]->(canonical)
)

@@ -608,7 +858,6 @@ neo4j_query_part = """
    m.description as description,
    m.statement_id as statement_id,
    m.created_at as created_at,
    m.expired_at as expired_at,
    CASE WHEN rel IS NULL THEN "NO_RELATIONSHIP" ELSE type(rel) END as relationship_type,
    elementId(rel) as rel_id,
    rel.predicate as predicate,
@@ -628,7 +877,6 @@ neo4j_query_all = """
    m.description as description,
    m.statement_id as statement_id,
    m.created_at as created_at,
    m.expired_at as expired_at,
    CASE WHEN rel IS NULL THEN "NO_RELATIONSHIP" ELSE type(rel) END as relationship_type,
    elementId(rel) as rel_id,
    rel.predicate as predicate,
@@ -1096,6 +1344,111 @@ RETURN (
) AS is_complete
"""

# Alias merge: for EXTRACTED_RELATIONSHIP edges with predicate = '别名属于', fold source.name
# into target.aliases (deduplicated) and append source.description to target.description (semicolon-separated)
MERGE_ALIAS_BELONGS_TO = """
MATCH (source:ExtractedEntity {end_user_id: $end_user_id})-[r:EXTRACTED_RELATIONSHIP]->(target:ExtractedEntity {end_user_id: $end_user_id})
WHERE r.predicate = '别名属于'
WITH source, target,
     coalesce(target.aliases, []) AS existing_aliases,
     source.name AS source_name,
     coalesce(source.description, '') AS src_desc,
     coalesce(target.description, '') AS tgt_desc

// 1. Merge aliases: append source.name to target.aliases (deduplicated)
WITH source, target, src_desc, tgt_desc,
     CASE
         WHEN source_name IS NOT NULL AND source_name <> '' AND NOT source_name IN existing_aliases
         THEN existing_aliases + source_name
         ELSE existing_aliases
     END AS new_aliases

SET target.aliases = new_aliases,
    target.description = CASE
        WHEN src_desc <> '' AND NOT src_desc IN tgt_desc
        THEN CASE WHEN tgt_desc = '' THEN src_desc ELSE tgt_desc + ';' + src_desc END
        ELSE tgt_desc
    END

RETURN source.name AS merged_alias, target.name AS target_name, new_aliases AS updated_aliases
"""

# Edge redirection: redirect every other edge pointing at an alias node (the source of a '别名属于' relationship)
# to the user node (the target). Two kinds of edges are handled:
# 1. EXTRACTED_RELATIONSHIP: other entity → alias node, or alias node → other entity
# 2. STATEMENT_ENTITY: statement → alias node
# For each edge to redirect, create a new edge to the user node (copying all properties), then delete the old edge.
REDIRECT_ALIAS_EDGES = """
// Find every alias→user mapping (covering both the '别名属于' and '别名失效' predicates)
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar:EXTRACTED_RELATIONSHIP]->(user:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar.predicate IN ['别名属于', '别名失效']
WITH collect({alias_id: elementId(alias), user_id: elementId(user), alias_eid: alias.id, user_eid: user.id}) AS mappings

// 1. Redirect EXTRACTED_RELATIONSHIP edges where the alias node is the target
UNWIND mappings AS m
MATCH (other)-[r:EXTRACTED_RELATIONSHIP]->(alias:ExtractedEntity {end_user_id: $end_user_id})
WHERE alias.id = m.alias_eid
  AND NOT (r.predicate IN ['别名属于', '别名失效'])
  AND other.id <> m.user_eid
WITH m, other, r, alias
MATCH (user:ExtractedEntity {id: m.user_eid, end_user_id: $end_user_id})
CREATE (other)-[nr:EXTRACTED_RELATIONSHIP]->(user)
SET nr = properties(r)
DELETE r
WITH count(*) AS redirected_incoming

// 2. Redirect EXTRACTED_RELATIONSHIP edges where the alias node is the source
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar2:EXTRACTED_RELATIONSHIP]->(user2:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar2.predicate IN ['别名属于', '别名失效']
WITH alias, user2, redirected_incoming
MATCH (alias)-[r:EXTRACTED_RELATIONSHIP]->(other)
WHERE NOT (r.predicate IN ['别名属于', '别名失效'])
  AND other.id <> user2.id
WITH user2, other, r, redirected_incoming
CREATE (user2)-[nr:EXTRACTED_RELATIONSHIP]->(other)
SET nr = properties(r)
DELETE r
WITH redirected_incoming, count(*) AS redirected_outgoing

// 3. Redirect STATEMENT_ENTITY edges: statement → alias node
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar3:EXTRACTED_RELATIONSHIP]->(user3:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar3.predicate IN ['别名属于', '别名失效']
WITH alias, user3, redirected_incoming, redirected_outgoing
MATCH (stmt)-[r:STATEMENT_ENTITY]->(alias)
WITH user3, stmt, r, redirected_incoming, redirected_outgoing
CREATE (stmt)-[nr:STATEMENT_ENTITY]->(user3)
SET nr = properties(r)
DELETE r

RETURN redirected_incoming, redirected_outgoing, count(*) AS redirected_stmt
"""

# Delete alias nodes: after alias merging and edge redirection, delete every source node of a predicate='别名属于' relationship.
# By this point the nodes' other edges have already been redirected by REDIRECT_ALIAS_EDGES;
# the only remaining edge is (alias)-[:EXTRACTED_RELATIONSHIP {predicate:'别名属于'}]->(user),
# so DETACH DELETE removes the node and that relationship together.
DELETE_ALIAS_NODES = """
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[r:EXTRACTED_RELATIONSHIP]->(user:ExtractedEntity {end_user_id: $end_user_id})
WHERE r.predicate IN ['别名属于', '别名失效']
WITH alias, count(r) AS rel_count
DETACH DELETE alias
RETURN count(alias) AS deleted_count
"""

# Invalidated-alias handling: remove the source.name of predicate='别名失效' edges from target.aliases.
# Runs after MERGE_ALIAS_BELONGS_TO (which appends new aliases) and before DELETE_ALIAS_NODES (which deletes the nodes).
REMOVE_INVALID_ALIASES = """
MATCH (source:ExtractedEntity {end_user_id: $end_user_id})-[r:EXTRACTED_RELATIONSHIP]->(target:ExtractedEntity {end_user_id: $end_user_id})
WHERE r.predicate = '别名失效'
WITH source, target,
     coalesce(target.aliases, []) AS existing_aliases,
     source.name AS invalid_name

SET target.aliases = [a IN existing_aliases WHERE toLower(a) <> toLower(invalid_name)]

RETURN source.name AS removed_alias, target.name AS target_name
"""
|
||||
|
||||
CHECK_COMMUNITY_IS_COMPLETE_WITH_EMBEDDING = """
|
||||
MATCH (c:Community {community_id: $community_id, end_user_id: $end_user_id})
|
||||
RETURN (
|
||||
@@ -1352,154 +1705,58 @@ ORDER BY score DESC
|
||||
LIMIT $limit
|
||||
"""
|
||||
|
||||
SEARCH_STATEMENTS_BY_KEYWORD = """
|
||||
CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score
|
||||
WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
|
||||
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
|
||||
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
|
||||
RETURN s.id AS id,
|
||||
s.statement AS statement,
|
||||
s.end_user_id AS end_user_id,
|
||||
s.chunk_id AS chunk_id,
|
||||
s.created_at AS created_at,
|
||||
s.expired_at AS expired_at,
|
||||
s.valid_at AS valid_at,
|
||||
properties(s)['invalid_at'] AS invalid_at,
|
||||
c.id AS chunk_id_from_rel,
|
||||
collect(DISTINCT e.id) AS entity_ids,
|
||||
COALESCE(s.activation_value, s.importance_score, 0.5) AS activation_value,
|
||||
COALESCE(s.importance_score, 0.5) AS importance_score,
|
||||
s.last_access_time AS last_access_time,
|
||||
COALESCE(s.access_count, 0) AS access_count,
|
||||
score
|
||||
ORDER BY score DESC
|
||||
LIMIT $limit
|
||||
"""
|
||||
|
||||
SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
WITH e, score
WITH collect({entity: e, score: score}) AS fulltextResults

OPTIONAL MATCH (ae:ExtractedEntity)
WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id)
AND ae.aliases IS NOT NULL
AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query))
WITH fulltextResults, collect(ae) AS aliasEntities

UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score:
CASE
WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0
WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9
ELSE 0.8
END
}]) AS row
WITH row.entity AS e, row.score AS score
WITH e, max(score) AS score
OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
RETURN e.id AS id,
e.name AS name,
e.end_user_id AS end_user_id,
e.entity_type AS entity_type,
e.created_at AS created_at,
e.expired_at AS expired_at,
e.entity_idx AS entity_idx,
e.statement_id AS statement_id,
e.description AS description,
e.aliases AS aliases,
e.name_embedding AS name_embedding,
e.connect_strength AS connect_strength,
collect(DISTINCT s.id) AS statement_ids,
collect(DISTINCT c.id) AS chunk_ids,
COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value,
COALESCE(e.importance_score, 0.5) AS importance_score,
e.last_access_time AS last_access_time,
COALESCE(e.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""

SEARCH_CHUNKS_BY_CONTENT = """
CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
RETURN c.id AS id,
c.end_user_id AS end_user_id,
c.content AS content,
c.dialog_id AS dialog_id,
c.sequence_number AS sequence_number,
collect(DISTINCT s.id) AS statement_ids,
collect(DISTINCT e.id) AS entity_ids,
COALESCE(c.activation_value, 0.5) AS activation_value,
c.last_access_time AS last_access_time,
COALESCE(c.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""

# MemorySummary keyword search using fulltext index
SEARCH_MEMORY_SUMMARIES_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("summariesFulltext", $query) YIELD node AS m, score
WHERE ($end_user_id IS NULL OR m.end_user_id = $end_user_id)
OPTIONAL MATCH (m)-[:DERIVED_FROM_STATEMENT]->(s:Statement)
RETURN m.id AS id,
m.name AS name,
m.end_user_id AS end_user_id,
m.dialog_id AS dialog_id,
m.chunk_ids AS chunk_ids,
m.content AS content,
m.created_at AS created_at,
COALESCE(m.activation_value, m.importance_score, 0.5) AS activation_value,
COALESCE(m.importance_score, 0.5) AS importance_score,
m.last_access_time AS last_access_time,
COALESCE(m.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""

# Community keyword search: matches name or summary via fulltext index
SEARCH_COMMUNITIES_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("communitiesFulltext", $query) YIELD node AS c, score
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
RETURN c.community_id AS id,
c.name AS name,
c.summary AS content,
c.core_entities AS core_entities,
c.member_count AS member_count,
c.end_user_id AS end_user_id,
c.updated_at AS updated_at,
score
ORDER BY score DESC
LIMIT $limit
"""

FULLTEXT_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_KEYWORD,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_NAME_OR_ALIAS,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_CONTENT,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_KEYWORD,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_KEYWORD,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUALS_BY_KEYWORD
}

USER_ID_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_USER_ID,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_USER_ID,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_USER_ID,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_USER_ID,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_USER_ID,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_USER_ID
}

NODE_ID_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_IDS,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_IDS,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_IDS,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_IDS,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_IDS,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_IDS
}

# ── Assistant Pruning Nodes & Edges ──

ASSISTANT_ORIGINAL_NODE_SAVE = """
UNWIND $originals AS orig
MERGE (o:AssistantOriginal {id: orig.id})
SET o += {
end_user_id: orig.end_user_id,
run_id: orig.run_id,
dialog_id: orig.dialog_id,
pair_id: orig.pair_id,
text: orig.text,
created_at: orig.created_at
}
RETURN o.id AS uuid
"""

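The three mappings above let callers pick a Cypher query by node type and search mode. A minimal dispatch sketch; the connection settings are assumptions, and only the mapping constants and Neo4jNodeType come from this file:

from neo4j import AsyncGraphDatabase

async def fulltext_search(node_type, query, end_user_id, limit=10):
    # Assumed connection settings; the real code presumably goes through Neo4jConnector.
    driver = AsyncGraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    # Resolve the Cypher text for this node type, then run it with the shared parameters.
    cypher = FULLTEXT_QUERY_CYPHER_MAPPING[node_type]
    async with driver.session() as session:
        result = await session.run(cypher, query=query, end_user_id=end_user_id, limit=limit)
        records = [record.data() async for record in result]
    await driver.close()
    return records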
ASSISTANT_PRUNED_NODE_SAVE = """
UNWIND $pruneds AS p
MERGE (pr:AssistantPruned {id: p.id})
SET pr += {
end_user_id: p.end_user_id,
run_id: p.run_id,
dialog_id: p.dialog_id,
pair_id: p.pair_id,
text: p.text,
memory_type: p.memory_type,
text_embedding: p.text_embedding,
created_at: p.created_at
}
RETURN pr.id AS uuid
"""

ASSISTANT_PRUNED_EDGE_SAVE = """
UNWIND $edges AS edge
MATCH (o:AssistantOriginal {id: edge.source})
MATCH (p:AssistantPruned {id: edge.target})
MERGE (o)-[r:PRUNED_TO]->(p)
SET r.pair_id = edge.pair_id,
r.end_user_id = edge.end_user_id,
r.run_id = edge.run_id,
r.created_at = edge.created_at
RETURN elementId(r) AS uuid
"""

ASSISTANT_DIALOG_EDGE_SAVE = """
UNWIND $edges AS edge
MATCH (o:AssistantOriginal {id: edge.source})
MATCH (d:Dialogue {id: edge.target})
MERGE (o)-[r:BELONGS_TO_DIALOG]->(d)
SET r.end_user_id = edge.end_user_id,
r.run_id = edge.run_id,
r.created_at = edge.created_at
RETURN elementId(r) AS uuid
"""

@@ -49,8 +49,6 @@ class DialogRepository(BaseNeo4jRepository[DialogueNode]):
# Handle datetime fields
if isinstance(n.get('created_at'), str):
n['created_at'] = datetime.fromisoformat(n['created_at'])
if n.get('expired_at') and isinstance(n['expired_at'], str):
n['expired_at'] = datetime.fromisoformat(n['expired_at'])

return DialogueNode(**n)


@@ -48,8 +48,6 @@ class EntityRepository(BaseNeo4jRepository[ExtractedEntityNode]):
# Handle datetime fields
if isinstance(n.get('created_at'), str):
n['created_at'] = datetime.fromisoformat(n['created_at'])
if n.get('expired_at') and isinstance(n.get('expired_at'), str):
n['expired_at'] = datetime.fromisoformat(n['expired_at'])

# Ensure the aliases field exists and is a list
if 'aliases' not in n or n['aliases'] is None:

@@ -24,6 +24,10 @@ from app.core.memory.models.graph_models import (
EntityEntityEdge,
PerceptualNode,
PerceptualEdge,
AssistantOriginalNode,
AssistantPrunedNode,
AssistantPrunedEdge,
AssistantDialogEdge,
)
import logging

@@ -44,13 +48,13 @@ async def save_entities_and_relationships(
'source_id': edge.source,
'target_id': edge.target,
'predicate': edge.relation_type,
'predicate_description': edge.relation_type_description,
'statement_id': edge.source_statement_id,
'value': edge.relation_value,
'statement': edge.statement,
'valid_at': edge.valid_at.isoformat() if edge.valid_at else None,
'invalid_at': edge.invalid_at.isoformat() if edge.invalid_at else None,
'created_at': edge.created_at.isoformat() if edge.created_at else None,
'expired_at': edge.expired_at.isoformat() if edge.expired_at else None,
'run_id': edge.run_id,
'end_user_id': edge.end_user_id,
}
@@ -110,7 +114,6 @@ async def save_statement_chunk_edges(
"end_user_id": edge.end_user_id,
"run_id": edge.run_id,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
"expired_at": edge.expired_at.isoformat() if edge.expired_at else None,
})

try:

@@ -140,7 +143,6 @@ async def save_statement_entity_edges(
"run_id": edge.run_id,
"connect_strength": edge.connect_strength,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
"expired_at": edge.expired_at.isoformat() if edge.expired_at else None,
}
all_se_edges.append(edge_data)

@@ -165,6 +167,10 @@ async def save_dialog_and_statements_to_neo4j(
statement_entity_edges: List[StatementEntityEdge],
perceptual_edges: List[PerceptualEdge],
connector: Neo4jConnector,
assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None,
assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None,
assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None,
assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None,
) -> bool:
"""Save dialogue nodes, chunk nodes, statement nodes, entities, and all relationships to Neo4j using graph models.

@@ -251,7 +257,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(DIALOGUE_NODE_SAVE, dialogues=dialogue_data)
dialogue_uuids = [record["uuid"] async for record in result]
results['dialogues'] = dialogue_uuids
logger.info(f"Dialogues saved to Neo4j with UUIDs: {dialogue_uuids}")
logger.debug(f"Dialogues saved to Neo4j with UUIDs: {dialogue_uuids}")

# 2. Save all chunk nodes in batch
if chunk_nodes:

@@ -260,7 +266,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(CHUNK_NODE_SAVE, chunks=chunk_data)
chunk_uuids = [record["uuid"] async for record in result]
results['chunks'] = chunk_uuids
logger.info(f"Successfully saved {len(chunk_uuids)} chunk nodes to Neo4j")
logger.debug(f"Successfully saved {len(chunk_uuids)} chunk nodes to Neo4j")

if perceptual_nodes:
from app.repositories.neo4j.cypher_queries import PERCEPTUAL_NODE_SAVE

@@ -268,7 +274,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(PERCEPTUAL_NODE_SAVE, perceptuals=perceptual_data)
perceptual_uuids = [record["uuid"] async for record in result]
results["perceptuals"] = perceptual_uuids
logger.info(f"Successfully saved {len(perceptual_uuids)} perceptual nodes to Neo4j")
logger.debug(f"Successfully saved {len(perceptual_uuids)} perceptual nodes to Neo4j")

# 3. Save all statement nodes in batch
if statement_nodes:

@@ -277,7 +283,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(STATEMENT_NODE_SAVE, statements=statement_data)
statement_uuids = [record["uuid"] async for record in result]
results['statements'] = statement_uuids
logger.info(f"Successfully saved {len(statement_uuids)} statement nodes to Neo4j")
logger.debug(f"Successfully saved {len(statement_uuids)} statement nodes to Neo4j")

# 4. Save entities
if entity_nodes:

@@ -286,7 +292,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(EXTRACTED_ENTITY_NODE_SAVE, entities=entity_data)
entity_uuids = [record["uuid"] async for record in result]
results['entities'] = entity_uuids
logger.info(f"Successfully saved {len(entity_uuids)} entity nodes to Neo4j")
logger.debug(f"Successfully saved {len(entity_uuids)} entity nodes to Neo4j")

# 5. Create entity relationships
if entity_edges:

@@ -297,20 +303,20 @@ async def save_dialog_and_statements_to_neo4j(
'source_id': edge.source,
'target_id': edge.target,
'predicate': edge.relation_type,
'predicate_description': edge.relation_type_description,
'statement_id': edge.source_statement_id,
'value': edge.relation_value,
'statement': edge.statement,
'valid_at': edge.valid_at.isoformat() if edge.valid_at else None,
'invalid_at': edge.invalid_at.isoformat() if edge.invalid_at else None,
'created_at': edge.created_at.isoformat() if edge.created_at else None,
'expired_at': edge.expired_at.isoformat() if edge.expired_at else None,
'run_id': edge.run_id,
'end_user_id': edge.end_user_id,
})
result = await tx.run(ENTITY_RELATIONSHIP_SAVE, relationships=relationship_data)
rel_uuids = [record["uuid"] async for record in result]
results['entity_relationships'] = rel_uuids
logger.info(f"Successfully saved {len(rel_uuids)} entity relationships to Neo4j")
logger.debug(f"Successfully saved {len(rel_uuids)} entity relationships to Neo4j")

# 6. Save statement-chunk edges
if statement_chunk_edges:

@@ -322,14 +328,13 @@ async def save_dialog_and_statements_to_neo4j(
"source": edge.source,
"target": edge.target,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
"expired_at": edge.expired_at.isoformat() if edge.expired_at else None,
"run_id": edge.run_id,
"end_user_id": edge.end_user_id,
})
result = await tx.run(CHUNK_STATEMENT_EDGE_SAVE, chunk_statement_edges=sc_edge_data)
sc_uuids = [record["uuid"] async for record in result]
results['statement_chunk_edges'] = sc_uuids
logger.info(f"Successfully saved {len(sc_uuids)} statement-chunk edges to Neo4j")
logger.debug(f"Successfully saved {len(sc_uuids)} statement-chunk edges to Neo4j")

# 7. Save statement-entity edges
if statement_entity_edges:

@@ -340,7 +345,6 @@ async def save_dialog_and_statements_to_neo4j(
"source": edge.source,
"target": edge.target,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
"expired_at": edge.expired_at.isoformat() if edge.expired_at else None,
"run_id": edge.run_id,
"end_user_id": edge.end_user_id,
"connect_strength": getattr(edge, "connect_strength", "strong"),

@@ -348,7 +352,7 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(STATEMENT_ENTITY_EDGE_SAVE, relationships=se_edge_data)
se_uuids = [record["uuid"] async for record in result]
results['statement_entity_edges'] = se_uuids
logger.info(f"Successfully saved {len(se_uuids)} statement-entity edges to Neo4j")
logger.debug(f"Successfully saved {len(se_uuids)} statement-entity edges to Neo4j")

if perceptual_edges:
from app.repositories.neo4j.cypher_queries import PERCEPTUAL_CHUNK_EDGE_SAVE

@@ -364,7 +368,56 @@ async def save_dialog_and_statements_to_neo4j(
result = await tx.run(PERCEPTUAL_CHUNK_EDGE_SAVE, edges=perceptual_edge_data)
perceptual_edges_uuids = [record["uuid"] async for record in result]
results['perceptual_chunk_edges'] = perceptual_edges_uuids
logger.info(f"Successfully saved {len(perceptual_edges_uuids)} perceptual-chunk edges to Neo4j")
logger.debug(f"Successfully saved {len(perceptual_edges_uuids)} perceptual-chunk edges to Neo4j")

# 8. Save assistant original nodes
if assistant_original_nodes:
from app.repositories.neo4j.cypher_queries import ASSISTANT_ORIGINAL_NODE_SAVE
original_data = [node.model_dump() for node in assistant_original_nodes]
result = await tx.run(ASSISTANT_ORIGINAL_NODE_SAVE, originals=original_data)
original_uuids = [record["uuid"] async for record in result]
results['assistant_originals'] = original_uuids
logger.debug(f"Successfully saved {len(original_uuids)} assistant original nodes to Neo4j")

# 9. Save assistant pruned nodes
if assistant_pruned_nodes:
from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_NODE_SAVE
pruned_data = [node.model_dump() for node in assistant_pruned_nodes]
result = await tx.run(ASSISTANT_PRUNED_NODE_SAVE, pruneds=pruned_data)
pruned_uuids = [record["uuid"] async for record in result]
results['assistant_pruneds'] = pruned_uuids
logger.debug(f"Successfully saved {len(pruned_uuids)} assistant pruned nodes to Neo4j")

# 10. Save PRUNED_TO edges (Original → Pruned)
if assistant_pruned_edges:
from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_EDGE_SAVE
edge_data = [{
"source": edge.source,
"target": edge.target,
"pair_id": edge.pair_id,
"end_user_id": edge.end_user_id,
"run_id": edge.run_id,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
} for edge in assistant_pruned_edges]
result = await tx.run(ASSISTANT_PRUNED_EDGE_SAVE, edges=edge_data)
pruned_edge_uuids = [record["uuid"] async for record in result]
results['assistant_pruned_edges'] = pruned_edge_uuids
logger.debug(f"Successfully saved {len(pruned_edge_uuids)} PRUNED_TO edges to Neo4j")

# 11. Save BELONGS_TO_DIALOG edges (Original → Dialogue)
if assistant_dialog_edges:
from app.repositories.neo4j.cypher_queries import ASSISTANT_DIALOG_EDGE_SAVE
edge_data = [{
"source": edge.source,
"target": edge.target,
"end_user_id": edge.end_user_id,
"run_id": edge.run_id,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
} for edge in assistant_dialog_edges]
result = await tx.run(ASSISTANT_DIALOG_EDGE_SAVE, edges=edge_data)
dialog_edge_uuids = [record["uuid"] async for record in result]
results['assistant_dialog_edges'] = dialog_edge_uuids
logger.debug(f"Successfully saved {len(dialog_edge_uuids)} BELONGS_TO_DIALOG edges to Neo4j")

return results


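With the new optional parameters, callers that produce no assistant pruning artifacts are unaffected, while the pruning path can persist everything in the same transaction. A call sketch; the earlier positional arguments are assumed from the docstring and not spelled out in this diff, so treat their names as illustrative:

results = await save_dialog_and_statements_to_neo4j(
    dialogue_nodes, chunk_nodes, statement_nodes, entity_nodes,       # assumed leading params
    entity_edges, statement_chunk_edges, statement_entity_edges,
    perceptual_edges, connector,
    assistant_original_nodes=original_nodes,   # may be omitted / None
    assistant_pruned_nodes=pruned_nodes,       # may be omitted / None
    assistant_pruned_edges=pruned_edges,       # may be omitted / None
    assistant_dialog_edges=dialog_edges,       # may be omitted / None
)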
@@ -232,8 +232,6 @@ async def neo4j_data(solved_data):
updata_entity = {}
ori_edge = {}
updata_edge = {}
ori_expired_at={}
updat_expired_at={}
for i in solved_data:
databasets = i['data']
for key, values in databasets.items():

@@ -247,12 +245,9 @@ async def neo4j_data(solved_data):
key = 'name'
ori_entity[key] = values[0]
updata_entity[key] = values[1]
ori_expired_at[key] = values[0]
if key == 'statement':
ori_edge[key] = values[0]
updata_edge[key] = values[1]
if key=='expired_at':
updat_expired_at[key] = values[1]

elif key == 'id':
ori_edge[key] = values

@@ -260,8 +255,6 @@ async def neo4j_data(solved_data):

ori_entity[key] = values
updata_entity[key] = values

ori_expired_at[key] = values
elif key == 'rel_id':
key='id'
ori_edge[key] = values

@@ -270,18 +263,12 @@ async def neo4j_data(solved_data):
ori_entity[key] = values
updata_entity[key] = values

ori_expired_at[key] = values


print(ori_entity)
print(updata_entity)
print(100*'-')
print(ori_edge)
print(updata_edge)
expired_at_ = updat_expired_at.get('expired_at', None)
if expired_at_ is not None:
await update_neo4j_data(ori_expired_at, updat_expired_at)
success_count += 1
if ori_entity != updata_entity:
await update_neo4j_data(ori_entity, updata_entity)
success_count += 1

@@ -50,12 +50,12 @@ class StatementRepository(BaseNeo4jRepository[StatementNode]):
# Handle datetime fields
if isinstance(n.get('created_at'), str):
n['created_at'] = datetime.fromisoformat(n['created_at'])
if n.get('expired_at') and isinstance(n['expired_at'], str):
n['expired_at'] = datetime.fromisoformat(n['expired_at'])
if n.get('valid_at') and isinstance(n['valid_at'], str):
n['valid_at'] = datetime.fromisoformat(n['valid_at'])
if n.get('invalid_at') and isinstance(n['invalid_at'], str):
n['invalid_at'] = datetime.fromisoformat(n['invalid_at'])
if n.get('dialog_at') and isinstance(n['dialog_at'], str):
n['dialog_at'] = datetime.fromisoformat(n['dialog_at'])

# Handle the temporal_info field
if isinstance(n.get('temporal_info'), str):

@@ -227,7 +227,7 @@ class OntologyClassRepository:
).all()

logger.info(
f"Found {len(classes)} ontology classes in scene {scene_id}"
f"Found {len(classes)} ontology classes in scene_id: {scene_id}"
)

return classes

@@ -1,9 +1,37 @@
from abc import ABC
from typing import Optional
from enum import Enum
from typing import Any, Optional
from pydantic import Field

from pydantic import BaseModel


class StorageType(str, Enum):
"""Memory storage backend type"""
NEO4J = "neo4j"
RAG = "rag"


class Language(str, Enum):  # Not propagated to the clustering Celery task; the task falls back to the default value. Consider unifying language handling.
"""Supported languages"""
ZH = "zh"
EN = "en"


class MessageItem(BaseModel):
"""Structure of a single message"""
role: str
content: str
dialog_at: Optional[str] = Field(
None,
description="Absolute time of this message (ISO 8601); server time is used when omitted",
)
files: Optional[list[dict]] = None
file_content: Optional[list[Any]] = None

model_config = {"extra": "allow"}


class UserInput(BaseModel):
message: str
history: list[dict]

@@ -18,6 +46,16 @@ class Write_UserInput(BaseModel):
config_id: Optional[str] = None


class WriteMemoryRequest(BaseModel):
"""Parameter wrapper for write_memory()"""
end_user_id: str
messages: list[MessageItem]
config_id: Optional[Any] = None
storage_type: StorageType = StorageType.NEO4J
user_rag_memory_id: str = ""
language: Language = Language.ZH


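As a quick illustration, the new wrapper groups what used to be six separate write_memory() parameters into one validated object; the values below are made up for the example:

from app.schemas.memory_agent_schema import (
    Language, MessageItem, StorageType, WriteMemoryRequest,
)

request = WriteMemoryRequest(
    end_user_id="eu_123",  # hypothetical id
    messages=[
        MessageItem(role="user", content="我下周要去上海出差"),
        MessageItem(role="assistant", content="好的,我记住了。"),
    ],
    storage_type=StorageType.NEO4J,
    language=Language.ZH,
)
# result = await MemoryAgentService().write_memory(request, db)
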
class AgentMemory_Long_Term(ABC):
"""Long-term memory configuration constants"""
STORAGE_NEO4J = "neo4j"

@@ -25,7 +63,7 @@ class AgentMemory_Long_Term(ABC):
STRATEGY_AGGREGATE = "aggregate"
STRATEGY_CHUNK = "chunk"
STRATEGY_TIME = "time"
DEFAULT_SCOPE = 6
DEFAULT_SCOPE = 1
TIME_SCOPE = 5


@@ -421,6 +421,9 @@ class MemoryConfig:
pruning_scene: Optional[str] = "education"
pruning_threshold: float = 0.5

# Pipeline config: Emotion extraction
emotion_enabled: bool = False

# Ontology scene association
scene_id: Optional[UUID] = None
ontology_class_infos: list[dict] = field(default_factory=list)

@@ -4,7 +4,7 @@ Order Schema

Defines request and response models for order operations.
"""

from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field
from typing import Any, Optional


@@ -17,8 +17,8 @@ class CreateOrderRequest(BaseModel):
customer_email: Optional[str] = Field(None, description="Customer email")
notes: Optional[str] = Field(None, description="Order notes")

class Config:
json_schema_extra = {
model_config = ConfigDict(
json_schema_extra={
"example": {
"product_id": "PROD-001",
"quantity": 2,

@@ -27,6 +27,7 @@ class CreateOrderRequest(BaseModel):
"notes": "Please deliver before 5pm"
}
}
)


class OrderResponse(BaseModel):

@@ -40,8 +41,8 @@ class OrderResponse(BaseModel):
created_at: Optional[str] = Field(None, description="Creation timestamp")
message: Optional[str] = Field(None, description="Response message")

class Config:
json_schema_extra = {
model_config = ConfigDict(
json_schema_extra={
"example": {
"order_id": "ORD-20231224-001",
"status": "pending",

@@ -52,6 +53,7 @@ class OrderResponse(BaseModel):
"message": "Order created successfully"
}
}
)


class ExternalOrderResponse(BaseModel):

@@ -1,5 +1,5 @@
from dataclasses import field
from pydantic import BaseModel, EmailStr, Field, field_validator, validator, ConfigDict
from pydantic import BaseModel, EmailStr, Field, field_validator, ConfigDict
from typing import Optional, List
import datetime
import uuid

@@ -90,7 +90,8 @@ class User(UserBase):
permissions: Optional[List[str]] = None  # User permission list, controlled by the external_source permissions

# Convert datetime to a millisecond timestamp
@validator("created_at", pre=True)
@field_validator("created_at", mode="before")
@classmethod
def _created_at_to_ms(cls, v):
if isinstance(v, datetime.datetime):
return int(v.timestamp() * 1000)

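For context, Pydantic v2 removes the v1 @validator decorator; @field_validator with mode="before" plus @classmethod is the direct replacement. A minimal self-contained sketch of the same millisecond conversion (the model name here is illustrative, not from the codebase):

import datetime
from pydantic import BaseModel, field_validator

class Event(BaseModel):
    created_at: int  # milliseconds since epoch

    @field_validator("created_at", mode="before")
    @classmethod
    def _to_ms(cls, v):
        # Runs before type coercion, so it may receive a datetime instead of an int.
        if isinstance(v, datetime.datetime):
            return int(v.timestamp() * 1000)
        return v

# Note: for naive datetimes, timestamp() interprets the value in local time.
print(Event(created_at=datetime.datetime(2024, 1, 1)).created_at)
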
@@ -34,15 +34,15 @@ from app.core.memory.agent.utils.messages_tools import (
reorder_output_results,
)
from app.core.memory.agent.utils.type_classifier import status_typle
from app.core.memory.agent.utils.write_tools import write as write_neo4j
from app.core.memory.analytics.hot_memory_tags import get_interest_distribution
from app.core.memory.memory_service import MemoryService
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.core.memory.utils.log.audit_logger import audit_logger
from app.db import get_db_context
from app.models.knowledge_model import Knowledge, KnowledgeType
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.schemas import FileInput
from app.schemas.memory_agent_schema import Write_UserInput
from app.schemas.memory_agent_schema import Language, MessageItem, StorageType, Write_UserInput, WriteMemoryRequest
from app.schemas.memory_config_schema import ConfigurationError
from app.services.memory_config_service import MemoryConfigService
from app.services.memory_konwledges_server import (
@@ -63,7 +63,7 @@ class MemoryAgentService:
def writer_messages_deal(self, messages, start_time, end_user_id, config_id, message, context):
duration = time.time() - start_time
if str(messages) == 'success':
logger.info(f"Write operation successful for group {end_user_id} with config_id {config_id}")
logger.info(f"Write operation successful for end_user_id: {end_user_id} with config_id: {config_id}")
# Record the successful operation
audit_logger.log_operation(operation="WRITE", config_id=config_id, end_user_id=end_user_id,
success=True,
@@ -267,25 +267,15 @@ class MemoryAgentService:

async def write_memory(
self,
end_user_id: str,
messages: list[dict],
config_id: Optional[uuid.UUID] | int,
request: WriteMemoryRequest,
db: Session,
storage_type: str,
user_rag_memory_id: str,
language: str = "zh"
) -> str:
"""
Process write operation with config_id
Long-term memory write

Args:
end_user_id: Group identifier (also used as end_user_id)
messages: Message to write
config_id: Configuration ID from database
request: Write request parameters (end_user_id, messages, config_id, storage_type, language, etc.)
db: SQLAlchemy database session
storage_type: Storage type (neo4j or rag)
user_rag_memory_id: User RAG memory ID
language: Language ("zh" Chinese, "en" English)

Returns:
Write operation result status
@@ -293,96 +283,50 @@ class MemoryAgentService:
Raises:
ValueError: If config loading fails or write operation fails
"""
# Resolve config_id and workspace_id
# Always get workspace_id from end_user for fallback, even if config_id is provided
workspace_id = None
try:
connected_config = get_end_user_connected_config(end_user_id, db)
workspace_id = connected_config.get("workspace_id")
if config_id is None:
config_id = connected_config.get("memory_config_id")
logger.info(f"Resolved config from end_user: config_id={config_id}, workspace_id={workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(f"No memory configuration found for end_user {end_user_id}. "
f"Please ensure the user has a connected memory configuration.")
except Exception as e:
if "No memory configuration found" in str(e):
raise  # Re-raise our specific error
logger.error(f"Failed to get connected config for end_user {end_user_id}: {e}")
if config_id is None:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")
# If config_id was provided, continue without workspace_id fallback

import time
end_user_id = request.end_user_id
messages = request.messages
config_id = request.config_id
storage_type = request.storage_type
user_rag_memory_id = request.user_rag_memory_id
language = request.language
start_time = time.time()

# Load configuration from database with workspace fallback
# Use a separate database session to avoid transaction failures
# ── Step 1: Resolve configuration ── look up the config_id / workspace_id linked to the end_user and load the full memory_config from the database
memory_config = await self._resolve_and_load_config(
end_user_id, config_id, db, start_time
)

# ── Step 2: File preprocessing ── convert files attached to the messages into perceptual memory objects and attach them to message["file_content"]
messages = await self._preprocess_files(messages, end_user_id, memory_config, db)
message_text = "\n".join([
f"{(msg['role'] if isinstance(msg, dict) else msg.role)}: {(msg['content'] if isinstance(msg, dict) else msg.content)}"
for msg in messages
])

# ── Step 3: Write to storage ── route to the RAG or Neo4j pipeline based on storage_type
try:
from app.db import get_db_context
with get_db_context() as config_db:
config_service = MemoryConfigService(config_db)
memory_config = config_service.load_memory_config(
config_id=config_id,
workspace_id=workspace_id,
service_name="MemoryAgentService"
)
logger.info(f"Configuration loaded successfully: {memory_config.config_name}")
except ConfigurationError as e:
error_msg = f"Failed to load configuration for config_id: {config_id}: {e}"
logger.error(error_msg)

# Log failed operation
duration = time.time() - start_time
audit_logger.log_operation(operation="WRITE", config_id=config_id, end_user_id=end_user_id,
success=False, duration=duration, error=error_msg)

raise ValueError(error_msg)

perceptual_serivce = MemoryPerceptualService(db)
for message in messages:
message["file_content"] = []
for file in (message.get("files") or []):
file_object = await perceptual_serivce.generate_perceptual_memory(
end_user_id=end_user_id,
memory_config=memory_config,
file=FileInput(**file)
)
if file_object is None:
continue
message["file_content"].append((file_object, file["type"]))
logger.info(messages)

message_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
try:
if storage_type == "rag":
# For RAG storage, convert messages to single string
if storage_type == StorageType.RAG:
await write_rag(end_user_id, message_text, user_rag_memory_id)
return "success"
else:
await write_neo4j(
end_user_id=end_user_id,
messages=messages,
memory_config=memory_config,
ref_id='',
language=language
)
for lang in ["zh", "en"]:
deleted = await InterestMemoryCache.delete_interest_distribution(
end_user_id, lang
)
if deleted:
logger.info(
f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}")
await self._write_neo4j(end_user_id, messages, memory_config, language)

# ── Step 4: Post-processing ── invalidate caches, serialize file paths, write the audit log, and return the result
await self._invalidate_interest_cache(end_user_id)
for message in messages:
message["file_content"] = [
perceptual[0].file_path for perceptual in message["file_content"]
]
if isinstance(message, dict):
message["file_content"] = [
perceptual[0].file_path for perceptual in (message["file_content"] or [])
]
else:
message.file_content = [
perceptual[0].file_path for perceptual in (message.file_content or [])
]
return self.writer_messages_deal(
"success",
start_time,
end_user_id,
config_id,
memory_config.config_id,
message_text,
{
"status": "success",
@@ -392,15 +336,140 @@ class MemoryAgentService:
}
)
except Exception as e:
# Ensure proper error handling and logging
error_msg = f"Write operation failed: {str(e)}"
logger.error(error_msg)

duration = time.time() - start_time
audit_logger.log_operation(operation="WRITE", config_id=config_id, end_user_id=end_user_id,
success=False, duration=duration, error=error_msg)
audit_logger.log_operation(
operation="WRITE",
config_id=memory_config.config_id,
end_user_id=end_user_id,
success=False,
duration=time.time() - start_time,
error=error_msg,
)
raise ValueError(error_msg)

async def _resolve_and_load_config(
self,
end_user_id: str,
config_id: Optional[uuid.UUID] | int,
db: Session,
start_time: float,
):
"""Resolve the end_user's linked configuration and load the full memory_config from the database."""
workspace_id = None
try:
connected_config = get_end_user_connected_config(end_user_id, db)
# get_end_user_connected_config returns strings; convert to UUID
workspace_id_raw = connected_config.get("workspace_id")
if workspace_id_raw and workspace_id_raw != "None":
try:
workspace_id = uuid.UUID(str(workspace_id_raw))
except (ValueError, AttributeError):
workspace_id = None
if config_id is None:
config_id_raw = connected_config.get("memory_config_id")
if config_id_raw and config_id_raw != "None":
try:
config_id = uuid.UUID(str(config_id_raw))
except (ValueError, AttributeError):
config_id = None
logger.info(f"Resolved config from end_user: config_id = {config_id}, workspace_id = {workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(
f"No memory configuration found for end_user {end_user_id}. "
f"Please ensure the user has a connected memory configuration."
)
except Exception as e:
if "No memory configuration found" in str(e):
raise
logger.error(f"Failed to get connected config for end_user {end_user_id}: {e}")
if config_id is None:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")

try:
with get_db_context() as config_db:
memory_config = MemoryConfigService(config_db).load_memory_config(
config_id=config_id,
workspace_id=workspace_id,
service_name="MemoryAgentService",
)
logger.info(f"Configuration loaded successfully: {memory_config.config_name}")
return memory_config
except ConfigurationError as e:
error_msg = f"Failed to load configuration for config_id: {config_id}: {e}"
logger.error(error_msg)
audit_logger.log_operation(
operation="WRITE",
config_id=config_id,
end_user_id=end_user_id,
success=False,
duration=time.time() - start_time,
error=error_msg,
)
raise ValueError(error_msg)

async def _preprocess_files(
self,
messages: list[MessageItem] | list[dict],
end_user_id: str,
memory_config,
db: Session,
) -> list[dict]:
"""Process files attached to the messages, generating perceptual memory objects and attaching them to message['file_content']."""
perceptual_service = MemoryPerceptualService(db)
for message in messages:
if isinstance(message, dict):
message["file_content"] = []
files = message.get("files") or []
else:
message.file_content = []
files = message.files or []
for file in files:
file_object = await perceptual_service.generate_perceptual_memory(
end_user_id=end_user_id,
memory_config=memory_config,
file=FileInput(**file),
)
if file_object is None:
continue
if isinstance(message, dict):
message["file_content"].append((file_object, file["type"]))
else:
message.file_content.append((file_object, file["type"]))
logger.info(messages)
return messages

async def _write_neo4j(
self,
end_user_id: str,
messages: list[MessageItem] | list[dict],
memory_config,
language: Language | str,
) -> None:
"""Write to Neo4j via the new pipeline (MemoryService → WritePipeline)."""
messages_dict = [
msg if isinstance(msg, dict) else msg.model_dump(exclude_none=True)
for msg in messages
]
service = MemoryService(memory_config=memory_config, end_user_id=end_user_id)
result = await service.write(
messages=messages_dict, language=language, ref_id='',
)
logger.info(
f"[WritePipeline] done: status={result.status}, "
f"elapsed={result.elapsed_seconds:.2f}s, "
f"extraction={result.extraction}"
)

async def _invalidate_interest_cache(self, end_user_id: str) -> None:
"""Invalidate the interest distribution cache after a write completes."""
for lang in ["zh", "en"]:
deleted = await InterestMemoryCache.delete_interest_distribution(end_user_id, lang)
if deleted:
logger.info(
f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}"
)

async def read_memory(
self,
end_user_id: str,

@@ -448,7 +517,7 @@ class MemoryAgentService:
workspace_id = connected_config.get("workspace_id")
if config_id is None:
config_id = connected_config.get("memory_config_id")
logger.info(f"Resolved config from end_user: config_id={config_id}, workspace_id={workspace_id}")
logger.info(f"Resolved config from end_user: config_id = {config_id}, workspace_id = {workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(
f"No memory configuration found for end_user {end_user_id}. Please ensure the user has a connected memory configuration.")

@@ -460,7 +529,7 @@ class MemoryAgentService:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")
# If config_id was provided, continue without workspace_id fallback

logger.info(f"Read operation for group {end_user_id} with config_id {config_id}")
logger.info(f"Read operation for end_user_id: {end_user_id} with config_id: {config_id}")

config_load_start = time.time()
try:

@@ -771,16 +840,16 @@ class MemoryAgentService:
workspace_id = connected_config.get('workspace_id')
if config_id is None:
config_id = connected_config.get('memory_config_id')
logger.info(f"Resolved config from end_user: config_id={config_id}, workspace_id={workspace_id}")
logger.info(f"Resolved config from end_user: config_id = {config_id}, workspace_id = {workspace_id}")
if config_id is None and workspace_id is None:
raise ValueError(
f"No memory configuration found for end_user {end_user_id}. Please ensure the user has a connected memory configuration.")
f"No memory configuration found for end_user_id {end_user_id}. Please ensure the user has a connected memory configuration.")
except Exception as e:
if "No memory configuration found" in str(e):
raise  # Re-raise our specific error
logger.error(f"Failed to get connected config for end_user {end_user_id}: {e}")
logger.error(f"Failed to get connected config for end_user_id {end_user_id}: {e}")
if config_id is None:
raise ValueError(f"Unable to determine memory configuration for end_user {end_user_id}: {e}")
raise ValueError(f"Unable to determine memory configuration for end_user_id {end_user_id}: {e}")
# If config_id was provided, continue without workspace_id fallback

logger.info(f"Generating summary from retrieve info for query: {query[:50]}...")

@@ -1181,7 +1250,7 @@ def get_end_user_connected_config(end_user_id: str, db: Session) -> Dict[str, Any]:
from app.models.end_user_model import EndUser
from app.services.memory_config_service import MemoryConfigService

logger.info(f"Getting connected config for end_user: {end_user_id}")
logger.info(f"Getting connected config for end_user_id: {end_user_id}")

# TODO: check sources for enduserid, should be one of these three: chat, draft, apikey
# 1. Get the end_user and its app_id

@@ -1282,7 +1351,7 @@ def get_end_user_connected_config(end_user_id: str, db: Session) -> Dict[str, Any]:
}

logger.info(
f"Successfully retrieved connected config: memory_config_id={memory_config_id}, workspace_id={end_user.workspace_id}")
f"Successfully retrieved connected config: memory_config_id = {memory_config_id}, workspace_id = {end_user.workspace_id}")
return result


@@ -17,6 +17,7 @@ from app.core.logging_config import get_logger
from app.models.app_model import App
from app.models.end_user_model import EndUser
from app.schemas.memory_config_schema import ConfigurationError
from app.schemas.memory_agent_schema import WriteMemoryRequest
from app.services.memory_agent_service import MemoryAgentService

logger = get_logger(__name__)

@@ -291,12 +292,14 @@ class MemoryAPIService:
try:
messages = message if isinstance(message, list) else [{"role": "user", "content": message}]
result = await MemoryAgentService().write_memory(
end_user_id=end_user_id,
messages=messages,
config_id=config_id,
db=self.db,
storage_type=storage_type,
user_rag_memory_id=user_rag_memory_id or "",
WriteMemoryRequest(
end_user_id=end_user_id,
messages=messages,
config_id=config_id,
storage_type=storage_type,
user_rag_memory_id=user_rag_memory_id or "",
),
self.db,
)

logger.info(f"Memory write (sync) successful for end_user: {end_user_id}")

@@ -418,6 +418,9 @@ class MemoryConfigService:
pruning_scene=memory_config.pruning_scene or "education",
pruning_threshold=float(
memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5,
# Pipeline config: Emotion extraction
emotion_enabled=bool(
memory_config.emotion_enabled) if memory_config.emotion_enabled is not None else False,
# Ontology scene association
scene_id=memory_config.scene_id,
ontology_class_infos=_load_ontology_class_infos(self.db, memory_config.scene_id),

@@ -573,6 +576,7 @@ class MemoryConfigService:
statement_extraction=stmt_config,
deduplication=dedup_config,
forgetting_engine=forget_config,
emotion_enabled=getattr(memory_config, "emotion_enabled", False),
)

@staticmethod

@@ -441,21 +441,12 @@ class DataConfigService:  # Data configuration service (PostgreSQL)
with open(result_path, "r", encoding="utf-8") as rf:
extracted_result = json.load(rf)

# Step 6: compute ontology coverage and merge it into the result
# Step 6: assemble the result (pilot runs skip extra coverage post-processing)
result_data = {
"config_id": cid,
"time_log": os.path.join(project_root, "logs", "time.log"),
"extracted_result": extracted_result,
}
try:
ontology_coverage = await self._compute_ontology_coverage(
extracted_result=extracted_result,
memory_config=memory_config,
)
if ontology_coverage:
result_data["ontology_coverage"] = ontology_coverage
except Exception as cov_err:
logger.warning(f"[PILOT_RUN_STREAM] Ontology coverage computation failed: {cov_err}", exc_info=True)

yield format_sse_message("result", result_data)

@@ -479,100 +470,6 @@ class DataConfigService:  # Data configuration service (PostgreSQL)
"time": int(time.time() * 1000)
})

async def _compute_ontology_coverage(
self,
extracted_result: Dict[str, Any],
memory_config,
) -> Optional[Dict[str, Any]]:
"""Classify the entity types in the extraction result against the scene/general ontology types, mutually exclusively.

Classification rule (mutually exclusive): scene types take priority > general types > unmatched
Guarantee: scene entity count + general entity count + unmatched count = total entity count

Returns:
A dict with the three distributions, or None (when there is no entity data)
"""
core_entities = extracted_result.get("core_entities", [])
if not core_entities:
return None

# 1. Load the set of scene ontology types
scene_ontology_types: set = set()
try:
from app.repositories.ontology_class_repository import OntologyClassRepository

if memory_config.scene_id:
class_repo = OntologyClassRepository(self.db)
ontology_classes = class_repo.get_classes_by_scene(memory_config.scene_id)
scene_ontology_types = {oc.class_name for oc in ontology_classes}
except Exception as e:
logger.warning(f"Failed to load scene ontology types: {e}")

# 2. Load the set of general ontology types
general_ontology_types: set = set()
try:
from app.core.memory.ontology_services.ontology_type_loader import (
get_general_ontology_registry,
is_general_ontology_enabled,
)

if is_general_ontology_enabled():
registry = get_general_ontology_registry()
if registry:
general_ontology_types = set(registry.types.keys())
except Exception as e:
logger.warning(f"Failed to load general ontology types: {e}")

# 3. Mutually exclusive classification: scene first > general > unmatched
scene_distribution: list = []
general_distribution: list = []
unmatched_distribution: list = []
scene_total = 0
general_total = 0
unmatched_total = 0

for item in core_entities:
entity_type = item.get("type", "")
count = item.get("count", 0)

if entity_type in scene_ontology_types:
scene_distribution.append({"type": entity_type, "count": count})
scene_total += count
elif entity_type in general_ontology_types:
general_distribution.append({"type": entity_type, "count": count})
general_total += count
else:
unmatched_distribution.append({"type": entity_type, "count": count})
unmatched_total += count

# Sort by count, descending
scene_distribution.sort(key=lambda x: x["count"], reverse=True)
general_distribution.sort(key=lambda x: x["count"], reverse=True)
unmatched_distribution.sort(key=lambda x: x["count"], reverse=True)

total_entities = scene_total + general_total + unmatched_total

return {
"scene_type_distribution": {
"type_count": len(scene_distribution),
"entity_total": scene_total,
"types": scene_distribution,
},
"general_type_distribution": {
"type_count": len(general_distribution),
"entity_total": general_total,
"types": general_distribution,
},
"unmatched": {
"type_count": len(unmatched_distribution),
"entity_total": unmatched_total,
"types": unmatched_distribution,
},
"total_entities": total_entities,
"time": int(time.time() * 1000),
}

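For reference, the removed coverage computation puts each extracted entity type into exactly one bucket (scene wins over general, general over unmatched), so the three totals always sum to the overall entity count. A minimal standalone sketch of the same rule; the type names and counts below are made up:

scene_types = {"人物", "地点设施"}
general_types = {"物品设备"}
core_entities = [
    {"type": "人物", "count": 4},
    {"type": "物品设备", "count": 2},
    {"type": "未知类型", "count": 1},
]

buckets = {"scene": 0, "general": 0, "unmatched": 0}
for item in core_entities:
    # Scene types take priority, then general types, then the unmatched bucket.
    if item["type"] in scene_types:
        buckets["scene"] += item["count"]
    elif item["type"] in general_types:
        buckets["general"] += item["count"]
    else:
        buckets["unmatched"] += item["count"]

assert sum(buckets.values()) == 7  # scene + general + unmatched == total
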
# -------------------- Neo4j Search & Analytics (fused from data_search_service.py) --------------------
# Ensure env for connector (e.g., NEO4J_PASSWORD)


@@ -2,6 +2,11 @@
Pilot Run Service

Runs the memory system's pilot (dry-run) flow without saving to Neo4j.

Responsibilities:
- Text parsing, semantic pruning, semantic chunking (preprocessing)
- Invoking PilotWritePipeline to run the extraction chain
- Writing the result files
"""

import os
@@ -10,27 +15,59 @@
from datetime import datetime
from typing import Awaitable, Callable, Optional

from app.core.config import settings
from app.core.logging_config import get_memory_logger, log_time
from app.core.memory.models.message_models import (
ConversationContext,
ConversationMessage,
DialogData,
)
from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import (
ExtractionOrchestrator,
get_chunked_dialogs_from_preprocessed,
from app.core.memory.storage_services.extraction_engine.pipeline_help import (
_write_extracted_result_summary,
export_test_input_doc,
)
from app.core.memory.utils.config.config_utils import (
get_pipeline_config,
)
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.schemas.memory_config_schema import MemoryConfig
from sqlalchemy.orm import Session

logger = get_memory_logger(__name__)


def _save_triplets_from_dialogs(dialog_data_list: list[DialogData], output_path: str) -> None:
"""Write triplet/entity text report compatible with pipeline_help parsers."""
all_triplets = []
all_entities = []

for dialog in dialog_data_list:
for chunk in getattr(dialog, "chunks", []) or []:
for statement in getattr(chunk, "statements", []) or []:
triplet_info = getattr(statement, "triplet_extraction_info", None)
if not triplet_info:
continue
all_triplets.extend(getattr(triplet_info, "triplets", []) or [])
all_entities.extend(getattr(triplet_info, "entities", []) or [])

with open(output_path, "w", encoding="utf-8") as f:
f.write(f"=== EXTRACTED TRIPLETS ({len(all_triplets)} total) ===\n\n")
for i, triplet in enumerate(all_triplets, 1):
f.write(f"Triplet {i}:\n")
f.write(f"  Subject: {triplet.subject_name} (ID: {triplet.subject_id})\n")
f.write(f"  Predicate: {triplet.predicate}\n")
f.write(f"  Object: {triplet.object_name} (ID: {triplet.object_id})\n")
value = getattr(triplet, "value", None)
if value:
f.write(f"  Value: {value}\n")
f.write("\n")

f.write(f"\n=== EXTRACTED ENTITIES ({len(all_entities)} total) ===\n\n")
for i, entity in enumerate(all_entities, 1):
f.write(f"Entity {i}:\n")
f.write(f"  ID: {entity.entity_idx}\n")
f.write(f"  Name: {entity.name}\n")
f.write(f"  Type: {entity.type}\n")
f.write(f"  Description: {entity.description}\n")
f.write("\n")


async def run_pilot_extraction(
memory_config: MemoryConfig,
dialogue_text: str,

@@ -38,18 +75,19 @@ async def run_pilot_extraction(
progress_callback: Optional[Callable[[str, str, Optional[dict]], Awaitable[None]]] = None,
language: str = "zh",
) -> None:
"""
Run the knowledge-extraction pipeline in pilot (dry-run) mode.
"""Run the knowledge-extraction pipeline in pilot (dry-run) mode.

Responsibilities:
1. Text parsing → semantic pruning → semantic chunking (preprocessing, requires llm_client)
2. Invoke PilotWritePipeline to run the extraction chain (the pipeline manages its own clients)
3. Write extraction results to the output files

Args:
memory_config: Memory configuration object loaded from the database
dialogue_text: Input dialogue text
db: Database session
progress_callback: Optional progress callback function
- arg 1 (stage): identifier of the current processing stage
- arg 2 (message): human-readable progress message
- arg 3 (data): optional extra data dict
language: Language ("zh" Chinese, "en" English), defaults to Chinese
db: Database session (used to initialize the LLM client needed for preprocessing)
progress_callback: Optional progress callback (stage, message, data)
language: Language ("zh" | "en")
"""
log_file = "logs/time.log"
os.makedirs(os.path.dirname(log_file), exist_ok=True)

@@ -58,26 +96,18 @@ async def run_pilot_extraction(
f.write(f"\n=== Pilot Run Started: {timestamp} ===\n")

pipeline_start = time.time()
neo4j_connector = None

try:
# Step 1: initialize clients
logger.info("Initializing clients...")
# ── Step 1: Initialize the LLM client needed for preprocessing ─────
# Used only for semantic pruning and chunking; PilotWritePipeline initializes its own extraction clients
step_start = time.time()

client_factory = MemoryClientFactory(db)
llm_client = client_factory.get_llm_client(str(memory_config.llm_model_id))
embedder_client = client_factory.get_embedder_client(str(memory_config.embedding_model_id))

neo4j_connector = Neo4jConnector()

from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
factory = MemoryClientFactory(db)
llm_client = factory.get_llm_client(str(memory_config.llm_model_id))
log_time("Client Initialization", time.time() - step_start, log_file)

# Step 2: parse the dialogue text
logger.info("Parsing dialogue text...")
# ── Step 2: Text parsing ───────────────────────────────────────────
step_start = time.time()

# Parse the dialogue text; supports the "用户:" and "AI:" formats
pattern = r"(用户|AI)[::]\s*([^\n]+(?:\n(?!(?:用户|AI)[::])[^\n]*)*?)"
matches = re.findall(pattern, dialogue_text, re.MULTILINE | re.DOTALL)
messages = [

@@ -85,14 +115,11 @@ async def run_pilot_extraction(
for r, c in matches
if c.strip()
]

# If no formatted dialogue matched, treat the whole text as a user message
if not messages:
messages = [ConversationMessage(role="用户", msg=dialogue_text.strip())]

context = ConversationContext(msgs=messages)
dialog = DialogData(
context=context,
context=ConversationContext(msgs=messages),
ref_id="pilot_dialog_1",
end_user_id=str(memory_config.workspace_id),
user_id=str(memory_config.tenant_id),
@@ -103,269 +130,149 @@ async def run_pilot_extraction(
|
||||
if progress_callback:
|
||||
await progress_callback("text_preprocessing", "开始预处理文本(语义剪枝 + 语义分块)...")
|
||||
|
||||
# ========== 步骤 2.1: 语义剪枝 ==========
|
||||
# ── 步骤 2.1: 语义剪枝 ─────────────────────────────────────────────
|
||||
pruned_dialogs = [dialog]
|
||||
deleted_messages = [] # 记录被删除的消息
|
||||
pruning_stats = None # 保存剪枝统计信息,用于最终汇总
|
||||
|
||||
pruning_stats: dict = {"enabled": False}
|
||||
|
||||
if memory_config.pruning_enabled:
    try:
        from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_pruning import (
            SemanticPruner,
        )
        from app.core.memory.models.config_models import PruningConfig

        # Build the pruning configuration
        pruning_config_dict = {
            "pruning_switch": memory_config.pruning_enabled,
            "pruning_scene": memory_config.pruning_scene,
            "pruning_threshold": memory_config.pruning_threshold,
            "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
            "ontology_class_infos": memory_config.ontology_class_infos,
        }
        config = PruningConfig(**pruning_config_dict)

        logger.info(f"[PILOT_RUN] 开始语义剪枝: scene={config.pruning_scene}, threshold={config.pruning_threshold}")

        # Snapshot the messages before pruning (for comparison)
        original_messages = [{"role": msg.role, "content": msg.msg} for msg in dialog.context.msgs]
        original_msg_count = len(original_messages)

        # Run the pruner
        pruner = SemanticPruner(config=config, llm_client=llm_client)
        pruned_dialogs = await pruner.prune_dataset([dialog])

        # Compute the pruning result and identify the deleted messages

        config = PruningConfig(
            pruning_switch=memory_config.pruning_enabled,
            pruning_scene=memory_config.pruning_scene,
            pruning_threshold=memory_config.pruning_threshold,
            scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
            ontology_class_infos=memory_config.ontology_class_infos,
        )
        original_msgs = [{"role": m.role, "content": m.msg} for m in dialog.context.msgs]
        pruned_dialogs = await SemanticPruner(config=config, llm_client=llm_client).prune_dataset([dialog])

        if pruned_dialogs and pruned_dialogs[0].context:
            remaining_messages = [{"role": msg.role, "content": msg.msg} for msg in pruned_dialogs[0].context.msgs]
            remaining_msg_count = len(remaining_messages)
            deleted_msg_count = original_msg_count - remaining_msg_count

            # Identify deleted messages (exact index-based matching)
            # Build an indexed list of the remaining messages for precise tracking
            remaining_with_index = []
            remaining_idx = 0
            for orig_idx, orig_msg in enumerate(original_messages):
                if remaining_idx < len(remaining_messages) and \
                        orig_msg["role"] == remaining_messages[remaining_idx]["role"] and \
                        orig_msg["content"] == remaining_messages[remaining_idx]["content"]:
                    remaining_with_index.append(orig_idx)
                    remaining_idx += 1

            # Collect the indices that are not in the kept list
            remaining = [{"role": m.role, "content": m.msg} for m in pruned_dialogs[0].context.msgs]
            # Identify deleted messages (sequential matching)
            kept_indices: list[int] = []
            ri = 0
            for oi, om in enumerate(original_msgs):
                if ri < len(remaining) and om == remaining[ri]:
                    kept_indices.append(oi)
                    ri += 1
            deleted_messages = [
                {"index": idx, "role": msg["role"], "content": msg["content"]}
                for idx, msg in enumerate(original_messages)
                if idx not in remaining_with_index
                {"index": i, "role": m["role"], "content": m["content"]}
                for i, m in enumerate(original_msgs)
                if i not in kept_indices
            ]
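A toy run of the sequential match makes the kept/deleted split concrete (standalone sketch; the data is invented):

original_msgs = [
    {"role": "用户", "content": "a"},
    {"role": "AI", "content": "b"},
    {"role": "用户", "content": "c"},
]
remaining = [original_msgs[0], original_msgs[2]]  # the pruner dropped "b"

kept_indices: list[int] = []
ri = 0
for oi, om in enumerate(original_msgs):
    if ri < len(remaining) and om == remaining[ri]:
        kept_indices.append(oi)
        ri += 1

print(kept_indices)                                    # [0, 2]
print([i for i in range(3) if i not in kept_indices])  # [1]

One caveat: `i not in kept_indices` is a linear scan inside the comprehension, so the pass is O(n²) on long dialogues; wrapping kept_indices in a set() would keep it linear.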

            # Save pruning statistics for the final summary (only deleted_count is kept)
            pruning_stats = {
                "enabled": True,
                "scene": config.pruning_scene,
                "threshold": config.pruning_threshold,
                "deleted_count": deleted_msg_count,
                "deleted_count": len(deleted_messages),
            }

            # Emit the pruning result (with details of the deleted messages)
            pruning_result = {
                "type": "pruning",
                "deleted_messages": deleted_messages,
            }

            logger.info(
                f"[PILOT_RUN] 语义剪枝完成: 原始{original_msg_count}条 -> "
                f"保留{remaining_msg_count}条 (删除{deleted_msg_count}条)"
                f"[PILOT_RUN] 语义剪枝完成: {len(original_msgs)} → {len(remaining)} 条"
                f"(删除 {len(deleted_messages)} 条)"
            )

            if progress_callback:
                await progress_callback("text_preprocessing_result", "语义剪枝完成", pruning_result)
                await progress_callback(
                    "text_preprocessing_result", "语义剪枝完成",
                    {"type": "pruning", "deleted_messages": deleted_messages},
                )
        else:
            logger.warning("[PILOT_RUN] 剪枝后对话为空,使用原始对话")
            pruned_dialogs = [dialog]

    except Exception as e:
        logger.error(f"[PILOT_RUN] 语义剪枝失败,使用原始对话: {e}", exc_info=True)
        pruned_dialogs = [dialog]
        if progress_callback:
            error_result = {
                "type": "pruning",
                "error": str(e),
                "fallback": "使用原始对话"
            }
            await progress_callback("text_preprocessing_result", "语义剪枝失败", error_result)
else:
    logger.info("[PILOT_RUN] 语义剪枝已关闭,跳过")
    pruning_stats = {
        "enabled": False,
    }
            await progress_callback(
                "text_preprocessing_result", "语义剪枝失败",
                {"type": "pruning", "error": str(e), "fallback": "使用原始对话"},
            )

# ========== Step 2.2: Semantic chunking ==========
chunked_dialogs = await get_chunked_dialogs_from_preprocessed(
    data=pruned_dialogs,
    chunker_strategy=memory_config.chunker_strategy,
    llm_client=llm_client,
# ── Step 2.2: Semantic chunking ────────────────────────────────────
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.chunk_extraction import (
    DialogueChunker,
)

remaining_msg_count = len(pruned_dialogs[0].context.msgs) if pruned_dialogs and pruned_dialogs[0].context else 0
logger.info(f"Processed dialogue text: {remaining_msg_count} messages after pruning")
chunked_dialogs = []
for dlg in pruned_dialogs:
    dlg.chunks = await DialogueChunker(memory_config.chunker_strategy, llm_client=llm_client).process_dialogue(dlg)
    chunked_dialogs.append(dlg)

# Progress callback: emit the result of each chunk
if progress_callback:
    for dlg in chunked_dialogs:
        if hasattr(dlg, 'chunks') and dlg.chunks:
            for i, chunk in enumerate(dlg.chunks):
                chunk_result = {
        for i, chunk in enumerate(dlg.chunks or []):
            await progress_callback(
                "text_preprocessing_result", f"分块 {i + 1} 处理完成",
                {
                    "type": "chunking",
                    "chunk_index": i + 1,
                    "content": chunk.content[:200] + "..." if len(chunk.content) > 200 else chunk.content,
                    "full_length": len(chunk.content),
                    "dialog_id": dlg.id,
                    "chunker_strategy": memory_config.chunker_strategy,
                }
                await progress_callback("text_preprocessing_result", f"分块 {i + 1} 处理完成", chunk_result)

# Build the preprocessing summary (including pruning statistics)
preprocessing_summary = {
    "total_chunks": sum(len(dlg.chunks) for dlg in chunked_dialogs if hasattr(dlg, 'chunks') and dlg.chunks),
    "total_dialogs": len(chunked_dialogs),
    "chunker_strategy": memory_config.chunker_strategy,
}

# Always include the "pruning" field so the frontend never breaks on a missing key
preprocessing_summary["pruning"] = pruning_stats if pruning_stats else {
    "enabled": memory_config.pruning_enabled,
    "deleted_count": 0,
}

await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)
                },
            )
        await progress_callback(
            "text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)",
            {
                "total_chunks": sum(len(dlg.chunks or []) for dlg in chunked_dialogs),
                "total_dialogs": len(chunked_dialogs),
                "chunker_strategy": memory_config.chunker_strategy,
                "pruning": pruning_stats,
            },
        )

log_time("Data Loading & Chunking", time.time() - step_start, log_file)

# Step 3: Initialize the pipeline orchestrator
logger.info("Initializing extraction orchestrator...")
step_start = time.time()

config = get_pipeline_config(memory_config)
logger.info(
    f"Pipeline config loaded: enable_llm_dedup_blockwise={config.deduplication.enable_llm_dedup_blockwise}, "
    f"enable_llm_disambiguation={config.deduplication.enable_llm_disambiguation}"
)

# Load ontology types (if a scene_id is configured), with fallback to general types
ontology_types = None
try:
    from app.core.memory.ontology_services.ontology_type_loader import load_ontology_types_with_fallback

    ontology_types = load_ontology_types_with_fallback(
        scene_id=memory_config.scene_id,
        workspace_id=memory_config.workspace_id,
        db=db,
        enable_general_fallback=True
    )
except Exception as e:
    logger.warning(f"Failed to load ontology types: {e}", exc_info=True)

orchestrator = ExtractionOrchestrator(
    llm_client=llm_client,
    embedder_client=embedder_client,
    connector=neo4j_connector,
    config=config,
    progress_callback=progress_callback,
    embedding_id=str(memory_config.embedding_model_id),
    language=language,
    ontology_types=ontology_types,
)

log_time("Orchestrator Initialization", time.time() - step_start, log_file)

# Step 4: Run the knowledge extraction pipeline
logger.info("Running extraction pipeline...")
# ── Step 3: Extraction (PilotWritePipeline manages its own clients and ontology loading) ──
step_start = time.time()
logger.info("Running pilot extraction pipeline...")

if progress_callback:
    await progress_callback("knowledge_extraction", "正在知识抽取...")

extraction_result = await orchestrator.run(
    dialog_data_list=chunked_dialogs,
    is_pilot_run=True,
)
from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline

# Unpack the extraction_result tuple (kept consistent with main.py)
(
    dialogue_nodes,
    chunk_nodes,
    statement_nodes,
    entity_nodes,
    _,
    statement_chunk_edges,
    statement_entity_edges,
    entity_edges,
    _,
    _
) = extraction_result
pilot_result = await PilotWritePipeline(
    memory_config=memory_config,
    end_user_id=str(memory_config.workspace_id),
    language=language,
    progress_callback=progress_callback,
).run(chunked_dialogs)

log_time("Extraction Pipeline", time.time() - step_start, log_file)

# ── Step 4: Write result files ─────────────────────────────────────
if progress_callback:
    await progress_callback("generating_results", "正在生成结果...")

# Step 5: Generate memory summaries (kept consistent with main.py)
try:
    logger.info("Generating memory summaries...")
    step_start = time.time()

    from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import (
        memory_summary_generation,
    )

    summaries = await memory_summary_generation(
        chunked_dialogs,
        llm_client=llm_client,
        embedder_client=embedder_client,
        language=language,
    )

    log_time("Memory Summary Generation", time.time() - step_start, log_file)
except Exception as e:
    logger.error(f"Memory summary step failed: {e}", exc_info=True)

logger.info("Pilot run completed: Skipping Neo4j save")

# Write extraction statistics to Redis, keyed by workspace_id
try:
    from app.cache.memory.activity_stats_cache import ActivityStatsCache

    stats_to_cache = {
        "chunk_count": len(chunk_nodes) if chunk_nodes else 0,
        "statements_count": len(statement_nodes) if statement_nodes else 0,
        "triplet_entities_count": len(entity_nodes) if entity_nodes else 0,
        "triplet_relations_count": len(entity_edges) if entity_edges else 0,
        "temporal_count": 0,  # temporal data lives in the logs; left at 0 here
    }
    await ActivityStatsCache.set_activity_stats(
        workspace_id=str(memory_config.workspace_id),
        stats=stats_to_cache,
    )
    logger.info(f"[PILOT_RUN] 活动统计已写入 Redis: workspace_id={memory_config.workspace_id}")
except Exception as cache_err:
    logger.warning(f"[PILOT_RUN] 写入活动统计缓存失败(不影响主流程): {cache_err}", exc_info=True)
graph = pilot_result.graph
settings.ensure_memory_output_dir()
export_test_input_doc(
    entity_nodes=graph.entity_nodes,
    statement_entity_edges=graph.stmt_entity_edges,
    entity_entity_edges=graph.entity_entity_edges,
)
_save_triplets_from_dialogs(
    dialog_data_list=pilot_result.dialog_data_list,
    output_path=settings.get_memory_output_path("extracted_triplets.txt"),
)
_write_extracted_result_summary(
    chunk_nodes=graph.chunk_nodes,
    pipeline_output_dir=settings.get_memory_output_path(),
)
logger.info("Pilot run completed: stop after layer-1 dedup (no Neo4j write)")

except Exception as e:
    logger.error(f"Pilot run failed: {e}", exc_info=True)
    raise
finally:
    if neo4j_connector:
        try:
            await neo4j_connector.close()
        except Exception:
            pass

total_time = time.time() - pipeline_start
log_time("TOTAL PILOT RUN TIME", total_time, log_file)

timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open(log_file, "a", encoding="utf-8") as f:
    f.write(f"=== Pilot Run Completed: {timestamp} ===\n\n")

    f.write(f"=== Pilot Run Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n\n")
logger.info(f"Pilot run complete. Total time: {total_time:.2f}s")

@@ -399,25 +399,17 @@ class UserMemoryService:
        }

        # Build the response data (timestamps converted to milliseconds)
        # Flatten profile, knowledge_tags and behavioral_hints from meta_data to the top level
        meta = end_user_info_record.meta_data or {}

        # Truncate profile list fields: return only the first MAX_PROFILE_LIST_SIZE entries (newest first)
        MAX_PROFILE_LIST_SIZE = 5
        profile = meta.get("profile")
        if isinstance(profile, dict):
            for key in ("role", "domain", "expertise", "interests"):
                if isinstance(profile.get(key), list):
                    profile[key] = profile[key][:MAX_PROFILE_LIST_SIZE]
        # meta_data exposes only four core fields
        _META_FIELDS = ("goals", "traits", "interests", "core_facts")
        raw_meta = end_user_info_record.meta_data or {}
        filtered_meta = {k: raw_meta[k] for k in _META_FIELDS if k in raw_meta}
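The whitelist comprehension drops everything else, including the previously flattened fields (standalone sketch; the sample data is invented):

raw_meta = {"goals": ["跑马拉松"], "traits": ["自律"], "profile": {"role": ["runner"]}}
_META_FIELDS = ("goals", "traits", "interests", "core_facts")
print({k: raw_meta[k] for k in _META_FIELDS if k in raw_meta})
# {'goals': ['跑马拉松'], 'traits': ['自律']}
# profile is no longer exposed, and absent keys like "interests"
# are simply omitted rather than set to None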

        response_data = {
            "end_user_info_id": str(end_user_info_record.id),
            "end_user_id": str(end_user_info_record.end_user_id),
            "other_name": end_user_info_record.other_name,
            "aliases": end_user_info_record.aliases,
            "profile": profile,
            "knowledge_tags": meta.get("knowledge_tags"),
            "behavioral_hints": meta.get("behavioral_hints"),
            "meta_data": filtered_meta,
            "created_at": datetime_to_timestamp(end_user_info_record.created_at),
            "updated_at": datetime_to_timestamp(end_user_info_record.updated_at)
        }

1053 api/app/tasks.py
File diff suppressed because it is too large
@@ -142,5 +142,10 @@ SMTP_PASSWORD=
GENERAL_ONTOLOGY_FILES=api/app/core/memory/ontology_services/General_purpose_entity.ttl # Path(s) of the ontology files to load; separate multiple files with commas
ENABLE_GENERAL_ONTOLOGY_TYPES=true # Master switch for general ontology type fusion (false = no ontology type guidance at all)
MAX_ONTOLOGY_TYPES_IN_PROMPT=100 # Cap the number of types passed to the LLM to keep the prompt short
CORE_GENERAL_TYPES=Person,Organization,Place,Event,Work,Concept # Core type list; these types are included in the merged result with priority
ONTOLOGY_EXPERIMENT_MODE=true # Whether ontology configuration may be switched dynamically via the API

# Core type whitelist, aligned with the 13 classes in ontology.md's Entity Ontology
CORE_GENERAL_TYPES=人物,组织,群体,角色职业,地点设施,物品设备,软件平台,识别联系信息,文档媒体,知识能力,偏好习惯,具体目标,称呼别名
ONTOLOGY_EXPERIMENT_MODE=true # Whether ontology configuration may be switched dynamically via the API

# Extraction stage snapshots: save each stage's output to logs/memory-output/snapshots/
PIPELINE_SNAPSHOT_ENABLED=false
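Since the whitelist is a single comma-separated string, downstream code has to split it. A minimal sketch of that step (the split-and-strip shown here is an assumption about consumption; Settings in app/core/config stores the raw string):

import os

core_types = [
    t.strip()
    for t in os.getenv("CORE_GENERAL_TYPES", "").split(",")
    if t.strip()
]
# -> ['人物', '组织', '群体', ...]  (the 13 classes from ontology.md)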
Some files were not shown because too many files have changed in this diff