From 9dc9b7aee7210545c069137e4f6115e72055f890 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 30 Apr 2026 15:58:08 +0800 Subject: [PATCH] refactor(memory): remove legacy extraction pipeline and add dialog_at temporal grounding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete ExtractionOrchestrator (~2500 lines) and write_tools legacy path; MemoryService/WritePipeline is now the sole write path - Remove NEW_PIPELINE_ENABLED feature flag from memory_agent_service - Simplify pilot_run_service to always use PilotWritePipeline - Add dialog_at field to statement and triplet extraction prompts as the primary reference time for resolving relative temporal expressions - Rewrite relative time phrases (e.g. 昨天, 下周) into concrete dates directly in statement_text when stably resolvable from dialog_at - Rename extracat_Pruning.jinja2 to extracat_pruning.jinja2; expand few-shot examples and update memory type enum (drop NULL, add agreement/repetition/other) --- api/app/core/config.py | 6 - .../langgraph_graph/nodes/write_nodes.py | 67 - .../core/memory/agent/utils/write_tools.py | 413 --- .../data_preprocessing/data_pruning.py | 6 +- .../data_preprocessing/scene_config.py | 2 +- .../extraction_orchestrator.py | 2425 ----------------- .../extraction_pipeline_orchestrator.py | 16 +- .../steps/schema/extraction_step_schema.py | 3 + .../steps/statement_temporal_step.py | 3 + .../extraction_engine/steps/triplet_step.py | 1 + .../prompt/prompts/extracat_Pruning.jinja2 | 130 - .../prompt/prompts/extracat_pruning.jinja2 | 180 ++ .../prompts/extract_statement_temporal.jinja2 | 81 +- .../prompt/prompts/extract_triplet.jinja2 | 10 +- api/app/services/memory_agent_service.py | 33 +- api/app/services/pilot_run_service.py | 337 +-- 16 files changed, 386 insertions(+), 3327 deletions(-) delete mode 100644 api/app/core/memory/agent/langgraph_graph/nodes/write_nodes.py delete mode 100644 
api/app/core/memory/agent/utils/write_tools.py delete mode 100644 api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py delete mode 100644 api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 create mode 100644 api/app/core/memory/utils/prompt/prompts/extracat_pruning.jinja2 diff --git a/api/app/core/config.py b/api/app/core/config.py index 615f5d98..56a07f3f 100644 --- a/api/app/core/config.py +++ b/api/app/core/config.py @@ -272,12 +272,6 @@ class Settings: MEMORY_OUTPUT_DIR: str = os.getenv("MEMORY_OUTPUT_DIR", "logs/memory-output") MEMORY_CONFIG_DIR: str = os.getenv("MEMORY_CONFIG_DIR", "app/core/memory") - # Pilot run pipeline switch: - # true -> use refactored PilotWritePipeline - # false -> use legacy ExtractionOrchestrator pipeline - PILOT_RUN_USE_REFACTORED_PIPELINE: bool = ( - os.getenv("PILOT_RUN_USE_REFACTORED_PIPELINE", "true").lower() == "true" - ) # Tool Management Configuration TOOL_CONFIG_DIR: str = os.getenv("TOOL_CONFIG_DIR", "app/core/tools") diff --git a/api/app/core/memory/agent/langgraph_graph/nodes/write_nodes.py b/api/app/core/memory/agent/langgraph_graph/nodes/write_nodes.py deleted file mode 100644 index 10fe96ba..00000000 --- a/api/app/core/memory/agent/langgraph_graph/nodes/write_nodes.py +++ /dev/null @@ -1,67 +0,0 @@ -from app.cache.memory.interest_memory import InterestMemoryCache -from app.core.memory.agent.utils.llm_tools import WriteState -from app.core.memory.agent.utils.write_tools import write -from app.core.logging_config import get_agent_logger - -logger = get_agent_logger(__name__) - - -async def write_node(state: WriteState) -> WriteState: - """ - Write data to the database/file system. 
- - Args: - state: WriteState containing messages, end_user_id, memory_config, and language - - Returns: - dict: Contains 'write_result' with status and data fields - """ - messages = state.get('messages', []) - end_user_id = state.get('end_user_id', '') - memory_config = state.get('memory_config', '') - language = state.get('language', 'zh') # 默认中文 - - # Convert LangChain messages to structured format expected by write() - structured_messages = [] - for msg in messages: - if hasattr(msg, 'type') and hasattr(msg, 'content'): - # Map LangChain message types to role names - role = 'user' if msg.type == 'human' else 'assistant' if msg.type == 'ai' else msg.type - structured_messages.append({ - "role": role, - "content": msg.content # content is now guaranteed to be a string - }) - - try: - result = await write( - messages=structured_messages, - end_user_id=end_user_id, - memory_config=memory_config, - language=language, - ) - logger.info(f"Write completed successfully! Config: {memory_config.config_name}") - - # 写入 neo4j 成功后,删除该用户的兴趣分布缓存,确保下次请求重新生成 - for lang in ["zh", "en"]: - deleted = await InterestMemoryCache.delete_interest_distribution( - end_user_id=end_user_id, - language=lang, - ) - if deleted: - logger.info(f"Invalidated interest distribution cache: end_user_id={end_user_id}, language={lang}") - - write_result = { - "status": "success", - "data": structured_messages, - "config_id": memory_config.config_id, - "config_name": memory_config.config_name, - } - return {"write_result": write_result} - - except Exception as e: - logger.error(f"Data_write failed: {e}", exc_info=True) - write_result = { - "status": "error", - "message": str(e), - } - return {"write_result": write_result} diff --git a/api/app/core/memory/agent/utils/write_tools.py b/api/app/core/memory/agent/utils/write_tools.py deleted file mode 100644 index fd260130..00000000 --- a/api/app/core/memory/agent/utils/write_tools.py +++ /dev/null @@ -1,413 +0,0 @@ -""" -Write Tools for Memory Knowledge 
Extraction Pipeline - -This module provides the main write function for executing the knowledge extraction -pipeline. Only MemoryConfig is needed - clients are constructed internally. -""" -import asyncio -import time -import uuid -from datetime import datetime -from typing import List, Optional - -from dotenv import load_dotenv - -from app.core.logging_config import get_agent_logger -from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs -from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import _USER_PLACEHOLDER_NAMES -from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import ExtractionOrchestrator -from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import \ - memory_summary_generation -from app.core.memory.utils.llm.llm_utils import MemoryClientFactory -from app.core.memory.utils.log.logging_utils import log_time -from app.db import get_db_context -from app.repositories.neo4j.add_edges import add_memory_summary_statement_edges -from app.repositories.neo4j.add_nodes import add_memory_summary_nodes -from app.repositories.neo4j.graph_saver import save_dialog_and_statements_to_neo4j -from app.repositories.neo4j.neo4j_connector import Neo4jConnector -from app.schemas.memory_config_schema import MemoryConfig - -load_dotenv() - -logger = get_agent_logger(__name__) - - -async def write( - end_user_id: str, - memory_config: MemoryConfig, - messages: list, - ref_id: str = "", - language: str = "zh", -) -> None: - """ - Execute the complete knowledge extraction pipeline. - - Args: - end_user_id: Group identifier - memory_config: MemoryConfig object containing all configuration - messages: Structured message list [{"role": "user", "content": "..."}, ...] 
- ref_id: Reference ID, defaults to "" - language: 语言类型 ("zh" 中文, "en" 英文),默认中文 - """ - if not ref_id: - ref_id = uuid.uuid4().hex - # Extract config values - embedding_model_id = str(memory_config.embedding_model_id) - chunker_strategy = memory_config.chunker_strategy - config_id = str(memory_config.config_id) - - logger.info("=== MemSci Knowledge Extraction Pipeline ===") - logger.info(f"Config: {memory_config.config_name} (ID: {config_id})") - logger.info(f"Workspace: {memory_config.workspace_name}") - logger.info(f"LLM model: {memory_config.llm_model_name}") - logger.info(f"Embedding model: {memory_config.embedding_model_name}") - logger.info(f"Chunker strategy: {chunker_strategy}") - logger.info(f"end_user_id ID: {end_user_id}") - - # Construct clients from memory_config using factory pattern with db session - with get_db_context() as db: - factory = MemoryClientFactory(db) - llm_client = factory.get_llm_client_from_config(memory_config) - embedder_client = factory.get_embedder_client_from_config(memory_config) - logger.info("LLM and embedding clients constructed") - - # Initialize timing log - log_file = "logs/time.log" - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(log_file, "a", encoding="utf-8") as f: - f.write(f"\n=== Pipeline Run Started: {timestamp} ===\n") - f.write(f"Config: {memory_config.config_name} (ID: {config_id})\n") - - pipeline_start = time.time() - - # Initialize Neo4j connector - neo4j_connector = Neo4jConnector() - - # Step 1: Load and chunk data - step_start = time.time() - chunked_dialogs = await get_chunked_dialogs( - chunker_strategy=chunker_strategy, - end_user_id=end_user_id, - messages=messages, - ref_id=ref_id, - config_id=config_id, - ) - log_time("Data Loading & Chunking", time.time() - step_start, log_file) - - # Step 2: Initialize and run ExtractionOrchestrator - step_start = time.time() - from app.core.memory.utils.config.config_utils import get_pipeline_config - pipeline_config = 
get_pipeline_config(memory_config) - - # Fetch ontology types if scene_id is configured - ontology_types = None - if memory_config.scene_id: - try: - from app.core.memory.ontology_services.ontology_type_loader import load_ontology_types_for_scene - - with get_db_context() as db: - ontology_types = load_ontology_types_for_scene( - scene_id=memory_config.scene_id, - workspace_id=memory_config.workspace_id, - db=db - ) - - if ontology_types: - logger.info( - f"Loaded {len(ontology_types.types)} ontology types for scene_id: {memory_config.scene_id}" - ) - else: - logger.info(f"No ontology classes found for scene_id: {memory_config.scene_id}") - except Exception as e: - logger.warning( - f"Failed to fetch ontology types for scene_id {memory_config.scene_id}: {e}", - exc_info=True - ) - - orchestrator = ExtractionOrchestrator( - llm_client=llm_client, - embedder_client=embedder_client, - connector=neo4j_connector, - config=pipeline_config, - embedding_id=embedding_model_id, - language=language, - ontology_types=ontology_types, - ) - - # Run the complete extraction pipeline - ( - all_dialogue_nodes, - all_chunk_nodes, - all_statement_nodes, - all_entity_nodes, - all_perceptual_nodes, - all_statement_chunk_edges, - all_statement_entity_edges, - all_entity_entity_edges, - all_perceptual_edges, - all_dedup_details, - ) = await orchestrator.run(chunked_dialogs, is_pilot_run=False) - -# region TODO 乐力齐 重构流水线切换至生产环境稳定后,移除快照对比代码 - # ── Snapshot: 旧流水线萃取结果(按 phase2_step_io_schema_v1.md 格式) ── - from app.core.memory.utils.debug.pipeline_snapshot import PipelineSnapshot - snapshot = PipelineSnapshot("legacy") - - # Statement 输出(从 dialog_data_list 中提取) - stmt_snapshot = [] - for d in all_dedup_details: - if not hasattr(d, "chunks"): - continue - for c in d.chunks: - for s in c.statements: - stmt_snapshot.append({ - "statement_id": s.id, - "statement_text": s.statement, - "statement_type": str(getattr(s, "stmt_type", "")), - "temporal_type": str(getattr(s, "temporal_info", "")), - 
"relevance": str(getattr(s, "relevence_info", "RELEVANT")), - "speaker": getattr(s, "speaker", "user") or "user", - "valid_at": s.temporal_validity.valid_at if s.temporal_validity else "NULL", - "invalid_at": s.temporal_validity.invalid_at if s.temporal_validity else "NULL", - }) - snapshot.save_stage("2_statement_outputs", stmt_snapshot) - - # Triplet 输出(从 dialog_data_list 中提取) - triplet_snapshot = {} - for d in all_dedup_details: - if not hasattr(d, "chunks"): - continue - for c in d.chunks: - for s in c.statements: - if s.triplet_extraction_info: - triplet_snapshot[s.id] = { - "entities": [ - { - "entity_idx": e.entity_idx, "name": e.name, - "type": e.type, "type_description": getattr(e, "type_description", ""), - "description": e.description, - "is_explicit_memory": getattr(e, "is_explicit_memory", False), - } - for e in s.triplet_extraction_info.entities - ], - "triplets": [ - { - "subject_name": t.subject_name, "subject_id": t.subject_id, - "predicate": t.predicate, - "predicate_description": getattr(t, "predicate_description", ""), - "object_name": t.object_name, "object_id": t.object_id, - } - for t in s.triplet_extraction_info.triplets - ], - } - snapshot.save_stage("3_triplet_outputs", triplet_snapshot) - - # 图节点和边(去重后) - snapshot.save_stage("6_nodes_edges_after_dedup", { - "dialogue_nodes_count": len(all_dialogue_nodes), - "chunk_nodes_count": len(all_chunk_nodes), - "statement_nodes_count": len(all_statement_nodes), - "entity_nodes": [ - {"id": e.id, "name": e.name, "entity_type": e.entity_type, "type_description": e.type_description, "description": e.description} - for e in all_entity_nodes - ], - "entity_entity_edges": [ - { - "source": e.source, "target": e.target, - "relation_type": e.relation_type, "relation_type_description": e.relation_type_description, "statement": e.statement, - } - for e in all_entity_entity_edges - ], - }) - snapshot.save_summary({ - "dialogue_count": len(all_dialogue_nodes), - "chunk_count": len(all_chunk_nodes), - 
"statement_count": len(all_statement_nodes), - "entity_count": len(all_entity_nodes), - "relation_count": len(all_entity_entity_edges), - }) -# endregion - - log_time("Extraction Pipeline", time.time() - step_start, log_file) - - # Step 3: Save all data to Neo4j database - step_start = time.time() - - # Neo4j 写入前:清洗用户/AI助手实体之间的别名交叉污染 - # 从 Neo4j 查询已有的 AI 助手别名,与本轮实体中的 AI 助手别名合并, - # 确保用户实体的 aliases 不包含 AI 助手的名字 - try: - from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( - clean_cross_role_aliases, - fetch_neo4j_assistant_aliases, - ) - neo4j_assistant_aliases = set() - if all_entity_nodes: - _eu_id = all_entity_nodes[0].end_user_id - if _eu_id: - neo4j_assistant_aliases = await fetch_neo4j_assistant_aliases(neo4j_connector, _eu_id) - clean_cross_role_aliases(all_entity_nodes, external_assistant_aliases=neo4j_assistant_aliases) - logger.info(f"Neo4j 写入前别名清洗完成,AI助手别名排除集大小: {len(neo4j_assistant_aliases)}") - except Exception as e: - logger.warning(f"Neo4j 写入前别名清洗失败(不影响主流程): {e}") - - # 添加死锁重试机制 - max_retries = 3 - retry_delay = 1 # 秒 - - for attempt in range(max_retries): - try: - success = await save_dialog_and_statements_to_neo4j( - dialogue_nodes=all_dialogue_nodes, - chunk_nodes=all_chunk_nodes, - statement_nodes=all_statement_nodes, - entity_nodes=all_entity_nodes, - perceptual_nodes=all_perceptual_nodes, - statement_chunk_edges=all_statement_chunk_edges, - statement_entity_edges=all_statement_entity_edges, - entity_edges=all_entity_entity_edges, - perceptual_edges=all_perceptual_edges, - connector=neo4j_connector, - ) - if success: - logger.info("Successfully saved all data to Neo4j") - - if all_entity_nodes: - end_user_id = all_entity_nodes[0].end_user_id - - # Neo4j 写入完成后,用 PgSQL 权威 aliases 覆盖 Neo4j 用户实体 - try: - from app.repositories.end_user_info_repository import EndUserInfoRepository - if end_user_id: - with get_db_context() as db_session: - info = 
EndUserInfoRepository(db_session).get_by_end_user_id(uuid.UUID(end_user_id)) - pg_aliases = info.aliases if info and info.aliases else [] - if info is not None: - # 将 Python 侧占位名集合作为参数传入,避免 Cypher 硬编码 - placeholder_names = list(_USER_PLACEHOLDER_NAMES) - await neo4j_connector.execute_query( - """ - MATCH (e:ExtractedEntity) - WHERE e.end_user_id = $end_user_id AND toLower(e.name) IN $placeholder_names - SET e.aliases = $aliases - """, - end_user_id=end_user_id, aliases=pg_aliases, - placeholder_names=placeholder_names, - ) - logger.info(f"[AliasSync] Neo4j 用户实体 aliases 已用 PgSQL 权威源覆盖: {pg_aliases}") - except Exception as sync_err: - logger.warning(f"[AliasSync] PgSQL→Neo4j aliases 同步失败(不影响主流程): {sync_err}") - - # 使用 Celery 异步任务触发聚类(不阻塞主流程) - try: - from app.tasks import run_incremental_clustering - - new_entity_ids = [e.id for e in all_entity_nodes] - task = run_incremental_clustering.apply_async( - kwargs={ - "end_user_id": end_user_id, - "new_entity_ids": new_entity_ids, - "llm_model_id": str(memory_config.llm_model_id) if memory_config.llm_model_id else None, - "embedding_model_id": str(memory_config.embedding_model_id) if memory_config.embedding_model_id else None, - }, - priority=3, - ) - logger.info( - f"[Clustering] 增量聚类任务已提交到 Celery - " - f"task_id={task.id}, end_user_id={end_user_id}, entity_count={len(new_entity_ids)}" - ) - except Exception as e: - logger.error(f"[Clustering] 提交聚类任务失败(不影响主流程): {e}", exc_info=True) - - break - else: - logger.warning("Failed to save some data to Neo4j") - if attempt < max_retries - 1: - logger.info(f"Retrying... (attempt {attempt + 2}/{max_retries})") - await asyncio.sleep(retry_delay * (attempt + 1)) # 指数退避 - except Exception as e: - error_msg = str(e) - # 检查是否是死锁错误 - if "DeadlockDetected" in error_msg or "deadlock" in error_msg.lower(): - if attempt < max_retries - 1: - logger.warning(f"Deadlock detected, retrying... 
(attempt {attempt + 2}/{max_retries})") - await asyncio.sleep(retry_delay * (attempt + 1)) # 指数退避 - else: - logger.error(f"Failed after {max_retries} attempts due to deadlock: {e}") - raise - else: - # 非死锁错误,直接抛出 - raise - - try: - await neo4j_connector.close() - except Exception as e: - logger.error(f"Error closing Neo4j connector: {e}") - - log_time("Neo4j Database Save", time.time() - step_start, log_file) - - # Step 4: Generate Memory summaries and save to Neo4j - step_start = time.time() - try: - summaries = await memory_summary_generation( - chunked_dialogs, llm_client=llm_client, embedder_client=embedder_client, language=language - ) - ms_connector = Neo4jConnector() - try: - await add_memory_summary_nodes(summaries, ms_connector) - await add_memory_summary_statement_edges(summaries, ms_connector) - finally: - try: - await ms_connector.close() - except Exception: - pass - except Exception as e: - logger.error(f"Memory summary step failed: {e}", exc_info=True) - finally: - log_time("Memory Summary (Neo4j)", time.time() - step_start, log_file) - - # Log total pipeline time - total_time = time.time() - pipeline_start - log_time("TOTAL PIPELINE TIME", total_time, log_file) - - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(log_file, "a", encoding="utf-8") as f: - f.write(f"=== Pipeline Run Completed: {timestamp} ===\n\n") - - # 将提取统计写入 Redis,按 workspace_id 存储 - try: - from app.cache.memory.activity_stats_cache import ActivityStatsCache - - stats_to_cache = { - "chunk_count": len(all_chunk_nodes) if all_chunk_nodes else 0, - "statements_count": len(all_statement_nodes) if all_statement_nodes else 0, - "triplet_entities_count": len(all_entity_nodes) if all_entity_nodes else 0, - "triplet_relations_count": len(all_entity_entity_edges) if all_entity_entity_edges else 0, - "temporal_count": 0, - } - await ActivityStatsCache.set_activity_stats( - workspace_id=str(memory_config.workspace_id), - stats=stats_to_cache, - ) - logger.info(f"[WRITE] 
活动统计已写入 Redis: workspace_id = {memory_config.workspace_id}") - except Exception as cache_err: - logger.warning(f"[WRITE] 写入活动统计缓存失败(不影响主流程): {cache_err}", exc_info=True) - - # Close LLM/Embedder underlying httpx clients to prevent - # 'RuntimeError: Event loop is closed' during garbage collection - for client_obj in (llm_client, embedder_client): - try: - underlying = getattr(client_obj, 'client', None) or getattr(client_obj, 'model', None) - if underlying is None: - continue - # Unwrap RedBearLLM / RedBearEmbeddings to get the LangChain model - inner = getattr(underlying, '_model', underlying) - # LangChain OpenAI models expose async_client (httpx.AsyncClient) - http_client = getattr(inner, 'async_client', None) - if http_client is not None and hasattr(http_client, 'aclose'): - await http_client.aclose() - except Exception: - pass - - logger.info("=== Pipeline Complete ===") - logger.info(f"Total execution time: {total_time:.2f} seconds") diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 07481070..fd824a2d 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -98,7 +98,7 @@ class SemanticPruner: self._snapshot = snapshot # PipelineSnapshot 实例,用于输出剪枝快照 # 加载 Jinja2 模板 - self.template = prompt_env.get_template("extracat_Pruning.jinja2") + self.template = prompt_env.get_template("extracat_pruning.jinja2") # LRU 缓存:避免对相同消息对重复调用 LLM self._cache: OrderedDict[str, AssistantPruningResponse] = OrderedDict() @@ -360,7 +360,7 @@ class SemanticPruner: ) -> AssistantPruningResponse: """调用 LLM 从 User-Assistant 消息对中提取 Assistant 记忆摘要。 - 使用 extracat_Pruning.jinja2 模板,输入格式: + 使用 extracat_pruning.jinja2 模板,输入格式: {"msgs": [{"role": "User", "msg": "..."}, {"role": "Assistant", "msg": "..."}]} 
""" # 构建模板输入 @@ -387,7 +387,7 @@ class SemanticPruner: # 渲染模板 rendered = self.template.render(dialog_text=dialog_text) - log_template_rendering("extracat_Pruning.jinja2", { + log_template_rendering("extracat_pruning.jinja2", { "language": self.language, }) log_prompt_rendering("pruning-assistant-hint", rendered) diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py index 8e97163e..26c7eecb 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py @@ -1,7 +1,7 @@ """ 场景特定配置 - 统一填充词库 -重要性判断已完全交由 extracat_Pruning.jinja2 提示词 + LLM preserve_tokens 机制承担。 +重要性判断已完全交由 extracat_pruning.jinja2 提示词 + LLM preserve_tokens 机制承担。 本模块仅保留统一填充词库(filler_phrases),用于识别无意义寒暄/表情/口头禅。 所有场景共用同一份词库,场景差异由 LLM 语义判断处理。 """ diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py deleted file mode 100644 index 7770d5d4..00000000 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py +++ /dev/null @@ -1,2425 +0,0 @@ -""" -萃取引擎 - 流水线编排器 - -该模块提供了一个统一的流水线编排器,用于协调整个知识提取流程。 -它整合了数据预处理、知识提取、去重消歧等模块,提供统一的执行接口。 - -主要功能: -1. 协调数据预处理、分块、陈述句提取、三元组提取、时间信息提取等步骤 -2. 管理嵌入向量生成 -3. 执行两阶段去重和消歧 -4. 将提取结果转换为图数据库节点和边 -5. 提供错误处理和日志记录 -6. 
支持试运行模式(不写入数据库) - -作者: -日期:2025-11-21 -""" - -import asyncio -import logging -import os -import uuid -from datetime import datetime -from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple - -from app.core.memory.llm_tools.openai_client import LLMClient -from app.core.memory.llm_tools.openai_embedder import OpenAIEmbedderClient -from app.core.memory.models.graph_models import ( - ChunkNode, - DialogueNode, - EntityEntityEdge, - ExtractedEntityNode, - StatementChunkEdge, - StatementEntityEdge, - StatementNode, - PerceptualEdge, - PerceptualNode -) -from app.core.memory.models.message_models import DialogData -from app.core.memory.models.ontology_extraction_models import OntologyTypeList -from app.core.memory.models.variate_config import ( - ExtractionPipelineConfig, -) -from app.core.memory.storage_services.extraction_engine.deduplication.two_stage_dedup import ( - dedup_layers_and_merge_and_return, -) -from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( - _USER_PLACEHOLDER_NAMES, - fetch_neo4j_assistant_aliases, -) -from app.core.memory.storage_services.extraction_engine.knowledge_extraction.embedding_generation import ( - embedding_generation, - generate_entity_embeddings_from_triplets, -) -# 导入各个提取模块 -from app.core.memory.storage_services.extraction_engine.knowledge_extraction.statement_extraction import ( - StatementExtractor, -) -from app.core.memory.storage_services.extraction_engine.knowledge_extraction.temporal_extraction import ( - TemporalExtractor, -) -from app.core.memory.storage_services.extraction_engine.knowledge_extraction.triplet_extraction import ( - TripletExtractor, -) -from app.core.memory.storage_services.extraction_engine.pipeline_help import ( - _write_extracted_result_summary, - export_test_input_doc, -) -from app.core.memory.utils.data.ontology import TemporalInfo -from app.db import get_db_context -from app.models.end_user_info_model import EndUserInfo -from 
app.repositories.end_user_info_repository import EndUserInfoRepository -from app.repositories.end_user_repository import EndUserRepository -from app.repositories.neo4j.neo4j_connector import Neo4jConnector - -# 配置日志 -logger = logging.getLogger(__name__) - - -class ExtractionOrchestrator: - """ - 知识提取流水线编排器 - - 该类负责协调整个知识提取流程,包括: - 1. 陈述句提取 - 2. 三元组提取 - 3. 时间信息提取 - 4. 嵌入向量生成 - 5. 数据赋值到语句 - 6. 节点和边的创建 - 7. 两阶段去重和消歧 - 8. 结果汇总和输出 - - Attributes: - llm_client: LLM 客户端,用于调用大语言模型 - embedder_client: 嵌入模型客户端,用于生成向量嵌入 - connector: Neo4j 连接器,用于数据库操作 - config: 流水线配置 - """ - - def __init__( - self, - llm_client: LLMClient, - embedder_client: OpenAIEmbedderClient, - connector: Neo4jConnector, - config: Optional[ExtractionPipelineConfig] = None, - progress_callback: Optional[Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]] = None, - embedding_id: Optional[str] = None, - ontology_types: Optional[OntologyTypeList] = None, - enable_general_types: bool = True, - language: str = "zh", - ): - """ - 初始化流水线编排器 - - Args: - llm_client: LLM 客户端 - embedder_client: 嵌入模型客户端 - connector: Neo4j 连接器 - config: 流水线配置,如果为 None 则使用默认配置 - progress_callback: 进度回调函数 - - 接受 (stage: str, message: str, data: Optional[Dict[str, Any]]) 并返回 Awaitable[None] - - 在管线关键点调用以报告进度和结果数据 - embedding_id: 嵌入模型ID,如果为 None 则从全局配置获取(向后兼容) - language: 语言类型 ("zh" 中文, "en" 英文),默认中文 - """ - self.llm_client = llm_client - self.embedder_client = embedder_client - self.connector = connector - self.config = config or ExtractionPipelineConfig() - self.is_pilot_run = False # 默认非试运行模式 - self.progress_callback = progress_callback # 保存进度回调函数 - self.embedding_id = embedding_id # 保存嵌入模型ID - self.language = language # 保存语言配置 - - # 处理本体类型配置 - # 根据 enable_general_types 参数决定是否将通用本体类型与场景特定类型合并 - # 如果启用合并且配置中开启了通用本体功能,则使用 OntologyTypeMerger 进行融合 - if enable_general_types and ontology_types: - from app.core.memory.ontology_services.ontology_type_loader import ( - get_ontology_type_merger, - is_general_ontology_enabled, - ) - if 
is_general_ontology_enabled(): - merger = get_ontology_type_merger() - self.ontology_types = merger.merge(ontology_types) - logger.info( - f"已启用通用本体类型融合: 场景类型 {len(ontology_types.types) if ontology_types.types else 0} 个 -> " - f"合并后 {len(self.ontology_types.types) if self.ontology_types.types else 0} 个" - ) - else: - self.ontology_types = ontology_types - logger.info("通用本体类型功能已在配置中禁用,仅使用场景类型") - else: - self.ontology_types = ontology_types - if not enable_general_types and ontology_types: - logger.info("enable_general_types=False,仅使用场景类型") - - # 保存去重消歧的详细记录(内存中的数据结构) - self.dedup_merge_records: List[Dict[str, Any]] = [] # 实体合并记录 - self.dedup_disamb_records: List[Dict[str, Any]] = [] # 实体消歧记录 - self.id_redirect_map: Dict[str, str] = {} # ID重定向映射 - - # 初始化各个提取器 - self.statement_extractor = StatementExtractor( - llm_client=llm_client, - config=self.config.statement_extraction, - ) - self.triplet_extractor = TripletExtractor(llm_client=llm_client, ontology_types=self.ontology_types, - language=language) - self.temporal_extractor = TemporalExtractor(llm_client=llm_client) - - logger.info("ExtractionOrchestrator 初始化完成") - - async def run( - self, - dialog_data_list: List[DialogData], - is_pilot_run: bool = False, - ) -> tuple[ - list[DialogueNode], - list[ChunkNode], - list[StatementNode], - list[ExtractedEntityNode], - list[PerceptualNode], - list[StatementChunkEdge], - list[StatementEntityEdge], - list[EntityEntityEdge], - list[PerceptualEdge], - list[DialogData] - ]: - """ - 运行完整的知识提取流水线(优化版:并行执行) - - 该方法协调所有提取步骤,优化执行顺序: - 1. 陈述句提取 - 2. 并行执行:三元组提取 + 时间信息提取 + 陈述句/分块嵌入生成 - 3. 实体嵌入生成(依赖三元组) - 4. 数据赋值 - 5. 节点和边创建 - 6. 两阶段去重 - 7. 
结果汇总 - - Args: - dialog_data_list: 已分块的对话数据列表 - is_pilot_run: 是否为试运行模式(不写入数据库) - - Returns: - 包含三个元组的元组: - - 第一个元组:(对话节点列表, 分块节点列表, 陈述句节点列表) - - 第二个元组:去重前的 (实体节点列表, 陈述句-实体边列表, 实体-实体边列表) - - 第三个元组:去重后的 (实体节点列表, 陈述句-实体边列表, 实体-实体边列表) - """ - try: - # 设置试运行模式标志 - self.is_pilot_run = is_pilot_run - mode_str = "试运行模式" if is_pilot_run else "正式模式" - logger.info(f"开始运行知识提取流水线(优化版 - {mode_str}),共 {len(dialog_data_list)} 个对话") - - # 步骤 1: 陈述句提取 - logger.info("步骤 1/6: 陈述句提取(全局分块级并行)") - dialog_data_list = await self._extract_statements(dialog_data_list) - - # 收集陈述句内容和统计数量 - all_statements_list = [] - for dialog in dialog_data_list: - for chunk in dialog.chunks: - all_statements_list.extend(chunk.statements) - - # 步骤 2: 并行执行三元组提取、时间信息提取、情绪提取和基础嵌入生成 - logger.info("步骤 2/6: 并行执行三元组提取、时间信息提取、情绪提取和嵌入生成") - ( - triplet_maps, - temporal_maps, - emotion_maps, - statement_embedding_maps, - chunk_embedding_maps, - dialog_embeddings, - ) = await self._parallel_extract_and_embed(dialog_data_list) - - # 收集实体和三元组内容,并统计数量 - all_entities_list = [] - all_triplets_list = [] - for triplet_map in triplet_maps: - for triplet_info in triplet_map.values(): - if triplet_info: - all_entities_list.extend(triplet_info.entities) - all_triplets_list.extend(triplet_info.triplets) - - # 步骤 3: 生成实体嵌入(依赖三元组提取结果) - logger.info("步骤 3/6: 生成实体嵌入") - triplet_maps = await self._generate_entity_embeddings(triplet_maps) - - # 步骤 4: 将提取的数据赋值到语句 - logger.info("步骤 4/6: 数据赋值") - dialog_data_list = await self._assign_extracted_data( - dialog_data_list, - temporal_maps, - triplet_maps, - emotion_maps, - statement_embedding_maps, - chunk_embedding_maps, - dialog_embeddings, - ) - - # 步骤 5: 创建节点和边 - logger.info("步骤 5/6: 创建节点和边") - - # 注意:creating_nodes_edges 消息已在知识抽取完成后立即发送 - - ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - perceptual_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - perceptual_edges - ) = await self._create_nodes_and_edges(dialog_data_list) - - # 
导出去重前的测试输入文档(试运行和正式模式都需要,用于生成结果汇总) - export_test_input_doc(entity_nodes, statement_entity_edges, entity_entity_edges) - - # 步骤 6: 两阶段去重和消歧 - if is_pilot_run: - logger.info("步骤 6/6: 去重和消歧(试运行模式:仅第一层去重)") - else: - logger.info("步骤 6/6: 两阶段去重和消歧") - - # 注意:deduplication 消息已在创建节点和边完成后立即发送 - - ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - dialog_data_list, - dedup_details, - ) = await self._run_dedup_and_write_summary( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - dialog_data_list, - ) - - # 步骤 7: 元数据提取已迁移到新流水线(WritePipeline._extract_metadata), - # 旧编排器不再触发异步元数据提取任务。 - - logger.info(f"知识提取流水线运行完成({mode_str})") - return ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - perceptual_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - perceptual_edges, - dialog_data_list, - ) - - except Exception as e: - logger.error(f"知识提取流水线运行失败: {e}", exc_info=True) - raise - - async def _extract_statements( - self, dialog_data_list: List[DialogData] - ) -> List[DialogData]: - """ - 从对话中提取陈述句(流式输出版本:边提取边发送进度) - - Args: - dialog_data_list: 对话数据列表 - - Returns: - 更新后的对话数据列表(包含提取的陈述句) - """ - logger.info("开始陈述句提取(全局分块级并行 + 流式输出)") - - # 收集所有分块及其元数据 - all_chunks = [] - chunk_metadata = [] # (dialog_idx, chunk_idx) - - for d_idx, dialog in enumerate(dialog_data_list): - dialogue_content = dialog.content if self.config.statement_extraction.include_dialogue_context else None - for c_idx, chunk in enumerate(dialog.chunks): - all_chunks.append((chunk, dialog.end_user_id, dialogue_content)) - chunk_metadata.append((d_idx, c_idx)) - - logger.info(f"收集到 {len(all_chunks)} 个分块,开始全局并行提取") - - # 用于跟踪已完成的分块数量 - completed_chunks = 0 - total_chunks = len(all_chunks) - - # 全局并行处理所有分块 - async def extract_for_chunk(chunk_data, chunk_index): - nonlocal 
completed_chunks - chunk, end_user_id, dialogue_content = chunk_data - try: - statements = await self.statement_extractor._extract_statements(chunk, end_user_id, dialogue_content) - - # 流式输出:每提取完一个分块的陈述句,立即发送进度 - # 注意:只在试运行模式下发送陈述句详情,正式模式不发送 - completed_chunks += 1 - if self.progress_callback and statements and self.is_pilot_run: - # 发送前3个陈述句作为示例 - for idx, stmt in enumerate(statements[:3]): - stmt_result = { - "extraction_type": "statement", - "statement": stmt.statement, - "statement_id": stmt.id, - "chunk_progress": f"{completed_chunks}/{total_chunks}", - "statement_index_in_chunk": idx + 1 - } - await self.progress_callback( - "knowledge_extraction_result", - f"陈述句提取中 ({completed_chunks}/{total_chunks})", - stmt_result - ) - - return statements - except Exception as e: - logger.error(f"分块 {chunk.id} 陈述句提取失败: {e}") - completed_chunks += 1 - return [] - - tasks = [extract_for_chunk(chunk_data, i) for i, chunk_data in enumerate(all_chunks)] - results = await asyncio.gather(*tasks, return_exceptions=True) - - # 将结果分配回对话 - for i, result in enumerate(results): - d_idx, c_idx = chunk_metadata[i] - if isinstance(result, Exception): - logger.error(f"分块处理异常: {result}") - dialog_data_list[d_idx].chunks[c_idx].statements = [] - elif isinstance(result, list): - dialog_data_list[d_idx].chunks[c_idx].statements = result - else: - dialog_data_list[d_idx].chunks[c_idx].statements = [] - - # 统计并保存(试运行和正式模式都需要保存,用于生成结果汇总) - all_statements = [] - for dialog in dialog_data_list: - for chunk in dialog.chunks: - if chunk.statements: - all_statements.extend(chunk.statements) - - # 保存陈述句到文件(试运行和正式模式都需要) - self.statement_extractor.save_statements(all_statements) - - logger.info(f"陈述句提取完成,共提取 {len(all_statements)} 条陈述句") - - # 试运行模式下,所有分块提取完成后发送完成事件 - if self.progress_callback and self.is_pilot_run: - await self.progress_callback( - "knowledge_extraction_complete", - f"陈述句提取完成,共提取 {len(all_statements)} 条", - {"total_statements": len(all_statements), "total_chunks": total_chunks} - ) - - 
return dialog_data_list - - async def _extract_triplets( - self, dialog_data_list: List[DialogData] - ) -> List[Dict[str, Any]]: - """ - 从对话中提取三元组(流式输出版本:边提取边发送进度) - - Args: - dialog_data_list: 对话数据列表 - - Returns: - 三元组映射列表,每个对话对应一个字典 - """ - logger.info("开始三元组提取(全局陈述句级并行 + 流式输出)") - - # 收集所有陈述句及其元数据 - all_statements = [] - statement_metadata = [] # (dialog_idx, statement_id, chunk_content) - - for d_idx, dialog in enumerate(dialog_data_list): - for chunk in dialog.chunks: - for statement in chunk.statements: - all_statements.append((statement, chunk.content)) - statement_metadata.append((d_idx, statement.id)) - - logger.info(f"收集到 {len(all_statements)} 个陈述句,开始全局并行提取三元组") - - # 用于跟踪已完成的陈述句数量 - completed_statements = 0 - len(all_statements) - - # 全局并行处理所有陈述句 - async def extract_for_statement(stmt_data, stmt_index): - nonlocal completed_statements - statement, chunk_content = stmt_data - try: - triplet_info = await self.triplet_extractor._extract_triplets(statement, chunk_content) - - # 注意:不再发送三元组提取的流式输出 - # 三元组提取在后台执行,但不向前端发送详细信息 - completed_statements += 1 - - return triplet_info - except Exception as e: - logger.error(f"陈述句 {statement.id} 三元组提取失败: {e}") - completed_statements += 1 - from app.core.memory.models.triplet_models import ( - TripletExtractionResponse, - ) - return TripletExtractionResponse(triplets=[], entities=[]) - - tasks = [extract_for_statement(stmt_data, i) for i, stmt_data in enumerate(all_statements)] - results = await asyncio.gather(*tasks, return_exceptions=True) - - # 将结果组织成对话级别的映射 - triplet_maps = [{} for _ in dialog_data_list] - all_responses = [] - - for i, result in enumerate(results): - d_idx, stmt_id = statement_metadata[i] - if isinstance(result, Exception): - logger.error(f"陈述句处理异常: {result}") - from app.core.memory.models.triplet_models import ( - TripletExtractionResponse, - ) - triplet_maps[d_idx][stmt_id] = TripletExtractionResponse(triplets=[], entities=[]) - else: - triplet_maps[d_idx][stmt_id] = result - 
all_responses.append(result) - - # 统计提取结果 - total_triplets = sum(len(m) for m in triplet_maps) - logger.info(f"三元组提取完成,共提取 {total_triplets} 个三元组") - - # 保存三元组到文件(试运行和正式模式都需要,用于生成结果汇总) - if all_responses: - try: - self.triplet_extractor.save_triplets(all_responses) - logger.info("三元组数据已保存到文件") - except Exception as e: - logger.error(f"保存三元组到文件失败: {e}", exc_info=True) - - return triplet_maps - - async def _extract_temporal( - self, dialog_data_list: List[DialogData] - ) -> List[Dict[str, Any]]: - """ - 从对话中提取时间信息(流式输出版本:边提取边发送进度) - - Args: - dialog_data_list: 对话数据列表 - - Returns: - 时间信息映射列表,每个对话对应一个字典 - """ - # 试运行模式:跳过时间提取以节省时间 - if self.is_pilot_run: - logger.info("试运行模式:跳过时间信息提取(节省约 10-15 秒)") - # 为所有陈述句返回空的时间范围 - from app.core.memory.models.message_models import TemporalValidityRange - temporal_maps = [] - for dialog in dialog_data_list: - temporal_map = {} - for chunk in dialog.chunks: - for statement in chunk.statements: - temporal_map[statement.id] = TemporalValidityRange(valid_at=None, invalid_at=None) - temporal_maps.append(temporal_map) - return temporal_maps - - logger.info("开始时间信息提取(全局陈述句级并行 + 流式输出)") - - # 收集所有需要提取时间的陈述句 - all_statements = [] - statement_metadata = [] # (dialog_idx, statement_id, ref_dates) - - for d_idx, dialog in enumerate(dialog_data_list): - # 获取参考日期 - ref_dates = {} - if hasattr(dialog, 'metadata') and dialog.metadata: - if 'conversation_date' in dialog.metadata: - ref_dates['conversation_date'] = dialog.metadata['conversation_date'] - if 'publication_date' in dialog.metadata: - ref_dates['publication_date'] = dialog.metadata['publication_date'] - - if not ref_dates: - from datetime import datetime - ref_dates = {"today": datetime.now().strftime("%Y-%m-%d")} - - for chunk in dialog.chunks: - for statement in chunk.statements: - # 跳过 ATEMPORAL 类型的陈述句 - from app.core.memory.utils.data.ontology import TemporalInfo - if statement.temporal_info != TemporalInfo.ATEMPORAL: - all_statements.append((statement, ref_dates)) - 
statement_metadata.append((d_idx, statement.id)) - - logger.info(f"收集到 {len(all_statements)} 个需要时间提取的陈述句,开始全局并行提取") - - # 用于跟踪已完成的时间提取数量 - completed_temporal = 0 - len(all_statements) - - # 全局并行处理所有陈述句 - async def extract_for_statement(stmt_data, stmt_index): - nonlocal completed_temporal - statement, ref_dates = stmt_data - try: - temporal_range = await self.temporal_extractor._extract_temporal_ranges(statement, ref_dates) - - # 注意:不再发送时间提取的流式输出 - # 时间提取在后台执行,但不向前端发送详细信息 - completed_temporal += 1 - - return temporal_range - except Exception as e: - logger.error(f"陈述句 {statement.id} 时间信息提取失败: {e}") - completed_temporal += 1 - from app.core.memory.models.message_models import TemporalValidityRange - return TemporalValidityRange(valid_at=None, invalid_at=None) - - tasks = [extract_for_statement(stmt_data, i) for i, stmt_data in enumerate(all_statements)] - results = await asyncio.gather(*tasks, return_exceptions=True) - - # 将结果组织成对话级别的映射 - temporal_maps = [{} for _ in dialog_data_list] - - for i, result in enumerate(results): - d_idx, stmt_id = statement_metadata[i] - if isinstance(result, Exception): - logger.error(f"陈述句处理异常: {result}") - from app.core.memory.models.message_models import TemporalValidityRange - temporal_maps[d_idx][stmt_id] = TemporalValidityRange(valid_at=None, invalid_at=None) - else: - temporal_maps[d_idx][stmt_id] = result - - # 为 ATEMPORAL 陈述句添加空的时间范围 - from app.core.memory.models.message_models import TemporalValidityRange - from app.core.memory.utils.data.ontology import TemporalInfo - for d_idx, dialog in enumerate(dialog_data_list): - for chunk in dialog.chunks: - for statement in chunk.statements: - if statement.temporal_info == TemporalInfo.ATEMPORAL and statement.id not in temporal_maps[d_idx]: - temporal_maps[d_idx][statement.id] = TemporalValidityRange(valid_at=None, invalid_at=None) - - # 统计提取结果 - total_temporal = sum(len(m) for m in temporal_maps) - logger.info(f"时间信息提取完成,共提取 {total_temporal} 个时间范围") - - return temporal_maps - - 
async def _extract_emotions( - self, dialog_data_list: List[DialogData] - ) -> List[Dict[str, Any]]: - """ - 从对话中提取情绪信息(仅针对用户消息,全局陈述句级并行) - - Args: - dialog_data_list: 对话数据列表 - - Returns: - 情绪信息映射列表,每个对话对应一个字典 - """ - logger.info("开始情绪信息提取(仅处理用户消息)") - - # 收集所有陈述句及其配置 - all_statements = [] - statement_metadata = [] # (dialog_idx, statement_id) - - # 获取第一个对话的config_id来加载配置 - config_id = None - if dialog_data_list and hasattr(dialog_data_list[0], 'config_id'): - config_id = dialog_data_list[0].config_id - - # 加载MemoryConfig - memory_config = None - if config_id: - try: - from app.db import SessionLocal - from app.repositories.memory_config_repository import MemoryConfigRepository - - db = SessionLocal() - try: - memory_config = MemoryConfigRepository.get_by_id(db, config_id) - finally: - db.close() - - if memory_config and not memory_config.emotion_enabled: - logger.info("情绪提取已在配置中禁用,跳过情绪提取") - return [{} for _ in dialog_data_list] - - except Exception as e: - logger.warning(f"加载MemoryConfig失败: {e},将跳过情绪提取") - return [{} for _ in dialog_data_list] - else: - logger.info("未找到config_id,跳过情绪提取") - return [{} for _ in dialog_data_list] - - # 如果配置未启用情绪提取,直接返回空映射 - if not memory_config or not memory_config.emotion_enabled: - logger.info("情绪提取未启用,跳过") - return [{} for _ in dialog_data_list] - - # 收集所有陈述句(只收集 speaker 为 "user" 的) - total_statements = 0 - filtered_statements = 0 - - for d_idx, dialog in enumerate(dialog_data_list): - for chunk in dialog.chunks: - for statement in chunk.statements: - total_statements += 1 - # 只处理用户的陈述句 (role 为 "user") - if hasattr(statement, 'speaker') and statement.speaker == "user": - all_statements.append((statement, memory_config)) - statement_metadata.append((d_idx, statement.id)) - filtered_statements += 1 - - logger.info(f"总陈述句: {total_statements}, 用户陈述句: {filtered_statements}, 开始全局并行提取情绪") - - # 初始化情绪提取服务 - # 如果 emotion_model_id 为空,回退到工作空间默认 LLM - from app.services.emotion_extraction_service import EmotionExtractionService - - 
emotion_model_id = memory_config.emotion_model_id - if not emotion_model_id and memory_config.workspace_id: - from app.repositories.workspace_repository import get_workspace_models_configs - from app.db import SessionLocal - - db = SessionLocal() - try: - workspace_models = get_workspace_models_configs(db, memory_config.workspace_id) - if workspace_models and workspace_models.get("llm"): - emotion_model_id = workspace_models["llm"] - logger.info(f"emotion_model_id 为空,使用工作空间默认 LLM: {emotion_model_id}") - finally: - db.close() - - emotion_service = EmotionExtractionService( - llm_id=emotion_model_id if emotion_model_id else None - ) - - # 全局并行处理所有陈述句 - async def extract_for_statement(stmt_data): - statement, config = stmt_data - try: - return await emotion_service.extract_emotion(statement.statement, config) - except Exception as e: - logger.error(f"陈述句 {statement.id} 情绪提取失败: {e}") - return None - - tasks = [extract_for_statement(stmt_data) for stmt_data in all_statements] - results = await asyncio.gather(*tasks, return_exceptions=True) - - # 将结果组织成对话级别的映射 - emotion_maps = [{} for _ in dialog_data_list] - successful_extractions = 0 - - for i, result in enumerate(results): - d_idx, stmt_id = statement_metadata[i] - if isinstance(result, Exception): - logger.error(f"陈述句处理异常: {result}") - emotion_maps[d_idx][stmt_id] = None - else: - emotion_maps[d_idx][stmt_id] = result - if result is not None: - successful_extractions += 1 - - # 统计提取结果 - logger.info(f"情绪信息提取完成,共成功提取 {successful_extractions}/{len(all_statements)} 个情绪") - - return emotion_maps - - async def _parallel_extract_and_embed( - self, dialog_data_list: List[DialogData] - ) -> Tuple[ - List[Dict[str, Any]], - List[Dict[str, Any]], - List[Dict[str, Any]], - List[Dict[str, List[float]]], - List[Dict[str, List[float]]], - List[List[float]], - ]: - """ - 并行执行三元组提取、时间信息提取、情绪提取和基础嵌入生成 - - 这四个任务都依赖陈述句提取的结果,但彼此独立,可以并行执行: - - 三元组提取:从陈述句中提取实体和关系 - - 时间信息提取:从陈述句中提取时间范围 - - 情绪提取:从陈述句中提取情绪信息 - - 嵌入生成:为陈述句、分块和对话生成向量(不依赖三元组) - 
- Args: - dialog_data_list: 对话数据列表 - - Returns: - 六个列表的元组: - - 三元组映射列表 - - 时间信息映射列表 - - 情绪映射列表 - - 陈述句嵌入映射列表 - - 分块嵌入映射列表 - - 对话嵌入列表 - """ - logger.info("并行执行:三元组提取 + 时间信息提取 + 情绪提取 + 基础嵌入生成") - - # 创建四个并行任务 - triplet_task = self._extract_triplets(dialog_data_list) - temporal_task = self._extract_temporal(dialog_data_list) - emotion_task = self._extract_emotions(dialog_data_list) - embedding_task = self._generate_basic_embeddings(dialog_data_list) - - # 并行执行 - results = await asyncio.gather( - triplet_task, - temporal_task, - emotion_task, - embedding_task, - return_exceptions=True - ) - - # 解包结果 - triplet_maps = results[0] if not isinstance(results[0], Exception) else [{} for _ in dialog_data_list] - temporal_maps = results[1] if not isinstance(results[1], Exception) else [{} for _ in dialog_data_list] - emotion_maps = results[2] if not isinstance(results[2], Exception) else [{} for _ in dialog_data_list] - - if isinstance(results[3], Exception): - logger.error(f"基础嵌入生成失败: {results[3]}") - statement_embedding_maps = [{} for _ in dialog_data_list] - chunk_embedding_maps = [{} for _ in dialog_data_list] - dialog_embeddings = [[] for _ in dialog_data_list] - else: - statement_embedding_maps, chunk_embedding_maps, dialog_embeddings = results[3] - - logger.info("并行任务执行完成") - return ( - triplet_maps, - temporal_maps, - emotion_maps, - statement_embedding_maps, - chunk_embedding_maps, - dialog_embeddings, - ) - - async def _generate_basic_embeddings( - self, dialog_data_list: List[DialogData] - ) -> Tuple[List[Dict[str, List[float]]], List[Dict[str, List[float]]], List[List[float]]]: - """ - 生成基础嵌入向量(陈述句、分块、对话) - - 这些嵌入不依赖三元组提取结果,可以提前生成 - 在试运行模式下,跳过嵌入生成以节省时间 - - Args: - dialog_data_list: 对话数据列表 - - Returns: - 三个列表的元组: - - 陈述句嵌入映射列表 - - 分块嵌入映射列表 - - 对话嵌入列表 - """ - # 试运行模式:跳过嵌入生成 - if self.is_pilot_run: - logger.info("试运行模式:跳过基础嵌入生成(节省约 20 秒)") - return ( - [{} for _ in dialog_data_list], - [{} for _ in dialog_data_list], - [[] for _ in dialog_data_list], - ) - - 
logger.info("开始生成基础嵌入向量(陈述句、分块、对话)") - - try: - # embedding_id is required - no fallback to global variable - if not self.embedding_id: - logger.error("embedding_id is required but was not provided to ExtractionOrchestrator") - raise ValueError("embedding_id is required but was not provided") - - # 只生成陈述句、分块和对话的嵌入(不包括实体) - statement_embedding_maps, chunk_embedding_maps, dialog_embeddings = await embedding_generation( - dialog_data_list, self.embedding_id - ) - - # 统计生成结果 - total_statement_embeddings = sum(len(m) for m in statement_embedding_maps) - total_chunk_embeddings = sum(len(m) for m in chunk_embedding_maps) - logger.info( - f"基础嵌入生成完成:{total_statement_embeddings} 个陈述句嵌入," - f"{total_chunk_embeddings} 个分块嵌入,{len(dialog_embeddings)} 个对话嵌入" - ) - - return statement_embedding_maps, chunk_embedding_maps, dialog_embeddings - - except Exception as e: - logger.error(f"基础嵌入生成失败: {e}", exc_info=True) - # 返回空结果 - return ( - [{} for _ in dialog_data_list], - [{} for _ in dialog_data_list], - [[] for _ in dialog_data_list], - ) - - async def _generate_entity_embeddings( - self, triplet_maps: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: - """ - 生成实体嵌入向量 - - 在试运行模式下,跳过实体嵌入生成以节省时间 - - Args: - triplet_maps: 三元组映射列表 - - Returns: - 更新后的三元组映射列表(包含实体嵌入) - """ - # 试运行模式:跳过实体嵌入生成 - if self.is_pilot_run: - logger.info("试运行模式:跳过实体嵌入生成(节省约 5-8 秒)") - return triplet_maps - - logger.info("开始生成实体嵌入向量") - - try: - # embedding_id is required - no fallback to global variable - if not self.embedding_id: - logger.error("embedding_id is required but was not provided to ExtractionOrchestrator") - return triplet_maps - - # 生成实体嵌入 - updated_triplet_maps = await generate_entity_embeddings_from_triplets( - triplet_maps, self.embedding_id - ) - - logger.info("实体嵌入生成完成") - return updated_triplet_maps - - except Exception as e: - logger.error(f"实体嵌入生成失败: {e}", exc_info=True) - return triplet_maps - - async def _assign_extracted_data( - self, - dialog_data_list: List[DialogData], - temporal_maps: 
List[Dict[str, Any]], - triplet_maps: List[Dict[str, Any]], - emotion_maps: List[Dict[str, Any]], - statement_embedding_maps: List[Dict[str, List[float]]], - chunk_embedding_maps: List[Dict[str, List[float]]], - dialog_embeddings: List[List[float]], - ) -> List[DialogData]: - """ - 将提取的数据赋值到语句 - - Args: - dialog_data_list: 对话数据列表 - temporal_maps: 时间信息映射列表 - triplet_maps: 三元组映射列表 - emotion_maps: 情绪信息映射列表 - statement_embedding_maps: 陈述句嵌入映射列表 - chunk_embedding_maps: 分块嵌入映射列表 - dialog_embeddings: 对话嵌入列表 - - Returns: - 更新后的对话数据列表 - """ - logger.info("开始将提取数据赋值到语句") - - # 确保列表长度匹配 - expected_length = len(dialog_data_list) - if ( - len(temporal_maps) != expected_length - or len(triplet_maps) != expected_length - or len(emotion_maps) != expected_length - or len(statement_embedding_maps) != expected_length - or len(chunk_embedding_maps) != expected_length - or len(dialog_embeddings) != expected_length - ): - logger.warning( - f"数据大小不匹配 - 对话: {len(dialog_data_list)}, " - f"时间映射: {len(temporal_maps)}, 三元组映射: {len(triplet_maps)}, " - f"情绪映射: {len(emotion_maps)}, " - f"陈述句嵌入: {len(statement_embedding_maps)}, " - f"分块嵌入: {len(chunk_embedding_maps)}, " - f"对话嵌入: {len(dialog_embeddings)}" - ) - - total_statements = 0 - assigned_temporal = 0 - assigned_triplets = 0 - assigned_emotions = 0 - assigned_statement_embeddings = 0 - assigned_chunk_embeddings = 0 - assigned_dialog_embeddings = 0 - - # 处理每个对话 - for i, dialog_data in enumerate(dialog_data_list): - # 检查是否有缺失的数据 - if i >= len(temporal_maps) or i >= len(triplet_maps) or i >= len(emotion_maps): - logger.warning(f"对话 {dialog_data.id} 缺少提取数据,跳过赋值") - continue - - temporal_map = temporal_maps[i] - triplet_map = triplet_maps[i] - emotion_map = emotion_maps[i] - statement_embedding_map = statement_embedding_maps[i] if i < len(statement_embedding_maps) else {} - chunk_embedding_map = chunk_embedding_maps[i] if i < len(chunk_embedding_maps) else {} - dialog_embedding = dialog_embeddings[i] if i < len(dialog_embeddings) else [] - - # 
赋值对话嵌入 - if dialog_embedding: - dialog_data.dialog_embedding = dialog_embedding - assigned_dialog_embeddings += 1 - - # 处理每个分块 - for chunk in dialog_data.chunks: - # 赋值分块嵌入 - if chunk.id in chunk_embedding_map: - chunk.chunk_embedding = chunk_embedding_map[chunk.id] - assigned_chunk_embeddings += 1 - - # 处理每个陈述句 - for statement in chunk.statements: - total_statements += 1 - - # 赋值时间信息 - if statement.id in temporal_map: - statement.temporal_validity = temporal_map[statement.id] - assigned_temporal += 1 - - # 赋值三元组 - if statement.id in triplet_map: - statement.triplet_extraction_info = triplet_map[statement.id] - assigned_triplets += 1 - - # 赋值情绪信息 - if statement.id in emotion_map: - emotion_data = emotion_map[statement.id] - if emotion_data is not None: - # 将EmotionExtraction对象的字段赋值到Statement - statement.emotion_type = emotion_data.emotion_type - statement.emotion_intensity = emotion_data.emotion_intensity - statement.emotion_keywords = emotion_data.emotion_keywords - statement.emotion_subject = emotion_data.emotion_subject - statement.emotion_target = emotion_data.emotion_target - assigned_emotions += 1 - - # 赋值陈述句嵌入 - if statement.id in statement_embedding_map: - statement.statement_embedding = statement_embedding_map[statement.id] - assigned_statement_embeddings += 1 - - logger.info( - f"数据赋值完成 - 总陈述句: {total_statements}, " - f"时间信息: {assigned_temporal}, 三元组: {assigned_triplets}, " - f"情绪信息: {assigned_emotions}, " - f"陈述句嵌入: {assigned_statement_embeddings}, " - f"分块嵌入: {assigned_chunk_embeddings}, " - f"对话嵌入: {assigned_dialog_embeddings}" - ) - - return dialog_data_list - - async def _create_nodes_and_edges( - self, dialog_data_list: List[DialogData] - ) -> Tuple[ - List[DialogueNode], - List[ChunkNode], - List[StatementNode], - List[ExtractedEntityNode], - List[PerceptualNode], - List[StatementChunkEdge], - List[StatementEntityEdge], - List[EntityEntityEdge], - List[PerceptualEdge] - ]: - """ - 创建图数据库节点和边 - - 将对话数据转换为图数据库的节点和边结构 - - Args: - dialog_data_list: 
对话数据列表 - - Returns: - 包含所有节点和边的元组 - """ - logger.info("开始创建节点和边") - - # 注意:开始消息已在 run 方法中发送,这里不再重复发送 - - dialogue_nodes = [] - chunk_nodes = [] - statement_nodes = [] - entity_nodes = [] - statement_chunk_edges = [] - statement_entity_edges = [] - entity_entity_edges = [] - perceptual_nodes = [] - perceptual_edges = [] - - # 用于去重的集合 - entity_id_set = set() - - # 用于跟踪进度 - total_dialogs = len(dialog_data_list) - processed_dialogs = 0 - - for dialog_data in dialog_data_list: - processed_dialogs += 1 - # 创建对话节点 - dialogue_node = DialogueNode( - id=dialog_data.id, - name=f"Dialog_{dialog_data.id}", # 添加必需的 name 字段 - ref_id=dialog_data.ref_id, - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - content=dialog_data.context.content if dialog_data.context else "", - dialog_embedding=dialog_data.dialog_embedding if hasattr(dialog_data, 'dialog_embedding') else None, - created_at=dialog_data.created_at, - metadata=dialog_data.metadata, - config_id=dialog_data.config_id if hasattr(dialog_data, 'config_id') else None, - ) - dialogue_nodes.append(dialogue_node) - - # 处理每个分块 - for chunk_idx, chunk in enumerate(dialog_data.chunks): - # 创建分块节点 - chunk_node = ChunkNode( - id=chunk.id, - name=f"Chunk_{chunk.id}", # 添加必需的 name 字段 - dialog_id=dialog_data.id, - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - content=chunk.content, - speaker=getattr(chunk, 'speaker', None), - chunk_embedding=chunk.chunk_embedding, - sequence_number=chunk_idx, # 添加必需的 sequence_number 字段 - created_at=dialog_data.created_at, - metadata=chunk.metadata, - ) - chunk_nodes.append(chunk_node) - - for p, file_type in chunk.files: - - meta = p.meta_data or {} - content_meta = meta.get("content", {}) - - # 生成 summary embedding(如果有 embedder_client) - summary_embedding = None - if self.embedder_client and p.summary: - try: - summary_embedding = (await self.embedder_client.response([p.summary]))[0] - except Exception as emb_err: 
- print(f"Failed to embed perceptual summary: {emb_err}") - - perceptual = PerceptualNode( - name=f"Perceptual_{p.id}", - **{ - "id": str(p.id), - "end_user_id": str(p.end_user_id), - "perceptual_type": p.perceptual_type, - "file_path": p.file_path or "", - "file_name": p.file_name or "", - "file_ext": p.file_ext or "", - "summary": p.summary or "", - "keywords": content_meta.get("keywords", []), - "topic": content_meta.get("topic", ""), - "domain": content_meta.get("domain", ""), - "created_at": p.created_time.isoformat() if p.created_time else None, - "file_type": file_type, - "summary_embedding": summary_embedding, - }) - perceptual_nodes.append(perceptual) - perceptual_edges.append(PerceptualEdge( - source=perceptual.id, - target=chunk.id, - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, - created_at=dialog_data.created_at, - )) - - # 处理每个陈述句 - for statement in chunk.statements: - # 创建陈述句节点 - statement_node = StatementNode( - id=statement.id, - name=f"Statement_{statement.id}", # 添加必需的 name 字段 - chunk_id=chunk.id, - stmt_type=getattr(statement, 'stmt_type', 'general'), # 添加必需的 stmt_type 字段 - temporal_info=getattr(statement, 'temporal_info', TemporalInfo.ATEMPORAL), - # 添加必需的 temporal_info 字段 - connect_strength=statement.connect_strength if statement.connect_strength is not None else 'Strong', - # 添加必需的 connect_strength 字段 - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - statement=statement.statement, - speaker=getattr(statement, 'speaker', None), # 添加 speaker 字段 - statement_embedding=statement.statement_embedding, - valid_at=statement.temporal_validity.valid_at if hasattr(statement, - 'temporal_validity') and statement.temporal_validity else None, - invalid_at=statement.temporal_validity.invalid_at if hasattr(statement, - 'temporal_validity') and statement.temporal_validity else None, - created_at=dialog_data.created_at, - config_id=dialog_data.config_id if hasattr(dialog_data, 'config_id') else 
None, - # Emotion fields - emotion_type=getattr(statement, 'emotion_type', None), - emotion_intensity=getattr(statement, 'emotion_intensity', None), - emotion_keywords=getattr(statement, 'emotion_keywords', None), - emotion_subject=getattr(statement, 'emotion_subject', None), - emotion_target=getattr(statement, 'emotion_target', None), - ) - statement_nodes.append(statement_node) - - # 创建陈述句-分块边 - statement_chunk_edge = StatementChunkEdge( - source=statement.id, - target=chunk.id, - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - created_at=dialog_data.created_at, - ) - statement_chunk_edges.append(statement_chunk_edge) - - # 处理三元组信息 - if statement.triplet_extraction_info: - triplet_info = statement.triplet_extraction_info - - # 创建实体索引到ID的映射(支持多种索引方式) - entity_idx_to_id = {} - - # 创建实体节点 - for entity_idx, entity in enumerate(triplet_info.entities): - # 映射实体索引到实体ID(使用多个键以提高容错性) - # 1. 使用实体自己的 entity_idx - entity_idx_to_id[entity.entity_idx] = entity.id - # 2. 
使用枚举索引(从0开始) - entity_idx_to_id[entity_idx] = entity.id - - if entity.id not in entity_id_set: - entity_connect_strength = getattr(entity, 'connect_strength', 'Strong') - entity_node = ExtractedEntityNode( - id=entity.id, - name=getattr(entity, 'name', f"Entity_{entity.id}"), # 使用 name 而不是 entity_name - entity_idx=entity.entity_idx, # 使用实体自己的 entity_idx - statement_id=statement.id, # 添加必需的 statement_id 字段 - entity_type=getattr(entity, 'type', 'unknown'), # 使用 type 而不是 entity_type - type_description=getattr(entity, 'type_description', ''), - description=getattr(entity, 'description', ''), # 添加必需的 description 字段 - example=getattr(entity, 'example', ''), # 新增:传递示例字段 - # TODO: fact_summary 功能暂时禁用,待后续开发完善后启用 - # fact_summary=getattr(entity, 'fact_summary', ''), # 添加必需的 fact_summary 字段 - connect_strength=entity_connect_strength if entity_connect_strength is not None else 'Strong', - # 添加必需的 connect_strength 字段 - aliases=getattr(entity, 'aliases', []) or [], # 传递从三元组提取阶段获取的aliases - name_embedding=getattr(entity, 'name_embedding', None), - is_explicit_memory=getattr(entity, 'is_explicit_memory', False), # 新增:传递语义记忆标记 - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - created_at=dialog_data.created_at, - config_id=dialog_data.config_id if hasattr(dialog_data, 'config_id') else None, - ) - entity_nodes.append(entity_node) - entity_id_set.add(entity.id) - - # 创建陈述句-实体边 - entity_connect_strength = getattr(entity, 'connect_strength', 'Strong') - statement_entity_edge = StatementEntityEdge( - source=statement.id, - target=entity.id, - connect_strength=entity_connect_strength if entity_connect_strength is not None else 'Strong', - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - created_at=dialog_data.created_at, - ) - statement_entity_edges.append(statement_entity_edge) - - # 创建实体-实体边(从三元组) - for triplet in triplet_info.triplets: - # 将三元组中的整数索引映射到实体ID - subject_entity_id = 
entity_idx_to_id.get(triplet.subject_id) - object_entity_id = entity_idx_to_id.get(triplet.object_id) - - # 只有当两个实体ID都存在时才创建边 - if subject_entity_id and object_entity_id: - _tv = getattr(statement, "temporal_validity", None) - entity_entity_edge = EntityEntityEdge( - source=subject_entity_id, - target=object_entity_id, - relation_type=triplet.predicate, - relation_type_description=getattr(triplet, 'predicate_description', ''), - statement=statement.statement, - source_statement_id=statement.id, - end_user_id=dialog_data.end_user_id, - run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id - created_at=dialog_data.created_at, - valid_at=_tv.valid_at if _tv else None, - invalid_at=_tv.invalid_at if _tv else None, - ) - entity_entity_edges.append(entity_entity_edge) - - # 流式输出:每创建一个关系边,立即发送进度(限制发送数量) - if self.progress_callback and len(entity_entity_edges) <= 10: - # 获取实体名称 - source_name = triplet.subject_name - target_name = triplet.object_name - relationship_result = { - "result_type": "relationship_creation", - "relationship_index": len(entity_entity_edges), - "source_entity": source_name, - "relation_type": triplet.predicate, - "target_entity": target_name, - "relationship_text": f"{source_name} -[{triplet.predicate}]-> {target_name}", - "dialog_progress": f"{processed_dialogs}/{total_dialogs}" - } - await self.progress_callback( - "creating_nodes_edges_result", - f"关系创建中 ({processed_dialogs}/{total_dialogs})", - relationship_result - ) - else: - # 改进的警告信息,包含更多调试信息 - missing_subject = "subject" if not subject_entity_id else "" - missing_object = "object" if not object_entity_id else "" - missing_both = " and " if (not subject_entity_id and not object_entity_id) else "" - - logger.debug( - f"跳过三元组 - 无法找到{missing_subject}{missing_both}{missing_object}实体ID: " - f"subject_id={triplet.subject_id} ({triplet.subject_name}), " - f"object_id={triplet.object_id} ({triplet.object_name}), " - f"predicate={triplet.predicate}, " - f"statement_id={statement.id}, " - 
f"available_indices={sorted(entity_idx_to_id.keys())}" - ) - - logger.info( - f"节点和边创建完成 - 对话节点: {len(dialogue_nodes)}, " - f"分块节点: {len(chunk_nodes)}, 陈述句节点: {len(statement_nodes)}, " - f"实体节点: {len(entity_nodes)}, 陈述句-分块边: {len(statement_chunk_edges)}, " - f"陈述句-实体边: {len(statement_entity_edges)}, " - f"实体-实体边: {len(entity_entity_edges)}" - ) - - # 进度回调:创建节点和边完成,传递结果统计 - # 注意:具体的关系创建结果已经在创建过程中实时发送了 - if self.progress_callback: - nodes_edges_stats = { - "dialogue_nodes_count": len(dialogue_nodes), - "chunk_nodes_count": len(chunk_nodes), - "statement_nodes_count": len(statement_nodes), - "entity_nodes_count": len(entity_nodes), - "statement_chunk_edges_count": len(statement_chunk_edges), - "statement_entity_edges_count": len(statement_entity_edges), - "entity_entity_edges_count": len(entity_entity_edges), - } - await self.progress_callback("creating_nodes_edges_complete", "创建节点和边完成", nodes_edges_stats) - - return ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - perceptual_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - perceptual_edges - ) - - async def _update_end_user_other_name( - self, - entity_nodes: List[ExtractedEntityNode], - dialog_data_list: List[DialogData], - ) -> None: - """ - 将本轮提取的用户别名同步到 end_user 和 end_user_info 表。 - - PgSQL end_user_info.aliases 是用户别名的唯一权威源。 - 此方法仅将本轮 LLM 从对话中新提取的别名增量追加到 PgSQL, - 不再从 Neo4j 二层去重合并历史别名,避免脏数据反向污染 PgSQL。 - - 策略: - 1. 从本轮对话原始发言中提取用户别名(current_aliases) - 2. 从 PgSQL end_user_info 读取已有的 aliases(db_aliases) - 3. 合并 db_aliases + current_aliases,去重保序 - 4. 写回 PgSQL - - Args: - entity_nodes: 去重后的实体节点列表(内存中) - dialog_data_list: 对话数据列表 - """ - try: - if not dialog_data_list: - logger.warning("dialog_data_list 为空,跳过用户别名同步") - return - - end_user_id = dialog_data_list[0].end_user_id - if not end_user_id: - logger.warning("end_user_id 为空,跳过用户别名同步") - return - - # 1. 
提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序) - current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list) - - # 1.6 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源 - # (防止 LLM 未提取出 AI 助手实体时,AI 别名泄漏到用户别名中) - neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id) - if neo4j_assistant_aliases: - before_count = len(current_aliases) - current_aliases = [ - a for a in current_aliases - if a.strip().lower() not in neo4j_assistant_aliases - ] - if len(current_aliases) < before_count: - logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名") - - if not current_aliases: - logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}") - return - - logger.info(f"本轮对话提取的 aliases: {current_aliases}") - - # 2. 同步到数据库 - end_user_uuid = uuid.UUID(end_user_id) - with get_db_context() as db: - # 更新 end_user 表 - end_user = EndUserRepository(db).get_by_id(end_user_uuid) - if not end_user: - logger.warning(f"未找到 end_user_id={end_user_id} 的用户记录") - return - - # 3. 从 PgSQL 读取已有 aliases 并与本轮新增合并 - info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid) - db_aliases = (info.aliases if info and info.aliases else []) - # 过滤掉占位名称 - db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES] - - # 合并:PgSQL 已有 + 本轮新增,去重保序(不再合并 Neo4j 历史别名) - merged_aliases = list(db_aliases) - seen_lower = {a.strip().lower() for a in merged_aliases} - for alias in current_aliases: - if alias.strip().lower() not in seen_lower: - merged_aliases.append(alias) - seen_lower.add(alias.strip().lower()) - - # 最终过滤:从合并结果中排除 AI 助手别名(清理历史脏数据) - if neo4j_assistant_aliases: - merged_aliases = [ - a for a in merged_aliases - if a.strip().lower() not in neo4j_assistant_aliases - ] - - logger.info(f"PgSQL 已有 aliases: {db_aliases}") - logger.info(f"合并后 aliases: {merged_aliases}") - - # 更新 end_user 表 other_name - new_name = self._resolve_other_name(end_user.other_name, current_aliases, merged_aliases) - if new_name is not None: - end_user.other_name 
= new_name - logger.info(f"更新 end_user 表 other_name → {new_name}") - else: - logger.debug(f"end_user 表 other_name 保持不变: {end_user.other_name}") - - # 更新或创建 end_user_info 记录 - if info: - new_name_info = self._resolve_other_name(info.other_name, current_aliases, merged_aliases) - if new_name_info is not None: - info.other_name = new_name_info - logger.info(f"更新 end_user_info 表 other_name → {new_name_info}") - if info.aliases != merged_aliases: - info.aliases = merged_aliases - logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}") - else: - first_alias = current_aliases[0].strip() if current_aliases else "" - # 确保 first_alias 不是占位名称 - if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES: - db.add(EndUserInfo( - end_user_id=end_user_uuid, - other_name=first_alias, - aliases=merged_aliases, - )) - logger.info(f"创建 end_user_info 记录,other_name={first_alias}, aliases={merged_aliases}") - - db.commit() - - except Exception as e: - logger.error(f"更新 end_user other_name 失败: {e}", exc_info=True) - # 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中 - # 复用 deduped_and_disamb 模块级常量,避免重复维护 - USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES - - def _extract_current_aliases(self, entity_nodes: List[ExtractedEntityNode], dialog_data_list=None) -> List[str]: - """从用户发言的原始实体中提取本轮新增别名(绕过去重污染) - - 策略: - 仅从 dialog_data_list 中找到 speaker="user" 的 statement, - 从这些 statement 的 triplet_extraction_info 中提取用户实体的 aliases。 - 这样拿到的是 LLM 对用户原话的提取结果,不受去重合并的影响。 - - 注意:不再使用去重后 entity_nodes 作为兜底,因为二层去重会将 Neo4j 历史别名 - 合并进来,导致历史别名被误认为"本轮提取"。历史别名的同步由 - _extract_deduped_entity_aliases 负责。 - - Args: - entity_nodes: 去重后的实体节点列表(未使用,保留参数兼容性) - dialog_data_list: 对话数据列表 - - Returns: - 别名列表(保持原始顺序,已过滤) - """ - if not dialog_data_list: - return [] - - all_user_aliases = [] - seen_lower = set() - for dialog in dialog_data_list: - for chunk in dialog.chunks: - speaker = getattr(chunk, 'speaker', None) - for statement in chunk.statements: - stmt_speaker = getattr(statement, 'speaker', None) or 
speaker - if stmt_speaker != "user": - continue - triplet_info = getattr(statement, 'triplet_extraction_info', None) - if not triplet_info: - continue - for entity in (triplet_info.entities or []): - ent_name = getattr(entity, 'name', '').strip() - if ent_name.lower() in self.USER_PLACEHOLDER_NAMES: - for alias in (getattr(entity, 'aliases', []) or []): - a = alias.strip() - if a and a.lower() not in self.USER_PLACEHOLDER_NAMES and a.lower() not in seen_lower: - all_user_aliases.append(a) - seen_lower.add(a.lower()) - if all_user_aliases: - logger.debug(f"从用户原始发言提取到别名: {all_user_aliases}") - return all_user_aliases - - def _extract_deduped_entity_aliases(self, entity_nodes: List[ExtractedEntityNode]) -> List[str]: - """从去重后的用户实体中提取完整别名列表。 - - 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 的用户实体中, - 因此这里提取到的别名包含了历史积累的所有别名,可用于同步到 PgSQL。 - - Args: - entity_nodes: 去重后的实体节点列表(含二层去重合并结果) - - Returns: - 别名列表(已过滤占位名称,去重保序) - """ - for entity in entity_nodes: - if getattr(entity, 'name', '').strip().lower() in self.USER_PLACEHOLDER_NAMES: - aliases = getattr(entity, 'aliases', []) or [] - filtered = [ - a for a in aliases - if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES - ] - if filtered: - return filtered - return [] - - async def _fetch_neo4j_assistant_aliases(self, end_user_id: str) -> set: - """从 Neo4j 查询 AI 助手实体的所有别名(用于从用户别名中排除)""" - return await fetch_neo4j_assistant_aliases(self.connector, end_user_id) - - def _resolve_other_name( - self, - current: Optional[str], - current_aliases: List[str], - neo4j_aliases: List[str] - ) -> Optional[str]: - """ - 决定 other_name 是否需要更新,返回新值;无需更新返回 None。 - - 决策规则: - - 为空或为占位名称 → 用本次对话第一个别名 - - 不在 Neo4j aliases 中 → 用 Neo4j 第一个别名(说明已被删除) - - 否则 → 保持不变(返回 None) - - 注意:返回值不允许是占位名称("用户"、"我"、"User"、"I") - """ - # 当前值为空或为占位名称时,需要更新 - if not current or not current.strip() or current.strip().lower() in self.USER_PLACEHOLDER_NAMES: - candidate = current_aliases[0].strip() if current_aliases else None - # 确保候选值不是占位名称 - if candidate and 
candidate.lower() in self.USER_PLACEHOLDER_NAMES: - return None - return candidate - if current not in neo4j_aliases: - candidate = neo4j_aliases[0].strip() if neo4j_aliases else None - # 确保候选值不是占位名称 - if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES: - return None - return candidate - return None - - async def _run_dedup_and_write_summary( - self, - dialogue_nodes: List[DialogueNode], - chunk_nodes: List[ChunkNode], - statement_nodes: List[StatementNode], - entity_nodes: List[ExtractedEntityNode], - statement_chunk_edges: List[StatementChunkEdge], - statement_entity_edges: List[StatementEntityEdge], - entity_entity_edges: List[EntityEntityEdge], - dialog_data_list: List[DialogData], - ) -> tuple[ - list[DialogueNode], - list[ChunkNode], - list[StatementNode], - list[ExtractedEntityNode], - list[StatementChunkEdge], - list[StatementEntityEdge], - list[EntityEntityEdge], - list[DialogData], - dict - ]: - """ - 执行两阶段去重并写入汇总 - - Args: - dialogue_nodes: 对话节点列表 - chunk_nodes: 分块节点列表 - statement_nodes: 陈述句节点列表 - entity_nodes: 实体节点列表 - statement_chunk_edges: 陈述句-分块边列表 - statement_entity_edges: 陈述句-实体边列表 - entity_entity_edges: 实体-实体边列表 - dialog_data_list: 对话数据列表 - - Returns: - 包含三个元组的元组: - - 第一个元组:(对话节点列表, 分块节点列表, 陈述句节点列表) - - 第二个元组:去重前的 (实体节点列表, 陈述句-实体边列表, 实体-实体边列表) - - 第三个元组:去重后的 (实体节点列表, 陈述句-实体边列表, 实体-实体边列表) - """ - logger.info("开始两阶段实体去重和消歧") - - # 进度回调:发送去重消歧开始消息 - if self.progress_callback: - await self.progress_callback("deduplication", "正在去重消歧...") - - logger.info( - f"去重前: {len(entity_nodes)} 个实体节点, " - f"{len(statement_entity_edges)} 条陈述句-实体边, " - f"{len(entity_entity_edges)} 条实体-实体边" - ) - - try: - # 在试运行模式下,跳过第二层去重(不查询数据库) - if self.is_pilot_run: - logger.info("试运行模式:仅执行第一层去重,跳过第二层数据库去重") - # 只执行第一层去重 - from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import ( - deduplicate_entities_and_edges, - ) - - dedup_entity_nodes, dedup_statement_entity_edges, dedup_entity_entity_edges, dedup_details = await 
deduplicate_entities_and_edges( - entity_nodes, - statement_entity_edges, - entity_entity_edges, - report_stage="第一层去重消歧(试运行)", - report_append=False, - dedup_config=self.config.deduplication, - llm_client=self.llm_client, - ) - - # 保存去重消歧的详细记录到实例变量 - self._save_dedup_details(dedup_details, entity_nodes, dedup_entity_nodes) - - result_tuple = ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - dedup_entity_nodes, - statement_chunk_edges, - dedup_statement_entity_edges, - dedup_entity_entity_edges, - dialog_data_list, - dedup_details, - ) - - final_entity_nodes = dedup_entity_nodes - final_statement_entity_edges = dedup_statement_entity_edges - final_entity_entity_edges = dedup_entity_entity_edges - else: - # 正式模式:执行完整的两阶段去重 - ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - final_entity_nodes, - statement_chunk_edges, - final_statement_entity_edges, - final_entity_entity_edges, - dedup_details, - ) = await dedup_layers_and_merge_and_return( - dialogue_nodes, - chunk_nodes, - statement_nodes, - entity_nodes, - statement_chunk_edges, - statement_entity_edges, - entity_entity_edges, - dialog_data_list, - self.config, - self.connector, - llm_client=self.llm_client, - ) - - # 保存去重消歧的详细记录到实例变量 - self._save_dedup_details(dedup_details, entity_nodes, final_entity_nodes) - - result_tuple = ( - dialogue_nodes, - chunk_nodes, - statement_nodes, - final_entity_nodes, - statement_chunk_edges, - final_statement_entity_edges, - final_entity_entity_edges, - dialog_data_list, - dedup_details, - ) - - logger.info( - f"去重后: {len(final_entity_nodes)} 个实体节点, " - f"{len(final_statement_entity_edges)} 条陈述句-实体边, " - f"{len(final_entity_entity_edges)} 条实体-实体边" - ) - logger.info( - f"去重效果: 实体减少 {len(entity_nodes) - len(final_entity_nodes)}, " - f"陈述句-实体边减少 {len(statement_entity_edges) - len(final_statement_entity_edges)}, " - f"实体-实体边减少 {len(entity_entity_edges) - len(final_entity_entity_edges)}" - ) - - # 流式输出:实时输出去重消歧的具体结果 - if self.progress_callback: - # 分析实体合并情况(使用内存中的记录) - 
merge_info = await self._analyze_entity_merges(entity_nodes, final_entity_nodes) - - # 逐个输出去重合并的实体示例 - for i, merge_detail in enumerate(merge_info[:5]): # 输出前5个去重结果 - dedup_result = { - "result_type": "entity_merge", - "merged_entity_name": merge_detail["main_entity_name"], - "merged_count": merge_detail["merged_count"], - "merge_progress": f"{i + 1}/{min(len(merge_info), 5)}", - "message": f"{merge_detail['main_entity_name']}合并{merge_detail['merged_count']}个:相似实体已合并" - } - await self.progress_callback("dedup_disambiguation_result", "实体去重中", dedup_result) - - # 分析实体消歧情况(使用内存中的记录) - disamb_info = await self._analyze_entity_disambiguation(entity_nodes, final_entity_nodes) - - # 逐个输出实体消歧的结果 - for i, disamb_detail in enumerate(disamb_info[:5]): # 输出前5个消歧结果 - disamb_result = { - "result_type": "entity_disambiguation", - "disambiguated_entity_name": disamb_detail["entity_name"], - "disambiguation_type": disamb_detail["disamb_type"], - "confidence": disamb_detail.get("confidence", "unknown"), - "reason": disamb_detail.get("reason", ""), - "disamb_progress": f"{i + 1}/{min(len(disamb_info), 5)}", - "message": f"{disamb_detail['entity_name']}消歧完成:{disamb_detail['disamb_type']}" - } - await self.progress_callback("dedup_disambiguation_result", "实体消歧中", disamb_result) - - # 进度回调:去重消歧完成,传递去重和消歧的具体效果 - await self._send_dedup_progress_callback( - len(entity_nodes), len(final_entity_nodes), - len(statement_entity_edges), len(final_statement_entity_edges), - len(entity_entity_edges), len(final_entity_entity_edges) - ) - - # 写入提取结果汇总(试运行和正式模式都需要生成) - try: - from app.core.config import settings - settings.ensure_memory_output_dir() - _write_extracted_result_summary( - chunk_nodes=chunk_nodes, - pipeline_output_dir=settings.MEMORY_OUTPUT_DIR, - ) - mode_str = "试运行" if self.is_pilot_run else "正式" - logger.info(f"提取结果汇总已写入({mode_str}模式)") - except Exception as e: - logger.warning(f"写入提取结果汇总失败: {e}") - - return result_tuple - - except Exception as e: - logger.error(f"两阶段去重失败: {e}", 
exc_info=True) - raise - - def _save_dedup_details( - self, - dedup_details: Dict[str, Any], - original_entities: List[ExtractedEntityNode], - final_entities: List[ExtractedEntityNode] - ): - """ - 保存去重消歧的详细记录到实例变量(基于内存数据结构) - - Args: - dedup_details: 去重函数返回的详细记录 - original_entities: 去重前的实体列表 - final_entities: 去重后的实体列表 - """ - try: - # 保存ID重定向映射 - self.id_redirect_map = dedup_details.get("id_redirect", {}) - - # 处理精确匹配的合并记录 - exact_merge_map = dedup_details.get("exact_merge_map", {}) - for key, info in exact_merge_map.items(): - merged_ids = info.get("merged_ids", set()) - if merged_ids: - self.dedup_merge_records.append({ - "type": "精确匹配", - "canonical_id": info.get("canonical_id"), - "entity_name": info.get("name"), - "entity_type": info.get("entity_type"), - "merged_count": len(merged_ids), - "merged_ids": list(merged_ids) - }) - - # 处理模糊匹配的合并记录 - fuzzy_merge_records = dedup_details.get("fuzzy_merge_records", []) - for record in fuzzy_merge_records: - # 解析模糊匹配记录字符串 - # 格式: "[模糊] 规范实体 id (group|name|type) <- 合并实体 id (group|name|type) | s_name=0.xxx, ..." - try: - import re - match = re.search(r"规范实体 (\S+) \(([^|]+)\|([^|]+)\|([^)]+)\) <- 合并实体 (\S+)", record) - if match: - self.dedup_merge_records.append({ - "type": "模糊匹配", - "canonical_id": match.group(1), - "entity_name": match.group(3), - "entity_type": match.group(4), - "merged_count": 1, - "merged_ids": [match.group(5)] - }) - except Exception as e: - logger.debug(f"解析模糊匹配记录失败: {record}, 错误: {e}") - - # 处理LLM去重的合并记录 - llm_decision_records = dedup_details.get("llm_decision_records", []) - for record in llm_decision_records: - if "[LLM去重]" in str(record): - try: - import re - # 格式: "[LLM去重] 同名类型相似 name1(type1)|name2(type2) | conf=0.xx | reason=..." 
- match = re.search(r"同名类型相似 ([^(]+)(([^)]+))\|([^(]+)(([^)]+))", record) - if match: - self.dedup_merge_records.append({ - "type": "LLM去重", - "entity_name": match.group(1), - "entity_type": f"{match.group(2)}|{match.group(4)}", - "merged_count": 1, - "merged_ids": [] - }) - except Exception as e: - logger.debug(f"解析LLM去重记录失败: {record}, 错误: {e}") - - # 处理消歧记录 - disamb_records = dedup_details.get("disamb_records", []) - for record in disamb_records: - if "[DISAMB阻断]" in str(record): - try: - import re - # 格式: "[DISAMB阻断] name1(type1)|name2(type2) | conf=0.xx | reason=..." - content = str(record).replace("[DISAMB阻断]", "").strip() - match = re.search(r"([^(]+)(([^)]+))\|([^(]+)(([^)]+))", content) - if match: - entity1_name = match.group(1).strip() - entity1_type = match.group(2) - match.group(3).strip() - entity2_type = match.group(4) - - # 提取置信度和原因 - conf_match = re.search(r"conf=([0-9.]+)", str(record)) - confidence = conf_match.group(1) if conf_match else "unknown" - - reason_match = re.search(r"reason=([^|]+)", str(record)) - reason = reason_match.group(1).strip() if reason_match else "" - - self.dedup_disamb_records.append({ - "entity_name": entity1_name, - "disamb_type": f"消歧阻断:{entity1_type} vs {entity2_type}", - "confidence": confidence, - "reason": reason[:100] + "..." 
if len(reason) > 100 else reason - }) - except Exception as e: - logger.debug(f"解析消歧记录失败: {record}, 错误: {e}") - - logger.info( - f"保存去重消歧记录:{len(self.dedup_merge_records)} 个合并记录,{len(self.dedup_disamb_records)} 个消歧记录") - - except Exception as e: - logger.error(f"保存去重消歧详情失败: {e}", exc_info=True) - - async def _analyze_entity_merges( - self, - original_entities: List[ExtractedEntityNode], - final_entities: List[ExtractedEntityNode] - ) -> List[Dict[str, Any]]: - """ - 分析实体合并情况,直接使用内存中的合并记录(不再解析日志文件) - - Args: - original_entities: 去重前的实体列表 - final_entities: 去重后的实体列表 - - Returns: - 合并详情列表,每个元素包含主实体名称和合并数量 - """ - try: - # 直接使用保存的合并记录 - if self.dedup_merge_records: - # 按合并数量排序,返回前几个 - sorted_records = sorted( - self.dedup_merge_records, - key=lambda x: x.get("merged_count", 0), - reverse=True - ) - - merge_info = [] - for record in sorted_records: - merge_info.append({ - "main_entity_name": record.get("entity_name", "未知实体"), - "merged_count": record.get("merged_count", 1) - }) - - return merge_info - - # 如果没有保存的记录,返回空列表 - logger.info("未找到实体合并记录") - return [] - - except Exception as e: - logger.error(f"分析实体合并情况失败: {e}", exc_info=True) - return [] - - async def _analyze_entity_disambiguation( - self, - original_entities: List[ExtractedEntityNode], - final_entities: List[ExtractedEntityNode] - ) -> List[Dict[str, Any]]: - """ - 分析实体消歧情况,直接使用内存中的消歧记录(不再解析日志文件) - - Args: - original_entities: 去重前的实体列表 - final_entities: 去重后的实体列表 - - Returns: - 消歧详情列表,每个元素包含实体名称和消歧类型 - """ - try: - # 直接使用保存的消歧记录 - if self.dedup_disamb_records: - return self.dedup_disamb_records - - # 如果没有保存的记录,返回空列表 - logger.info("未找到实体消歧记录") - return [] - - except Exception as e: - logger.error(f"分析实体消歧情况失败: {e}", exc_info=True) - return [] - - def _get_entity_type_display_name(self, entity_type: str) -> str: - """ - 获取实体类型的中文显示名称 - - Args: - entity_type: 英文实体类型 - - Returns: - 中文显示名称 - """ - type_mapping = { - "Person": "人物实体节点", - "Organization": "组织实体节点", - "ORG": "组织实体节点", - "Location": "地点实体节点", - "LOC": 
"地点实体节点", - "Event": "事件实体节点", - "Concept": "概念实体节点", - "Time": "时间实体节点", - "Position": "职位实体节点", - "WorkRole": "职业实体节点", - "System": "系统实体节点", - "Policy": "政策实体节点", - "HistoricalPeriod": "历史时期实体节点", - "HistoricalState": "历史国家实体节点", - "HistoricalEvent": "历史事件实体节点", - "EconomicFactor": "经济因素实体节点", - "Condition": "条件实体节点", - "Numeric": "数值实体节点" - } - return type_mapping.get(entity_type, f"{entity_type}实体节点") - - async def _output_relationship_creation_results( - self, - entity_entity_edges: List[EntityEntityEdge], - entity_nodes: List[ExtractedEntityNode] - ): - """ - 输出关系创建结果 - - Args: - entity_entity_edges: 实体-实体边列表 - entity_nodes: 实体节点列表 - """ - try: - # 创建实体ID到名称的映射 - entity_id_to_name = {node.id: node.name for node in entity_nodes} - - # 输出关系创建结果 - for i, edge in enumerate(entity_entity_edges[:10]): # 只输出前10个关系 - source_name = entity_id_to_name.get(edge.source, f"Entity_{edge.source}") - target_name = entity_id_to_name.get(edge.target, f"Entity_{edge.target}") - relation_type = edge.relation_type - - relationship_result = { - "result_type": "relationship_creation", - "relationship_index": i + 1, - "source_entity": source_name, - "relation_type": relation_type, - "target_entity": target_name, - "relationship_text": f"{source_name} -[{relation_type}]-> {target_name}" - } - - await self.progress_callback("creating_nodes_edges_result", "关系创建", relationship_result) - - except Exception as e: - logger.error(f"输出关系创建结果失败: {e}", exc_info=True) - - async def _send_dedup_progress_callback( - self, - original_entities: int, - final_entities: int, - original_stmt_edges: int, - final_stmt_edges: int, - original_ent_edges: int, - final_ent_edges: int, - ): - """ - 发送去重消歧完成的进度回调,传递具体的去重和消歧效果 - - Args: - original_entities: 去重前实体数量 - final_entities: 去重后实体数量 - original_stmt_edges: 去重前陈述句-实体边数量 - final_stmt_edges: 去重后陈述句-实体边数量 - original_ent_edges: 去重前实体-实体边数量 - final_ent_edges: 去重后实体-实体边数量 - """ - try: - # 解析去重消歧报告文件,获取具体的去重和消歧效果 - dedup_details = await self._parse_dedup_report() 
- - # 计算去重效果统计 - entities_reduced = original_entities - final_entities - stmt_edges_reduced = original_stmt_edges - final_stmt_edges - ent_edges_reduced = original_ent_edges - final_ent_edges - - # 构建进度回调数据 - dedup_stats = { - "entities": { - "original_count": original_entities, - "final_count": final_entities, - "reduced_count": entities_reduced, - "reduction_rate": round(entities_reduced / original_entities * 100, - 1) if original_entities > 0 else 0, - }, - "statement_entity_edges": { - "original_count": original_stmt_edges, - "final_count": final_stmt_edges, - "reduced_count": stmt_edges_reduced, - }, - "entity_entity_edges": { - "original_count": original_ent_edges, - "final_count": final_ent_edges, - "reduced_count": ent_edges_reduced, - }, - "dedup_examples": dedup_details.get("dedup_examples", []), - "disamb_examples": dedup_details.get("disamb_examples", []), - "summary": { - "total_merges": dedup_details.get("total_merges", 0), - "total_disambiguations": dedup_details.get("total_disambiguations", 0), - } - } - - await self.progress_callback("dedup_disambiguation_complete", "去重消歧完成", dedup_stats) - - except Exception as e: - logger.error(f"发送去重消歧进度回调失败: {e}", exc_info=True) - # 即使解析失败,也发送基本的统计信息 - try: - basic_stats = { - "entities": { - "original_count": original_entities, - "final_count": final_entities, - "reduced_count": original_entities - final_entities, - }, - "summary": f"实体去重合并{original_entities - final_entities}个" - } - await self.progress_callback("dedup_disambiguation_complete", "去重消歧完成", basic_stats) - except Exception as e2: - logger.error(f"发送基本去重统计失败: {e2}", exc_info=True) - - async def _parse_dedup_report(self) -> Dict[str, Any]: - """ - 获取去重消歧报告,直接使用内存中的记录(不再解析日志文件) - - Returns: - 包含去重和消歧详细信息的字典 - """ - try: - # 直接使用保存的记录构建报告 - dedup_examples = [] - disamb_examples = [] - total_merges = 0 - total_disambiguations = 0 - - # 处理合并记录 - for record in self.dedup_merge_records: - merge_count = record.get("merged_count", 0) - total_merges += 
merge_count - - dedup_examples.append({ - "type": record.get("type", "未知"), - "entity_name": record.get("entity_name", "未知实体"), - "entity_type": record.get("entity_type", "未知类型"), - "merge_count": merge_count, - "description": f"{record.get('entity_name', '未知实体')}实体去重合并{merge_count}个" - }) - - # 处理消歧记录 - for record in self.dedup_disamb_records: - total_disambiguations += 1 - - # 从消歧类型中提取实体类型信息 - disamb_type = record.get("disamb_type", "") - entity_name = record.get("entity_name", "未知实体") - - disamb_examples.append({ - "entity1_name": entity_name, - "entity1_type": disamb_type.split("vs")[0].replace("消歧阻断:", - "").strip() if "vs" in disamb_type else "未知", - "entity2_name": entity_name, - "entity2_type": disamb_type.split("vs")[1].strip() if "vs" in disamb_type else "未知", - "description": f"{entity_name},消歧区分成功" - }) - - return { - "dedup_examples": dedup_examples[:5], # 只返回前5个示例 - "disamb_examples": disamb_examples[:5], # 只返回前5个示例 - "total_merges": total_merges, - "total_disambiguations": total_disambiguations, - } - - except Exception as e: - logger.error(f"获取去重报告失败: {e}", exc_info=True) - return {"dedup_examples": [], "disamb_examples": [], "total_merges": 0, "total_disambiguations": 0} - - -# ============================================================================ -# 数据加载和预处理函数 -# ============================================================================ -# 以下函数从 extraction_pipeline.py 迁移而来,用于数据加载和预处理 - - -async def get_chunked_dialogs( - chunker_strategy: str = "RecursiveChunker", - end_user_id: str = "group_1", - indices: Optional[List[int]] = None, -) -> List[DialogData]: - """从测试数据生成分块对话 - - Args: - chunker_strategy: 分块策略(默认: RecursiveChunker) - end_user_id: 组ID - indices: 要处理的数据索引列表(可选) - - Returns: - 包含分块的 DialogData 对象列表 - """ - import json - import re - - # 加载测试数据 - testdata_path = os.path.join(os.path.dirname(__file__), "../../data", "testdata.json") - with open(testdata_path, "r", encoding="utf-8") as f: - test_data = [json.loads(line) for line in 
f] - - dialog_data_list = [] - - if indices is not None: - # 选择特定索引 - selected_data = [test_data[i] for i in indices if 0 <= i < len(test_data)] - else: - # 默认使用所有数据 - selected_data = test_data - - for data in selected_data: - # 解析对话上下文 - context_text = data["context"] - - # 从context文本中解析日期 - conv_date: Optional[str] = None - m = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日", context_text) - if m: - y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3)) - conv_date = f"{y:04d}-{mo:02d}-{d:02d}" - else: - m = re.search(r"(\d{4})[-/](\d{1,2})[-/](\d{1,2})", context_text) - if m: - y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3)) - conv_date = f"{y:04d}-{mo:02d}-{d:02d}" - - dialog_metadata: Dict[str, Any] = {} - if conv_date: - dialog_metadata["conversation_date"] = conv_date - dialog_metadata["publication_date"] = conv_date - - # 分割对话为消息 - lines = context_text.split("\n") - messages = [] - - # 解析对话行 - for raw_line in lines: - line = raw_line.strip() - match = re.match(r'^[""]?(用户|AI)\s*[::]\s*(.*)$', line) - if match: - role = match.group(1) - msg = match.group(2).strip().rstrip('""') - from app.core.memory.models.message_models import ConversationMessage - messages.append(ConversationMessage(role=role, msg=msg)) - - # 创建 DialogData - from app.core.memory.models.message_models import ConversationContext - conversation_context = ConversationContext(msgs=messages) - dialog_data = DialogData( - context=conversation_context, - ref_id=data['id'], - end_user_id=end_user_id, - metadata=dialog_metadata, - ) - - # 创建分块器并处理对话 - from app.core.memory.storage_services.extraction_engine.knowledge_extraction.chunk_extraction import ( - DialogueChunker, - ) - chunker = DialogueChunker(chunker_strategy) - extracted_chunks = await chunker.process_dialogue(dialog_data) - dialog_data.chunks = extracted_chunks - - dialog_data_list.append(dialog_data) - - # 保存输出 - def serialize_datetime(obj): - if isinstance(obj, datetime): - return obj.isoformat() - raise TypeError( - 
f"Object of type {obj.__class__.__name__} is not JSON serializable" - ) - - combined_output = [dd.model_dump() for dd in dialog_data_list] - from app.core.config import settings - settings.ensure_memory_output_dir() - output_path = settings.get_memory_output_path("chunker_test_output.txt") - - import json - with open(output_path, "w", encoding="utf-8") as f: - json.dump( - combined_output, f, ensure_ascii=False, indent=4, default=serialize_datetime - ) - - return dialog_data_list - - -def preprocess_data( - input_path: Optional[str] = None, - output_path: Optional[str] = None, - skip_cleaning: bool = True, - indices: Optional[List[int]] = None -) -> List[DialogData]: - """数据预处理 - - Args: - input_path: 原始数据路径 - output_path: 预处理后数据保存路径 - skip_cleaning: 是否跳过数据清洗步骤(默认False) - indices: 要处理的数据索引列表 - - Returns: - 经过清洗转换后的 DialogData 列表 - """ - logger.debug("=== 数据预处理 ===") - from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_preprocessor import ( - DataPreprocessor, - ) - preprocessor = DataPreprocessor() - try: - cleaned_data = preprocessor.preprocess(input_path=input_path, output_path=output_path, - skip_cleaning=skip_cleaning, indices=indices) - logger.debug(f"数据预处理完成!共处理了 {len(cleaned_data)} 条对话数据") - return cleaned_data - except Exception as e: - logger.error(f"数据预处理过程中出现错误: {e}") - raise - - -async def get_chunked_dialogs_from_preprocessed( - data: List[DialogData], - chunker_strategy: str = "RecursiveChunker", - llm_client: Optional[Any] = None, -) -> List[DialogData]: - """从预处理后的数据中生成分块 - - Args: - data: 预处理后的 DialogData 列表 - chunker_strategy: 分块策略 - llm_client: LLM 客户端(用于 LLMChunker) - - Returns: - 带 chunks 的 DialogData 列表 - """ - logger.debug(f"=== 批量对话分块处理 (使用 {chunker_strategy}) ===") - if not data: - raise ValueError("预处理数据为空,无法进行分块") - - all_chunked_dialogs: List[DialogData] = [] - from app.core.memory.storage_services.extraction_engine.knowledge_extraction.chunk_extraction import ( - DialogueChunker, - ) - - for dialog_data in 
data: - chunker = DialogueChunker(chunker_strategy, llm_client=llm_client) - chunks = await chunker.process_dialogue(dialog_data) - dialog_data.chunks = chunks - all_chunked_dialogs.append(dialog_data) - - return all_chunked_dialogs - - -async def get_chunked_dialogs_with_preprocessing( - chunker_strategy: str = "RecursiveChunker", - end_user_id: str = "default", - user_id: str = "default", - apply_id: str = "default", - indices: Optional[List[int]] = None, - input_data_path: Optional[str] = None, - llm_client: Optional[Any] = None, - skip_cleaning: bool = True, - pruning_config: Optional[Dict] = None, -) -> List[DialogData]: - """包含数据预处理步骤的完整分块流程 - - Args: - chunker_strategy: 分块策略 - end_user_id: 组ID - user_id: 用户ID - apply_id: 应用ID - indices: 要处理的数据索引列表 - input_data_path: 输入数据路径 - llm_client: LLM 客户端 - skip_cleaning: 是否跳过数据清洗步骤(默认False) - pruning_config: 剪枝配置字典,包含 pruning_switch, pruning_scene, pruning_threshold - - Returns: - 带 chunks 的 DialogData 列表 - """ - logger.debug("=== 完整数据处理流程(包含预处理)===") - - if input_data_path is None: - input_data_path = os.path.join( - os.path.dirname(__file__), "../../data", "testdata.json" - ) - - # 步骤1: 数据预处理(包含索引筛选) - from app.core.config import settings - settings.ensure_memory_output_dir() - preprocessed_data = preprocess_data( - input_path=input_data_path, - output_path=settings.get_memory_output_path("preprocessed_data.json"), - skip_cleaning=skip_cleaning, - indices=indices, - ) - - # 设置 end_user_id - for dd in preprocessed_data: - dd.end_user_id = end_user_id - - # 步骤2: 语义剪枝 - try: - from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_pruning import ( - SemanticPruner, - ) - from app.core.memory.models.config_models import PruningConfig - - # 构建剪枝配置 - if pruning_config: - # 使用传入的配置 - config = PruningConfig(**pruning_config) - logger.debug( - f"[剪枝] 使用传入配置: switch={config.pruning_switch}, scene={config.pruning_scene}, threshold={config.pruning_threshold}") - else: - # 使用默认配置(关闭剪枝) - config = None - 
logger.debug("[剪枝] 未提供配置,使用默认配置(剪枝关闭)") - - pruner = SemanticPruner(config=config, llm_client=llm_client) - - # 记录单对话场景下剪枝前的消息数量 - single_dialog_original_msgs = None - if len(preprocessed_data) == 1 and preprocessed_data[0].context: - single_dialog_original_msgs = len(preprocessed_data[0].context.msgs) - - preprocessed_data = await pruner.prune_dataset(preprocessed_data) - - # 单对话:打印清洗与剪枝信息 - if len(preprocessed_data) == 1 and single_dialog_original_msgs is not None: - remaining_msgs = len(preprocessed_data[0].context.msgs) if preprocessed_data[0].context else 0 - deleted_msgs = max(0, single_dialog_original_msgs - remaining_msgs) - logger.debug( - f"语义剪枝完成!剩余 1 条对话!原始消息数:{single_dialog_original_msgs}," - f"保留消息数:{remaining_msgs},删除 {deleted_msgs} 条。" - ) - else: - logger.debug(f"语义剪枝完成!剩余 {len(preprocessed_data)} 条对话") - - # 保存剪枝后的数据 - try: - from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_preprocessor import ( - DataPreprocessor, - ) - pruned_output_path = settings.get_memory_output_path("pruned_data.json") - dp = DataPreprocessor(output_file_path=pruned_output_path) - dp.save_data(preprocessed_data, output_path=pruned_output_path) - except Exception as se: - logger.error(f"保存剪枝结果失败:{se}") - except Exception as e: - logger.error(f"语义剪枝过程中出现错误,跳过剪枝: {e}") - - # 步骤3: 对话分块 - return await get_chunked_dialogs_from_preprocessed( - preprocessed_data, - chunker_strategy=chunker_strategy, - llm_client=llm_client, - ) diff --git a/api/app/core/memory/storage_services/extraction_engine/extraction_pipeline_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/extraction_pipeline_orchestrator.py index 3b0ce277..b93bdec2 100644 --- a/api/app/core/memory/storage_services/extraction_engine/extraction_pipeline_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/extraction_pipeline_orchestrator.py @@ -223,6 +223,7 @@ class NewExtractionOrchestrator: temporal_type=stmt_out.temporal_type, 
supporting_context=supporting_context, speaker=stmt_out.speaker, + dialog_at=stmt_out.dialog_at or "", valid_at=stmt_out.valid_at, invalid_at=stmt_out.invalid_at, has_unsolved_reference=stmt_out.has_unsolved_reference, @@ -494,10 +495,9 @@ class NewExtractionOrchestrator: else None ) for chunk in dialog.chunks: - # 仅对 speaker="user" 的 chunk 进行陈述句抽取;assistant 内容交给 - # 上游预处理/剪枝阶段处理,避免浪费 LLM 调用。 - chunk_speaker = getattr(chunk, "speaker", "user") - if chunk_speaker != "user": + # 仅跳过明确标记为 assistant 的 chunk;speaker=None(混合分块)正常处理。 + chunk_speaker = getattr(chunk, "speaker", None) + if chunk_speaker == "assistant": continue inp = StatementStepInput( chunk_id=chunk.id, @@ -506,6 +506,7 @@ class NewExtractionOrchestrator: target_message_date=str( getattr(dialog, "created_at", "") or "" ), + dialog_at=getattr(chunk, "dialog_at", "") or "", supporting_context=ctx, ) tasks.append(self.statement_temporal_step.run(inp)) @@ -561,10 +562,9 @@ class NewExtractionOrchestrator: chunk_stmts = all_stmt_results.get(dialog.id, {}) for _chunk_id, stmts in chunk_stmts.items(): for stmt in stmts: - # 防御性过滤:三元组抽取仅针对 user statement。 - # 上游 _extract_all_statements 已过滤 chunk.speaker,此处再做 - # 一次 statement.speaker 的二次校验,防止外部注入或 legacy 数据脱漏。 - if getattr(stmt, "speaker", "user") != "user": + # 防御性过滤:跳过明确标记为 assistant 的 statement。 + # speaker=None(混合分块)正常处理。 + if getattr(stmt, "speaker", None) == "assistant": continue inp = self._convert_to_triplet_input(stmt, ctx) tasks.append(self.triplet_step.run(inp)) diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py b/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py index 498dec54..597563a1 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py @@ -34,6 +34,7 @@ class StatementStepInput(BaseModel): 
end_user_id: str target_content: str target_message_date: str + dialog_at: str = "" # ISO 8601 timestamp of the source message; used as "now" for relative time resolution supporting_context: SupportingContext @@ -50,6 +51,7 @@ class StatementStepOutput(BaseModel): valid_at: str # ISO 8601 or "NULL" invalid_at: str # ISO 8601 or "NULL" has_unsolved_reference: bool = False # Whether the statement has unresolved references + dialog_at: str = "" # Passed through from input; carried into TripletStepInput # ── Triplet extraction ── @@ -62,6 +64,7 @@ class TripletStepInput(BaseModel): temporal_type: str supporting_context: SupportingContext speaker: str + dialog_at: str = "" # ISO 8601 timestamp of the source message; helps LLM ground entity descriptions in time valid_at: str invalid_at: str has_unsolved_reference: bool = False # From upstream statement extraction diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/statement_temporal_step.py b/api/app/core/memory/storage_services/extraction_engine/steps/statement_temporal_step.py index 7c0e3a48..d7427fae 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/statement_temporal_step.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/statement_temporal_step.py @@ -38,6 +38,7 @@ class _ExtractedStatement(BaseModel): False, description="Whether the statement reflects user's emotional state", ) + dialog_at: str = Field("", description="ISO 8601 session timestamp, copied verbatim from input") valid_at: str = Field("NULL", description="ISO 8601 or NULL") invalid_at: str = Field("NULL", description="ISO 8601 or NULL") has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references") @@ -106,6 +107,7 @@ class StatementTemporalExtractionStep(ExtractionStep[StatementStepInput, List[St input_json = { "chunk_id": input_data.chunk_id, "end_user_id": input_data.end_user_id, + "dialog_at": input_data.dialog_at or "", "target_content": 
input_data.target_content, "target_message_date": input_data.target_message_date, "supporting_context": { @@ -160,6 +162,7 @@ class StatementTemporalExtractionStep(ExtractionStep[StatementStepInput, List[St # relevance=stmt.relevance.strip().upper(), speaker="user", # default; orchestrator overrides from chunk metadata has_emotional_state=getattr(stmt, "has_emotional_state", False), + dialog_at=input_data.dialog_at or "", # carry through from input valid_at=stmt.valid_at or "NULL", invalid_at=stmt.invalid_at or "NULL", has_unsolved_reference=getattr(stmt, "has_unsolved_reference", False), diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py b/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py index 9f8953b8..684e2982 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py @@ -68,6 +68,7 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) ] }, "speaker": input_data.speaker, + "dialog_at": input_data.dialog_at or "", "valid_at": input_data.valid_at, "invalid_at": input_data.invalid_at, "has_unsolved_reference": input_data.has_unsolved_reference, diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 deleted file mode 100644 index f31e535a..00000000 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ /dev/null @@ -1,130 +0,0 @@ -你是一个面向记忆存储的 Assistant 辅助信息提取器。 - -任务: - -- 输入是一个 JSON,对话放在 `msgs` 数组里,且数组中只有两条消息:第一条是 `User`,第二条是 `Assistant`。 -- 你只处理第二条消息里的 `Assistant.msg`。 -- 第一条消息里的 `User.msg` 只用于理解上下文,不允许出现在输出里。 -- 你的输出必须包含两个字段: - 1. `assistant_memory_hint` - 2. 
`assistant_memory_type` - -目标: - -- 从 `Assistant.msg` 中提取一条适合后续检索的极短辅助摘要。 -- 删除冗长解释、寒暄、礼貌话术、重复复述和空泛铺垫。 -- 允许做摘要式改写,但只能保留原消息中已经出现的建议、推荐、提醒、安慰、步骤或其他对后续记忆有帮助的核心内容。 -- 如果没有值得保留的信息,`assistant_memory_hint` 输出 `"NULL"`,`assistant_memory_type` 也输出 `"NULL"`。 - -硬约束: - -- 不得改写、复述或输出 `User.msg`。 -- 不得捏造新事实、新建议、新步骤、新材料。 -- 不得改变 `Assistant` 原始语义和立场。 -- 可以压缩、合并、重写 `Assistant.msg`,但必须忠于原内容。 -- `assistant_memory_type` 只能从以下枚举中选择: - `comfort | suggestion | recommendation | warning | instruction | NULL` -- 只输出严格 JSON,不要输出解释。 - -压缩原则: - -- 优先保留具体建议、推荐、提醒、操作步骤、风险提示、安慰动作。 -- 优先删除长背景解释、寒暄、礼貌收尾、对用户原话的重复复述。 -- 如果原文是长说明、长步骤、长菜谱,输出更短的概要版本,但不要丢掉核心意图。 -- 优先保留最短但仍有信息密度的版本。 -- `assistant_memory_hint` 尽量写成完整句,不要只写零散词组或标签。 -- 优先使用显式主语来写结果,例如: - `安慰了用户……` - `建议用户……` - `推荐用户……` - `提醒用户……` - -Few-shot 示例 1 -输入: -{ - "msgs": [ - { - "role": "User", - "msg": "我室友小雯这学期一直在准备毕业论文,这两周都在改答辩 PPT。她下周三答辩,我有点担心她会紧张。" - }, - { - "role": "Assistant", - "msg": "听起来你很关心小雯,也希望她答辩顺利。她现在紧张其实很正常,很多人在答辩前都会这样。" - } - ] -} -输出: -{ - "assistant_memory_hint": "安慰了用户对室友答辩状态的担忧。", - "assistant_memory_type": "comfort" -} - -Few-shot 示例 2 -输入: -{ - "msgs": [ - { - "role": "User", - "msg": "我最近总失眠,已经两周了,想先自己调一调。" - }, - { - "role": "Assistant", - "msg": "如果你想先自己调整,可以先减少咖啡因摄入,尤其下午和晚上尽量不要再喝咖啡或浓茶,同时把睡前刷手机的时间压缩一些,尽量固定上床时间,先连续观察几天。" - } - ] -} -输出: -{ - "assistant_memory_hint": "建议用户减少咖啡因摄入、减少睡前刷手机时间并固定上床时间。", - "assistant_memory_type": "suggestion" -} - -Few-shot 示例 3 -输入: -{ - "msgs": [ - { - "role": "User", - "msg": "我晚上想做个简单点的减脂餐,最好二十分钟左右能搞定。" - }, - { - "role": "Assistant", - "msg": "你可以做一个鸡胸肉沙拉碗,主要用鸡胸肉、生菜、黄瓜和圣女果。鸡胸肉简单煎熟切块后和蔬菜拌在一起,调味尽量用橄榄油加一点醋,不要放太多沙拉酱。" - } - ] -} -输出: -{ - "assistant_memory_hint": "推荐用户做鸡胸肉沙拉碗,并提醒用户调味时少放沙拉酱。", - "assistant_memory_type": "recommendation" -} - -Few-shot 示例 4 -输入: -{ - "msgs": [ - { - "role": "User", - "msg": "剪枝引擎和萃取引擎我都想先做,但是估计都会比较花时间。" - }, - { - "role": "Assistant", - "msg": "这两个模块都涉及比较多的设计和实现细节。如果你想先推进,我建议先拆需求,再分别评估开发量。" - } - ] -} -输出: -{ - "assistant_memory_hint": 
"建议用户先拆需求,再分别评估两个模块的开发量。", - "assistant_memory_type": "suggestion" -} - -现在处理下面这个输入。 -输入: -{{ dialog_text }} - -只输出严格 JSON: -{ - "assistant_memory_hint": "", - "assistant_memory_type": "comfort | suggestion | recommendation | warning | instruction | NULL" -} \ No newline at end of file diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_pruning.jinja2 new file mode 100644 index 00000000..c2bec638 --- /dev/null +++ b/api/app/core/memory/utils/prompt/prompts/extracat_pruning.jinja2 @@ -0,0 +1,180 @@ +你是一个面向记忆存储的 Assistant 辅助信息压缩器。 + +任务: + +- 输入是一个 JSON,对话放在 `msgs` 数组里。 +- 你只处理 `Assistant.msg`。 +- `User.msg` 只用于理解上下文,不允许出现在输出里,也不允许被复述成用户摘要。 +- 你的输出必须包含两个字段: + 1. `assistant_memory_hint` + 2. `assistant_memory_type` + +目标: + +- 把较长的 `Assistant.msg` 压缩成一条更短、便于检索的辅助摘要。 +- 保留建议、推荐、提醒、说明、提问、附和、重复等核心动作。 +- 删除冗长解释、寒暄、礼貌套话和低价值铺垫,但不要漏掉真正有用的信息。 + +硬约束: + +- 不得输出或复述 `User.msg`。 +- 不得捏造新事实、新建议、新步骤、新材料或新限制。 +- 不得改变 `Assistant` 原始语义和立场。 +- 可以压缩、合并、重写 `Assistant.msg`,但必须忠于原内容。 +- `assistant_memory_hint` 必须是简短的完整句,尽量包含清晰主谓宾,不要只写零散词组。 +- 如果 `assistant_memory_hint` 里出现"室友""老师""朋友""同事""这件事"这类泛称,而上下文中存在清晰、稳定、唯一的指代对象,则优先改写成那个清晰指代对象。 +- 只有在当前两条消息里无法稳定落到唯一对象时,才保留泛称或模糊表达。 +- 如果对象本身已经足够清晰,例如"数据库作业""鸡胸肉沙拉""李教授",则不要为了"更具体"而做不必要的过度展开。 +- `assistant_memory_type` 只能从以下枚举中选择: + `comfort | suggestion | recommendation | warning | instruction | question | agreement | repetition | other` +- 如果 `Assistant.msg` 同时包含多个动作,`assistant_memory_hint` 可以保留多个动作,但 `assistant_memory_type` 只标记其中最主要、最值得检索的主动作。 +- 不再输出 `NULL`。即使内容价值较低,也要尽量压成一条最短的辅助摘要。 +- 如果 `Assistant.msg` 含有提问、追问或反问,`assistant_memory_hint` 必须保留提问的具体内容,不能只写"询问了用户"。 +- 如果提问里给出了明确选项、候选分支或对比项,`assistant_memory_hint` 应尽量保留这些选项,而不是只保留上位概括。 +- `question` 只在"提问/追问/反问"是这条消息的主推进动作时使用;如果消息里同时有建议和提问,但建议明显更核心,则类型标为 `suggestion`,并在 hint 里按需保留提问内容。 +- 对 `question` 类型,优先保留: + 1. 问题的核心主题 + 2. 明确给出的选项或分支 + 3. 
必要的限定条件 +- 对 `question` 类型,不要只保留寒暄式前缀,例如"听起来不错""如果方便的话";应保留真正要用户回答的部分。 +- 只输出严格 JSON,不要输出解释。 + +压缩原则: + +- 优先保留具体建议、推荐、提醒、操作步骤、风险提示和问题内容。 +- 对纯附和内容,压成极短摘要,例如"附和了用户对某事的看法。" +- 对明显重复用户内容的回复,压成极短摘要,例如"重复了用户关于某事的说法。" +- 对泛泛回应、空泛鼓励、礼貌性延展,压成最短可理解摘要,并标为 `other`。 +- 如果上下文里能确定人名、关系对象或具体事物,优先在摘要里写出明确对象,不要无必要地保留"室友""那个老师""这件事"这类泛称。 +- 如果原文里的对象已经明确且自然,就直接保留该对象,不要改写成更绕或更长的表达。 +- 如果问题中存在"是 A、B 还是 C"这类显式选项,优先保留 A、B、C,而不是只写成"询问用户偏好"。 +- 如果原文既有建议又有提问,允许在 hint 里同时保留;但 type 只标主动作。若提问是核心推进动作,则 type 标为 `question`;若建议更核心,则 type 标为 `suggestion`。 +- 优先使用显式主语来写结果,例如: + `安慰了用户……` + `建议用户……` + `推荐用户……` + `提醒用户……` + `询问用户……` + `附和了用户……` + `重复了用户……` + +类型判断补充: + +- `question`:主动作是向用户提问、追问、澄清、确认选项或收集偏好。 +- `suggestion`:主动作是给用户建议;即使末尾顺带问一句,也仍以建议为主。 +- `recommendation`:主动作是推荐某个方案、菜谱、产品或选择。 +- `warning`:主动作是提醒风险、限制、禁忌或后果。 +- `instruction`:主动作是说明操作顺序、步骤或执行流程。 +- `comfort`:主动作是安慰、理解、支持用户情绪。 +- `agreement`:主动作是附和、认同用户说法。 +- `repetition`:主动作是重复、转述用户已有内容,没有新增有效信息。 +- `other`:不适合归入以上类型,但仍值得压成一条短摘要。 + +Few-shot 示例 1 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我室友小雯这学期一直在准备毕业论文,这两周都在改答辩PPT。她下周三答辩,我有点担心她会紧张。" + }, + { + "role": "Assistant", + "msg": "听起来你很关心小雯,也希望她答辩顺利。她现在紧张其实很正常,很多人在答辩前都会这样。" + } + ] +} +输出: +{ + "assistant_memory_hint": "安慰了用户对室友小雯答辩状态的担忧。", + "assistant_memory_type": "comfort" +} + +Few-shot 示例 2 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我最近总失眠,已经两周了,想先自己调一调。" + }, + { + "role": "Assistant", + "msg": "如果你想先自己调整,可以先减少咖啡因摄入,尤其下午和晚上尽量不要再喝咖啡或浓茶,同时把睡前刷手机的时间压缩一些,尽量固定上床时间,先连续观察几天。" + } + ] +} +输出: +{ + "assistant_memory_hint": "建议用户减少咖啡因摄入、减少睡前刷手机时间并固定上床时间。", + "assistant_memory_type": "suggestion" +} + +Few-shot 示例 3 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我晚上想做个简单点的减脂餐,最好二十分钟左右能搞定。" + }, + { + "role": "Assistant", + "msg": "你可以做一个鸡胸肉沙拉碗,主要用鸡胸肉、生菜、黄瓜和圣女果。鸡胸肉简单煎熟切块后和蔬菜拌在一起,调味尽量用橄榄油加一点醋,不要放太多沙拉酱。" + } + ] +} +输出: +{ + "assistant_memory_hint": "推荐用户做鸡胸肉沙拉碗,并提醒用户调味时少放沙拉酱。", + "assistant_memory_type": "recommendation" +} + +Few-shot 示例 4 +输入: +{ + "msgs": [ + { + "role":
"User", + "msg": "听起来不错!" + }, + { + "role": "Assistant", + "msg": "听起来不错!你最喜欢吃什么类型的沙拉呢?是蔬菜沙拉、水果沙拉还是其他的?如果有任何特定的食材是你最喜欢的,也可以告诉我哦。" + } + ] +} +输出: +{ + "assistant_memory_hint": "询问用户更喜欢蔬菜沙拉、水果沙拉还是其他类型的沙拉,以及是否有偏好的食材。", + "assistant_memory_type": "question" +} + +Few-shot 示例 5 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我最近总失眠,白天特别困,想先自己调一调。" + }, + { + "role": "Assistant", + "msg": "你可以先减少下午和晚上的咖啡因摄入,睡前也尽量少看手机。如果方便的话,我还想了解一下,你通常晚上大概几点上床、几点真正睡着?" + } + ] +} +输出: +{ + "assistant_memory_hint": "建议用户减少下午和晚上的咖啡因摄入并减少睡前看手机,同时询问用户通常几点上床和几点入睡。", + "assistant_memory_type": "suggestion" +} + +现在处理下面这个输入。 +输入:{{ dialog_text }} + +只输出严格 JSON: +{ + "assistant_memory_hint": "", + "assistant_memory_type": "comfort | suggestion | recommendation | warning | instruction | question | agreement | repetition | other" +} diff --git a/api/app/core/memory/utils/prompt/prompts/extract_statement_temporal.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_statement_temporal.jinja2 index 9669144a..b851e643 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_statement_temporal.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_statement_temporal.jinja2 @@ -13,6 +13,7 @@ - temporal_type - has_emotional_state - has_unsolved_reference +- dialog_at - valid_at - invalid_at @@ -26,6 +27,7 @@ Your task is to identify and extract declarative statements from the provided ta - temporal_type - has_emotional_state - has_unsolved_reference +- dialog_at - valid_at - invalid_at @@ -37,15 +39,19 @@ Each output item should be a structured candidate memory statement. 
- chunk_id: chunk 唯一 ID - end_user_id: 终端用户 ID +- dialog_at: 会话时间,必须是 ISO 8601 时间点 - target_content: 当前要处理的对话片段文本,也是唯一允许被抽取的目标文本 -- target_message_date: 目标文本对应的时间,用于解析相对时间表达 +- dialog_at: 会话时间,优先作为解析相对时间表达的参考时间 +- target_message_date: 目标文本对应的时间,可作为辅助时间背景;当与 dialog_at 同时存在时,优先使用 dialog_at 解析相对时间表达 - supporting_context: 完整对话上下文,仅用于辅助理解 target_content,不能单独贡献新的可抽取事实 - supporting_context.msgs: 按顺序提供的上下文消息,可包含 User 和 Assistant {% else %} - chunk_id: unique chunk identifier - end_user_id: end-user identifier +- dialog_at: session time, which must be an ISO 8601 timestamp - target_content: the current dialogue fragment to process, and the only text span that may be extracted from -- target_message_date: the reference time for the target content, used for resolving relative temporal expressions +- dialog_at: session time, used as the primary reference for resolving relative temporal expressions +- target_message_date: the time associated with the target content and may serve as supporting temporal context; when both exist, prefer `dialog_at` for resolving relative expressions - supporting_context: full dialogue context used only to help interpret target_content and must not independently contribute new extractable facts - supporting_context.msgs: ordered contextual messages, which may include User and Assistant messages {% endif %} @@ -114,9 +120,18 @@ statement_type: 时间规则: - 仅使用目标文本中明确陈述或可由 `target_message_date` 直接解析的时间信息;不要使用外部知识补时间。 -- 使用 `target_message_date` 作为“现在”来解释相对时间,例如“昨天”“上周五”“下个月”。 +- 优先使用 `dialog_at` 作为“现在”来解释相对时间,例如“昨天”“上周五”“下个月”;只有在 `dialog_at` 缺失时才退回 `target_message_date`。 +- 如果相对时间可以稳定落到更具体的中文时间表达,就应直接改写进 `statement_text`,而不要保留原始模糊表达。 +- 可稳定具体化的示例包括: + - “昨天” -> “2026年4月29日” + - “前天晚上” -> “2026年4月28日晚上” + - “上周三” -> “2026年4月22日” + - “上个月” -> “2026年3月” + - “下周” -> “2026年5月4日至2026年5月10日” +- 如果相对时间只能粗粒度定位,保留该粗粒度但仍尽量具体化;例如“去年冬天”可以保留为“去年冬天”,不要强行伪精确到具体日期。 - `valid_at` 表示陈述开始成立或生效的时间。 - `invalid_at` 表示陈述结束或不再成立的时间;如果仍在持续,填 `"NULL"`。 +- `dialog_at` 表示当前会话时间,每条 
statement 都必须原样复制输入中的 `dialog_at`。 - 时间格式优先使用 ISO 8601。 - 对于只有日期没有时分秒的时间,默认使用整天边界,便于后续检索。 - 如果没有明确时间,不要编造时间。 @@ -185,10 +200,19 @@ statement_type: Temporal rules: -- Use only temporal information explicitly stated in the target text or directly resolvable from `target_message_date`; do not add dates from external knowledge. -- Use `target_message_date` as “now” when interpreting relative expressions such as “yesterday,” “last Friday,” or “next month.” +- Use only temporal information explicitly stated in the target text or directly resolvable from `dialog_at` / `target_message_date`; do not add dates from external knowledge. +- Prefer `dialog_at` as “now” when interpreting relative expressions such as “yesterday,” “last Friday,” or “next month”; only fall back to `target_message_date` when `dialog_at` is unavailable. +- If a relative time can be stably grounded to a more concrete time expression, rewrite it directly into `statement_text` (in the same language as the statement) rather than keeping the vague source phrase. +- Examples of stable concretization (assuming `dialog_at` is 2026-04-30): + - “yesterday” -> “2026-04-29” + - “the night before last” -> “the evening of 2026-04-28” + - “last Wednesday” -> “2026-04-22” + - “last month” -> “March 2026” + - “next week” -> “2026-05-04 to 2026-05-10” +- If the relative time can only be grounded coarsely, keep that coarse granularity while still making it as concrete as reasonably possible; for example, “last winter” may stay as “last winter” instead of being forced into fake exact dates. - `valid_at` means when the statement became valid or started to hold. - `invalid_at` means when the statement ended or stopped being valid; use `"NULL"` if it is still ongoing. +- `dialog_at` is the session timestamp, and every statement must copy the input `dialog_at` verbatim. - Prefer ISO 8601 for time values. - When only a date can be resolved, default to full-day boundaries for retrieval use. - If no explicit time is available, do not invent one. 
@@ -213,6 +237,9 @@ temporal_type: Rewrite boundary: - Minimal rewriting is allowed only to resolve reference, ellipsis, and temporal ambiguity. +- For resolvable relative time expressions, rewrite them into grounded, concrete time expressions (same language as the statement) directly inside `statement_text`. +- Do not keep both the vague source phrase and the grounded phrase together; output only the rewritten concrete form. +- Do not fake precision for time expressions that cannot be grounded reliably from `dialog_at`. - Do not introduce unsupported facts, extra inference, or stylistic summarization. {% endif %} @@ -222,6 +249,7 @@ Rewrite boundary: 示例输入: { "chunk_id": "chunk_a1b2c3d4", "end_user_id": "eu_12345678", + "dialog_at": "2023-09-04T18:00:00Z", "target_content": "老李这学期要求还是一如既往地严,不过他讲课确实清晰透彻,而且每节课的结构都特别清楚。就是气场实在太吓人了,我每次被他点名都有点发怵。", "target_message_date": "2023-09-04T18:00:00", "supporting_context": { @@ -247,6 +275,7 @@ Rewrite boundary: "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" }, @@ -257,6 +286,7 @@ Rewrite boundary: "temporal_type": "ATEMPORAL", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "NULL", "invalid_at": "NULL" }, @@ -267,6 +297,7 @@ Rewrite boundary: "temporal_type": "DYNAMIC", "has_emotional_state": true, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" } @@ -277,6 +308,7 @@ Rewrite boundary: 示例输入: { "chunk_id": "chunk_b2c3d4e5", "end_user_id": "eu_12345678", + "dialog_at": "2026-04-01T00:00:00Z", "target_content": "我最近在学Python,每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。", "target_message_date": "2026-04-01T00:00:00", "supporting_context": { @@ -302,6 +334,7 @@ Rewrite boundary: "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at":
"2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, @@ -312,16 +345,18 @@ Rewrite boundary: "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_u1v2w3x4", - "statement_text": "用户这周打算先复习Python的基础语法和函数部分。", + "statement_text": "用户计划在2026年3月30日至2026年4月5日先复习Python的基础语法和函数部分。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" } @@ -332,6 +367,7 @@ Rewrite boundary: 示例输入: { "chunk_id": "chunk_c3d4e5f6", "end_user_id": "eu_12345678", + "dialog_at": "2026-04-01T00:00:00Z", "target_content": "这周老师新布置的那两个我觉得有点难,而且我昨晚看了半天还是没太搞明白。要是周末再弄不出来,我可能就得去问助教了。", "target_message_date": "2026-04-01T00:00:00", "supporting_context": { @@ -352,31 +388,34 @@ Rewrite boundary: "statements": [ { "statement_id": "stmt_y5z6a7b8", - "statement_text": "用户觉得那两个有点难。", + "statement_text": "用户觉得2026年3月30日至2026年4月5日老师新布置的那两个内容有点难。", "statement_type": "OPINION", "temporal_type": "DYNAMIC", "has_emotional_state": true, "has_unsolved_reference": true, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_c9d0e1f2", - "statement_text": "用户昨晚看了半天那两个还是没太搞明白。", + "statement_text": "用户2026年3月31日晚上看了半天那两个内容还是没太搞明白。", "statement_type": "FACT", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": true, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-03-31T00:00:00", "invalid_at": "2026-03-31T23:59:59" }, { "statement_id": "stmt_g3h4i5j6", - "statement_text": "如果周末还弄不出来,用户可能会去问助教。", + "statement_text": "如果到2026年4月4日至2026年4月5日还弄不出来,用户可能会去问助教。", "statement_type": "OTHER", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": true, + 
"dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" } @@ -387,6 +426,7 @@ Example 1: Example Input: { "chunk_id": "chunk_a1b2c3d4", "end_user_id": "eu_12345678", + "dialog_at": "2023-09-04T18:00:00Z", "target_content": "Old Li is just as strict as ever this semester, but he really explains things clearly and the structure of every class is extremely clear. His presence is honestly kind of intimidating, and I get nervous every time he calls on me.", "target_message_date": "2023-09-04T18:00:00", "supporting_context": { @@ -412,6 +452,7 @@ Example Output: { "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" }, @@ -422,6 +463,7 @@ Example Output: { "temporal_type": "ATEMPORAL", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "NULL", "invalid_at": "NULL" }, @@ -432,6 +474,7 @@ Example Output: { "temporal_type": "DYNAMIC", "has_emotional_state": true, "has_unsolved_reference": false, + "dialog_at": "2023-09-04T18:00:00Z", "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" } @@ -442,6 +485,7 @@ Example 2: Example Input: { "chunk_id": "chunk_b2c3d4e5", "end_user_id": "eu_12345678", + "dialog_at": "2026-04-01T00:00:00Z", "target_content": "I've been learning Python recently, and I practice for an hour every night. 
This week I also plan to review basic syntax and functions first.", "target_message_date": "2026-04-01T00:00:00", "supporting_context": { @@ -467,6 +511,7 @@ Example Output: { "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, @@ -477,16 +522,18 @@ Example Output: { "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_u1v2w3x4", - "statement_text": "The user plans to review Python basic syntax and functions first this week.", + "statement_text": "The user plans to review Python basic syntax and functions first during 2026-03-30 to 2026-04-05.", "statement_type": "FACT", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": false, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" } @@ -497,6 +544,7 @@ Example 3: Example Input: { "chunk_id": "chunk_c3d4e5f6", "end_user_id": "eu_12345678", + "dialog_at": "2026-04-01T00:00:00Z", "target_content": "The two things the teacher assigned this week seem hard to me, and even after looking at them for a long time last night I still didn't really understand them. 
If I still can't finish them by the weekend, I may have to ask the TA.", "target_message_date": "2026-04-01T00:00:00", "supporting_context": { @@ -517,31 +565,34 @@ Example Output: { "statements": [ { "statement_id": "stmt_y5z6a7b8", - "statement_text": "The user thinks those two things are difficult.", + "statement_text": "The user thinks the two items assigned during 2026-03-30 to 2026-04-05 are difficult.", "statement_type": "OPINION", "temporal_type": "DYNAMIC", "has_emotional_state": true, "has_unsolved_reference": true, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_c9d0e1f2", - "statement_text": "The user spent a long time last night looking at those two things but still did not really understand them.", + "statement_text": "The user spent a long time on the evening of 2026-03-31 looking at those two items but still did not really understand them.", "statement_type": "FACT", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": true, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-03-31T00:00:00", "invalid_at": "2026-03-31T23:59:59" }, { "statement_id": "stmt_g3h4i5j6", - "statement_text": "If the user still cannot finish them by the weekend, the user may ask the TA.", + "statement_text": "If the user still cannot finish them by 2026-04-04 to 2026-04-05, the user may ask the TA.", "statement_type": "OTHER", "temporal_type": "DYNAMIC", "has_emotional_state": false, "has_unsolved_reference": true, + "dialog_at": "2026-04-01T00:00:00Z", "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" } @@ -557,6 +608,7 @@ Example Output: { - 如果主语是用户,是否统一写“用户” - 非用户主体是否尽量写成具体名称;若无法做到,是否已正确标记 `has_unsolved_reference = true` - 如果最终 `statement_text` 已经落到具体实体名,`has_unsolved_reference` 是否已经改为 `false` +- 如果 `statement_text` 中出现可由 `dialog_at` 稳定解析的相对时间,是否已经改写成更具体的日期、月份或日期区间表达 - statement_type 是否合法,且没有把一般事实机械标成 `OPINION` - `has_emotional_state` 
是否仅用于判断是否存在情感状态,而没有被当作情绪分类字段 - temporal_type 是否与 valid_at / invalid_at 一致 @@ -567,6 +619,7 @@ Example Output: { - If the subject is the user, render it as “the user” - Render non-user subjects as concrete names when possible; otherwise mark `has_unsolved_reference = true` - If the final `statement_text` already resolves the reference to a concrete named entity, ensure `has_unsolved_reference = false` +- If `statement_text` contains relative time expressions that can be stably resolved from `dialog_at`, rewrite them into more concrete date, month, or date-range expressions - Ensure statement_type is valid and do not mechanically label ordinary facts as `OPINION` - Ensure `has_emotional_state` is used only for emotional-state presence detection, not emotion classification - Ensure temporal_type is consistent with valid_at and invalid_at @@ -584,6 +637,7 @@ Example Output: { **ISO 8601 HARD CONSTRAINT:** +- `dialog_at` must be ISO 8601. - `target_message_date` must be ISO 8601. - `valid_at` and `invalid_at` must be ISO 8601, or `"NULL"` when no time is available. - Do not output non-ISO values such as `2026/04/01`, `2026-04-01 00:00:00`, `yesterday evening`, or `下周三`. @@ -615,6 +669,7 @@ Return only a JSON object matching the schema below: "temporal_type": "STATIC | DYNAMIC | ATEMPORAL", "has_emotional_state": "boolean", "has_unsolved_reference": "boolean", + "dialog_at": "string", "valid_at": "string | NULL", "invalid_at": "string | NULL" } diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 421b7381..df891829 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -39,6 +39,7 @@ Extract entities and knowledge triplets from the given statement. 
- `supporting_context.msgs[].role`: `User` / `Assistant` - `supporting_context.msgs[].msg`: 消息文本 - `speaker`: `user` / `assistant` +- `dialog_at`: 会话时间,ISO 8601 时间点;可用于在 `description` 中标注实体的时间背景 - `valid_at`: ISO 8601 时间点,或 `NULL` - `invalid_at`: ISO 8601 时间点,或 `NULL` - `has_unsolved_reference`: 布尔值 @@ -53,6 +54,7 @@ Extract entities and knowledge triplets from the given statement. - `supporting_context.msgs[].role`: `User` / `Assistant` - `supporting_context.msgs[].msg`: message text - `speaker`: `user` / `assistant` +- `dialog_at`: session time as an ISO 8601 timestamp; may be used to anchor temporal context in entity `description` - `valid_at`: ISO 8601 timestamp or `NULL` - `invalid_at`: ISO 8601 timestamp or `NULL` - `has_unsolved_reference`: boolean @@ -113,6 +115,7 @@ Primary statement to analyze: - 如果某条信息只出现在 `supporting_context.msgs` 中,而没有出现在 `statement_text` 中,就不要输出它。 - 如果 `supporting_context.msgs` 中的 Assistant 消息包含总结、猜测、解释或改写,这些内容只能作为理解辅助,不能直接作为抽取来源。 - `statement_type`、`temporal_type` 是辅助理解字段,不是抽取目标。 +- `dialog_at` 是辅助时间上下文字段,不是抽取目标。 - `valid_at`、`invalid_at` 不用于决定是否创建实体或关系,但如果产生 triplet,必须原样复制到每个 triplet 的同名字段中。 - 对 `statement_text` 中的用户自指表达,要统一规范成实体 `用户`。 - 对其他可稳定解析的代词或指示表达,要替换为具体指代实体名后再抽取。 @@ -124,6 +127,7 @@ Primary statement to analyze: - If some information appears only in `supporting_context.msgs` but not in `statement_text`, do not include it in the output. - If Assistant messages in `supporting_context.msgs` contain summary, guess, interpretation, or rephrasing, use them only as interpretive support and never as a direct extraction source. - Treat `statement_type` and `temporal_type` as auxiliary context, not extraction targets. +- Treat `dialog_at` as auxiliary session-time context, not an extraction target. - Do not use `valid_at` or `invalid_at` to decide whether to create entities or relations, but if any triplet is produced, copy them verbatim into every triplet field with the same names. 
- Normalize user self-reference in `statement_text` to the entity `用户`. - Replace other resolvable pronouns or demonstratives with their resolved entity names before extraction. @@ -493,6 +497,8 @@ Do not let auxiliary fields drive the extraction process. - 优先描述实体在当前陈述和必要上下文中的身份、作用或关系。 - `description` 只保留适合长期附着在该实体上的描述,例如稳定身份、稳定关系、长期偏好/兴趣/习惯、较稳定认知倾向或可用于区分实体的持久特征。 - 不要把短期状态、一次性事件、临时计划、当前情绪、具体时间锚点,或只在当前句子里短暂成立的信息写进 `description`。 +- 但如果第一步已经把相对时间稳定改写成具体日期、月份或日期区间,且这段具体时间对识别当前实体有帮助,可以在 `description` 中沿用这段已经出现在 `statement_text` 里的具体时间表达。 +- triplet 这一步不要自己新增时间推理;只允许复用 `statement_text` 中已经具体化的时间表述,不要把“上周三”“上个月”再次自行展开。 - 如果实体应保留,但当前 statement 中没有适合长期附着在该实体上的稳定描述,则 `description` 允许为空字符串 `""`;不要为了填充 `description` 而写入短期状态或临时信息。 - 避免使用“陈述中提到的人物”“陈述中提到的组织”“陈述中提到的物品”这类低信息量模板。 - 不要补充识别实体所不需要的外部知识。 @@ -501,6 +507,8 @@ Do not let auxiliary fields drive the extraction process. - Prefer describing the entity's role, identity, or relation in the current statement and necessary supporting context. - `description` should keep only information suitable to remain attached to the entity over time, such as stable identity, stable relations, long-term preferences/interests/habits, relatively stable beliefs, or persistent distinguishing traits. - Do not put short-lived states, one-off events, temporary plans, current emotions, concrete time anchors, or information that only briefly holds in the current sentence into `description`. +- But if step 1 has already rewritten a relative time into a concrete date, month, or date range, and that concrete time phrase helps identify the current entity, you may reuse that already-grounded phrase in `description`. +- Do not perform new temporal inference in the triplet step; only reuse time wording that is already concretized in `statement_text`, and do not independently expand phrases like "last Wednesday" or "last month" again here. 
- If an entity should be retained but the current statement does not provide any suitable stable description for it, `description` may be the empty string `""`; do not fill it with short-lived states or temporary information just to avoid emptiness. - Avoid low-information templates such as "the person mentioned in the statement" or "the organization mentioned in the statement". - Do not add extra world knowledge that is not needed for identifying the entity in context. @@ -659,7 +667,7 @@ Output: } **示例 4** -Statement: "他上个月加入了这家公司。" +Statement: "他2026年3月加入了这家公司。" Input condition: `"has_unsolved_reference": true` Output: diff --git a/api/app/services/memory_agent_service.py b/api/app/services/memory_agent_service.py index 6e2c6c32..48ae92b6 100644 --- a/api/app/services/memory_agent_service.py +++ b/api/app/services/memory_agent_service.py @@ -34,7 +34,6 @@ from app.core.memory.agent.utils.messages_tools import ( reorder_output_results, ) from app.core.memory.agent.utils.type_classifier import status_typle -from app.core.memory.agent.utils.write_tools import write as write_neo4j from app.core.memory.analytics.hot_memory_tags import get_interest_distribution from app.core.memory.memory_service import MemoryService from app.core.memory.utils.llm.llm_utils import MemoryClientFactory @@ -447,32 +446,20 @@ class MemoryAgentService: memory_config, language: Language | str, ) -> None: - """根据 NEW_PIPELINE_ENABLED 选择新旧流水线写入 Neo4j。""" - # 统一转换为 dict,下游流水线期望 list[dict] + """使用新流水线(MemoryService → WritePipeline)写入 Neo4j。""" messages_dict = [ msg if isinstance(msg, dict) else msg.model_dump(exclude_none=True) for msg in messages ] - use_new_pipeline = os.getenv("NEW_PIPELINE_ENABLED", "false").lower() == "true" - - if use_new_pipeline: - service = MemoryService(memory_config=memory_config, end_user_id=end_user_id) - result = await service.write( - messages=messages_dict, language=language, ref_id='', - ) - logger.info( - f"[NewPipeline] 完成: status={result.status}, " - 
f"elapsed={result.elapsed_seconds:.2f}s, " - f"extraction={result.extraction}" - ) - else: - await write_neo4j( - end_user_id=end_user_id, - messages=messages_dict, - memory_config=memory_config, - ref_id='', - language=language, - ) + service = MemoryService(memory_config=memory_config, end_user_id=end_user_id) + result = await service.write( + messages=messages_dict, language=language, ref_id='', + ) + logger.info( + f"[WritePipeline] 完成: status={result.status}, " + f"elapsed={result.elapsed_seconds:.2f}s, " + f"extraction={result.extraction}" + ) async def _invalidate_interest_cache(self, end_user_id: str) -> None: """写入完成后失效兴趣分布缓存。""" diff --git a/api/app/services/pilot_run_service.py b/api/app/services/pilot_run_service.py index 122e2181..b1efe64a 100644 --- a/api/app/services/pilot_run_service.py +++ b/api/app/services/pilot_run_service.py @@ -2,6 +2,11 @@ Pilot Run Service - 试运行服务 用于执行记忆系统的试运行流程,不保存到 Neo4j。 + +职责边界: +- 文本解析、语义剪枝、语义分块(预处理) +- 调用 PilotWritePipeline 执行萃取链路 +- 输出结果文件 """ import os @@ -17,17 +22,10 @@ from app.core.memory.models.message_models import ( ConversationMessage, DialogData, ) -from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import ( - ExtractionOrchestrator, - get_chunked_dialogs_from_preprocessed, -) from app.core.memory.storage_services.extraction_engine.pipeline_help import ( _write_extracted_result_summary, export_test_input_doc, ) -from app.core.memory.utils.config.config_utils import get_pipeline_config -from app.core.memory.utils.llm.llm_utils import MemoryClientFactory -from app.repositories.neo4j.neo4j_connector import Neo4jConnector from app.schemas.memory_config_schema import MemoryConfig from sqlalchemy.orm import Session @@ -77,18 +75,19 @@ async def run_pilot_extraction( progress_callback: Optional[Callable[[str, str, Optional[dict]], Awaitable[None]]] = None, language: str = "zh", ) -> None: - """ - 执行试运行模式的知识提取流水线。 + """执行试运行模式的知识提取流水线。 + + 职责: + 1. 
文本解析 → 语义剪枝 → 语义分块(预处理,需要 llm_client) + 2. 调用 PilotWritePipeline 执行萃取链路(Pipeline 自行管理客户端) + 3. 将萃取结果写入输出文件 Args: memory_config: 从数据库加载的内存配置对象 dialogue_text: 输入的对话文本 - db: 数据库会话 - progress_callback: 可选的进度回调函数 - - 参数1 (stage): 当前处理阶段标识符 - - 参数2 (message): 人类可读的进度消息 - - 参数3 (data): 可选的附加数据字典 - language: 语言类型 ("zh" 中文, "en" 英文),默认中文 + db: 数据库会话(用于初始化预处理所需的 LLM 客户端) + progress_callback: 可选的进度回调 (stage, message, data) + language: 语言类型 ("zh" | "en") """ log_file = "logs/time.log" os.makedirs(os.path.dirname(log_file), exist_ok=True) @@ -99,21 +98,16 @@ async def run_pilot_extraction( pipeline_start = time.time() try: - # 步骤 1: 初始化客户端 - logger.info("Initializing clients...") + # ── 步骤 1: 初始化预处理所需的 LLM 客户端 ────────────────────────── + # 只用于语义剪枝和分块,PilotWritePipeline 内部会自行初始化萃取客户端 step_start = time.time() - - client_factory = MemoryClientFactory(db) - llm_client = client_factory.get_llm_client(str(memory_config.llm_model_id)) - embedder_client = client_factory.get_embedder_client(str(memory_config.embedding_model_id)) - + from app.core.memory.utils.llm.llm_utils import MemoryClientFactory + factory = MemoryClientFactory(db) + llm_client = factory.get_llm_client(str(memory_config.llm_model_id)) log_time("Client Initialization", time.time() - step_start, log_file) - # 步骤 2: 解析对话文本 - logger.info("Parsing dialogue text...") + # ── 步骤 2: 文本解析 ──────────────────────────────────────────────── step_start = time.time() - - # 解析对话文本,支持 "用户:" 和 "AI:" 格式 pattern = r"(用户|AI)[::]\s*([^\n]+(?:\n(?!(?:用户|AI)[::])[^\n]*)*?)" matches = re.findall(pattern, dialogue_text, re.MULTILINE | re.DOTALL) messages = [ @@ -121,14 +115,11 @@ async def run_pilot_extraction( for r, c in matches if c.strip() ] - - # 如果没有匹配到格式化的对话,将整个文本作为用户消息 if not messages: messages = [ConversationMessage(role="用户", msg=dialogue_text.strip())] - context = ConversationContext(msgs=messages) dialog = DialogData( - context=context, + context=ConversationContext(msgs=messages), ref_id="pilot_dialog_1", 
end_user_id=str(memory_config.workspace_id), user_id=str(memory_config.tenant_id), @@ -139,267 +130,142 @@ async def run_pilot_extraction( if progress_callback: await progress_callback("text_preprocessing", "开始预处理文本(语义剪枝 + 语义分块)...") - # ========== 步骤 2.1: 语义剪枝 ========== + # ── 步骤 2.1: 语义剪枝 ───────────────────────────────────────────── pruned_dialogs = [dialog] - deleted_messages = [] # 记录被删除的消息 - pruning_stats = None # 保存剪枝统计信息,用于最终汇总 - + pruning_stats: dict = {"enabled": False} + if memory_config.pruning_enabled: try: from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_pruning import ( SemanticPruner, ) from app.core.memory.models.config_models import PruningConfig - - # 构建剪枝配置 - pruning_config_dict = { - "pruning_switch": memory_config.pruning_enabled, - "pruning_scene": memory_config.pruning_scene, - "pruning_threshold": memory_config.pruning_threshold, - "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, - "ontology_class_infos": memory_config.ontology_class_infos, - } - config = PruningConfig(**pruning_config_dict) - - logger.info(f"[PILOT_RUN] 开始语义剪枝: scene={config.pruning_scene}, threshold={config.pruning_threshold}") - - # 记录剪枝前的消息(用于对比) - original_messages = [{"role": msg.role, "content": msg.msg} for msg in dialog.context.msgs] - original_msg_count = len(original_messages) - - # 执行剪枝 - pruner = SemanticPruner(config=config, llm_client=llm_client) - pruned_dialogs = await pruner.prune_dataset([dialog]) - - # 计算剪枝结果并找出被删除的消息 + + config = PruningConfig( + pruning_switch=memory_config.pruning_enabled, + pruning_scene=memory_config.pruning_scene, + pruning_threshold=memory_config.pruning_threshold, + scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, + ontology_class_infos=memory_config.ontology_class_infos, + ) + original_msgs = [{"role": m.role, "content": m.msg} for m in dialog.context.msgs] + pruned_dialogs = await SemanticPruner(config=config, 
llm_client=llm_client).prune_dataset([dialog]) + if pruned_dialogs and pruned_dialogs[0].context: - remaining_messages = [{"role": msg.role, "content": msg.msg} for msg in pruned_dialogs[0].context.msgs] - remaining_msg_count = len(remaining_messages) - deleted_msg_count = original_msg_count - remaining_msg_count - - # 找出被删除的消息(基于索引精确匹配) - # 为剩余消息创建带索引的列表,用于精确追踪 - remaining_with_index = [] - remaining_idx = 0 - for orig_idx, orig_msg in enumerate(original_messages): - if remaining_idx < len(remaining_messages) and \ - orig_msg["role"] == remaining_messages[remaining_idx]["role"] and \ - orig_msg["content"] == remaining_messages[remaining_idx]["content"]: - remaining_with_index.append(orig_idx) - remaining_idx += 1 - - # 找出未在保留列表中的消息索引 + remaining = [{"role": m.role, "content": m.msg} for m in pruned_dialogs[0].context.msgs] + # 找出被删除的消息(顺序匹配) + kept_indices: list[int] = [] + ri = 0 + for oi, om in enumerate(original_msgs): + if ri < len(remaining) and om == remaining[ri]: + kept_indices.append(oi) + ri += 1 deleted_messages = [ - {"index": idx, "role": msg["role"], "content": msg["content"]} - for idx, msg in enumerate(original_messages) - if idx not in remaining_with_index + {"index": i, "role": m["role"], "content": m["content"]} + for i, m in enumerate(original_msgs) + if i not in kept_indices ] - - # 保存剪枝统计信息(用于最终汇总,只保留deleted_count) pruning_stats = { "enabled": True, "scene": config.pruning_scene, "threshold": config.pruning_threshold, - "deleted_count": deleted_msg_count, + "deleted_count": len(deleted_messages), } - - # 输出剪枝结果(显示删除的消息详情) - pruning_result = { - "type": "pruning", - "deleted_messages": deleted_messages, - } - logger.info( - f"[PILOT_RUN] 语义剪枝完成: 原始{original_msg_count}条 -> " - f"保留{remaining_msg_count}条 (删除{deleted_msg_count}条)" + f"[PILOT_RUN] 语义剪枝完成: {len(original_msgs)} → {len(remaining)} 条" + f"(删除 {len(deleted_messages)} 条)" ) - if progress_callback: - await progress_callback("text_preprocessing_result", "语义剪枝完成", pruning_result) + await 
progress_callback( + "text_preprocessing_result", "语义剪枝完成", + {"type": "pruning", "deleted_messages": deleted_messages}, + ) else: logger.warning("[PILOT_RUN] 剪枝后对话为空,使用原始对话") pruned_dialogs = [dialog] - + except Exception as e: logger.error(f"[PILOT_RUN] 语义剪枝失败,使用原始对话: {e}", exc_info=True) pruned_dialogs = [dialog] if progress_callback: - error_result = { - "type": "pruning", - "error": str(e), - "fallback": "使用原始对话" - } - await progress_callback("text_preprocessing_result", "语义剪枝失败", error_result) - else: - logger.info("[PILOT_RUN] 语义剪枝已关闭,跳过") - pruning_stats = { - "enabled": False, - } + await progress_callback( + "text_preprocessing_result", "语义剪枝失败", + {"type": "pruning", "error": str(e), "fallback": "使用原始对话"}, + ) - # ========== 步骤 2.2: 语义分块 ========== - chunked_dialogs = await get_chunked_dialogs_from_preprocessed( - data=pruned_dialogs, - chunker_strategy=memory_config.chunker_strategy, - llm_client=llm_client, + # ── 步骤 2.2: 语义分块 ───────────────────────────────────────────── + from app.core.memory.storage_services.extraction_engine.knowledge_extraction.chunk_extraction import ( + DialogueChunker, ) - - remaining_msg_count = len(pruned_dialogs[0].context.msgs) if pruned_dialogs and pruned_dialogs[0].context else 0 - logger.info(f"Processed dialogue text: {remaining_msg_count} messages after pruning") + chunked_dialogs = [] + for dlg in pruned_dialogs: + dlg.chunks = await DialogueChunker(memory_config.chunker_strategy, llm_client=llm_client).process_dialogue(dlg) + chunked_dialogs.append(dlg) - # 进度回调:输出每个分块的结果 if progress_callback: for dlg in chunked_dialogs: - if hasattr(dlg, 'chunks') and dlg.chunks: - for i, chunk in enumerate(dlg.chunks): - chunk_result = { + for i, chunk in enumerate(dlg.chunks or []): + await progress_callback( + "text_preprocessing_result", f"分块 {i + 1} 处理完成", + { "type": "chunking", "chunk_index": i + 1, "content": chunk.content[:200] + "..." 
if len(chunk.content) > 200 else chunk.content, "full_length": len(chunk.content), "dialog_id": dlg.id, "chunker_strategy": memory_config.chunker_strategy, - } - await progress_callback("text_preprocessing_result", f"分块 {i + 1} 处理完成", chunk_result) - - # 构建预处理完成总结(包含剪枝统计) - preprocessing_summary = { - "total_chunks": sum(len(dlg.chunks) for dlg in chunked_dialogs if hasattr(dlg, 'chunks') and dlg.chunks), - "total_dialogs": len(chunked_dialogs), - "chunker_strategy": memory_config.chunker_strategy, - } - - # 添加剪枝统计信息(始终包含 pruning 字段,确保前端不会因字段缺失报错) - preprocessing_summary["pruning"] = pruning_stats if pruning_stats else { - "enabled": memory_config.pruning_enabled, - "deleted_count": 0, - } - - await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary) + }, + ) + await progress_callback( + "text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", + { + "total_chunks": sum(len(dlg.chunks or []) for dlg in chunked_dialogs), + "total_dialogs": len(chunked_dialogs), + "chunker_strategy": memory_config.chunker_strategy, + "pruning": pruning_stats, + }, + ) log_time("Data Loading & Chunking", time.time() - step_start, log_file) - # 步骤 3: 初始化并选择试运行流水线(环境变量可切换) - use_refactored = bool(settings.PILOT_RUN_USE_REFACTORED_PIPELINE) - logger.info( - "Selecting pilot pipeline by env: PILOT_RUN_USE_REFACTORED_PIPELINE=%s", - use_refactored, - ) - logger.info( - "Initializing %s pilot pipeline...", - "refactored" if use_refactored else "legacy", - ) + # ── 步骤 3: 萃取(PilotWritePipeline 自行管理客户端和本体加载)────── step_start = time.time() + logger.info("Running pilot extraction pipeline...") - # 加载本体类型(如果配置了 scene_id),支持通用类型回退 - ontology_types = None - try: - from app.core.memory.ontology_services.ontology_type_loader import load_ontology_types_with_fallback - - ontology_types = load_ontology_types_with_fallback( - scene_id=memory_config.scene_id, - workspace_id=memory_config.workspace_id, - db=db, - enable_general_fallback=True - ) - except Exception as e: - 
logger.warning(f"Failed to load ontology types: {e}", exc_info=True) + if progress_callback: + await progress_callback("knowledge_extraction", "正在知识抽取...") - if use_refactored: - from app.core.memory.memory_service import MemoryService + from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline - memory_service = MemoryService( - memory_config=memory_config, - end_user_id=str(memory_config.workspace_id), - ) - log_time("Pilot Pipeline Initialization", time.time() - step_start, log_file) - - # 步骤 4a: 执行重构后试运行短链路 - # statement -> triplet -> graph_build -> 第一层去重消歧(结束) - logger.info("Running refactored pilot extraction short pipeline...") - step_start = time.time() - - if progress_callback: - await progress_callback("knowledge_extraction", "正在知识抽取...") - - pilot_result = await memory_service.pilot_write( - chunked_dialogs=chunked_dialogs, - language=language, - progress_callback=progress_callback, - ) - dialog_data_list = pilot_result.dialog_data_list - graph = pilot_result.graph - chunk_nodes = graph.chunk_nodes - export_entity_nodes = graph.entity_nodes - export_stmt_entity_edges = graph.stmt_entity_edges - export_entity_edges = graph.entity_entity_edges - else: - # 步骤 4b: 执行旧试运行流水线 - logger.info("Running legacy pilot extraction pipeline...") - step_start = time.time() - - if progress_callback: - await progress_callback("knowledge_extraction", "正在知识抽取...") - - neo4j_connector = Neo4jConnector() - try: - legacy_orchestrator = ExtractionOrchestrator( - llm_client=llm_client, - embedder_client=embedder_client, - connector=neo4j_connector, - config=get_pipeline_config(memory_config), - progress_callback=progress_callback, - embedding_id=str(memory_config.embedding_model_id), - language=language, - ontology_types=ontology_types, - ) - extraction_result = await legacy_orchestrator.run( - dialog_data_list=chunked_dialogs, - is_pilot_run=True, - ) - ( - _dialogue_nodes, - chunk_nodes, - _statement_nodes, - entity_nodes, - _perceptual_nodes, - 
_statement_chunk_edges, - statement_entity_edges, - entity_edges, - _perceptual_edges, - _last_created_at, - ) = extraction_result - dialog_data_list = chunked_dialogs - export_entity_nodes = entity_nodes - export_stmt_entity_edges = statement_entity_edges - export_entity_edges = entity_edges - finally: - try: - await neo4j_connector.close() - except Exception: - pass + pilot_result = await PilotWritePipeline( + memory_config=memory_config, + end_user_id=str(memory_config.workspace_id), + language=language, + progress_callback=progress_callback, + ).run(chunked_dialogs) log_time("Extraction Pipeline", time.time() - step_start, log_file) + # ── 步骤 4: 输出结果文件 ──────────────────────────────────────────── if progress_callback: await progress_callback("generating_results", "正在生成结果...") - # 步骤 5: 输出试运行结果文件(保持 /pilot_run 返回契约) + graph = pilot_result.graph settings.ensure_memory_output_dir() export_test_input_doc( - entity_nodes=export_entity_nodes, - statement_entity_edges=export_stmt_entity_edges, - entity_entity_edges=export_entity_edges, + entity_nodes=graph.entity_nodes, + statement_entity_edges=graph.stmt_entity_edges, + entity_entity_edges=graph.entity_entity_edges, ) _save_triplets_from_dialogs( - dialog_data_list=dialog_data_list, + dialog_data_list=pilot_result.dialog_data_list, output_path=settings.get_memory_output_path("extracted_triplets.txt"), ) _write_extracted_result_summary( - chunk_nodes=chunk_nodes, + chunk_nodes=graph.chunk_nodes, pipeline_output_dir=settings.get_memory_output_path(), ) - - logger.info("Pilot run completed: stop after layer-1 dedup (no layer-2 / no Neo4j write)") + logger.info("Pilot run completed: stop after layer-1 dedup (no Neo4j write)") except Exception as e: logger.error(f"Pilot run failed: {e}", exc_info=True) @@ -407,9 +273,6 @@ async def run_pilot_extraction( total_time = time.time() - pipeline_start log_time("TOTAL PILOT RUN TIME", total_time, log_file) - - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") with 
open(log_file, "a", encoding="utf-8") as f: - f.write(f"=== Pilot Run Completed: {timestamp} ===\n\n") - + f.write(f"=== Pilot Run Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n\n") logger.info(f"Pilot run complete. Total time: {total_time:.2f}s")