refactor(memory): redesign metadata extraction as async pipeline step

- Replace extract_user_metadata_task with entity-level extract_metadata_batch_task
- Add MetadataExtractionStep following ExtractionStep pattern with Jinja2 prompts
- Flatten MetadataExtractionResponse to 9-field schema (aliases, core_facts, etc.)
- Add Cypher queries for incremental metadata writeback and alias edge redirection
- Wire _extract_metadata into WritePipeline as Step 3.6 (fire-and-forget)
- Add pilot_write() to MemoryService; refactor pilot_run_service to use it
- Extract snapshot logic into WriteSnapshotRecorder
lanceyq
2026-04-29 18:16:24 +08:00
parent 4af9b02815
commit d66d601e41
23 changed files with 1437 additions and 819 deletions

View File

@@ -16,7 +16,9 @@ import logging
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional
if TYPE_CHECKING:
from app.core.memory.pipelines.pilot_write_pipeline import PilotWriteResult
from app.core.memory.pipelines.write_pipeline import WriteResult
from app.core.memory.models.message_models import DialogData
from app.schemas.memory_config_schema import MemoryConfig
logger = logging.getLogger(__name__)
@@ -83,6 +85,34 @@ class MemoryService:
is_pilot_run=is_pilot_run,
)
async def pilot_write(
self,
chunked_dialogs: List[DialogData],
language: str = "zh",
progress_callback: Optional[
Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
] = None,
) -> PilotWriteResult:
"""试运行写入:只执行萃取链路,不写入 Neo4j
Args:
chunked_dialogs: 预处理 + 分块后的 DialogData 列表
language: 语言 ("zh" | "en")
progress_callback: 可选的进度回调
Returns:
PilotWriteResult,包含萃取结果、图构建结果和去重结果
"""
from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline
pipeline = PilotWritePipeline(
memory_config=self.memory_config,
end_user_id=self.end_user_id,
language=language,
progress_callback=progress_callback,
)
return await pipeline.run(chunked_dialogs)
async def read(
self, query: str, history: list, search_switch: str
) -> dict:
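
A minimal usage sketch for the new facade method — the service instance and pre-chunked `DialogData` list are assumed, and `log_progress` is an illustrative callback matching the declared signature:

```python
from typing import Any, Dict, Optional

async def log_progress(stage: str, status: str, meta: Optional[Dict[str, Any]]) -> None:
    # Matches the progress_callback signature declared on pilot_write.
    print(f"[{stage}] {status} {meta or {}}")

async def run_pilot(service, chunked_dialogs):
    # service: a constructed MemoryService; chunked_dialogs: List[DialogData].
    result = await service.pilot_write(
        chunked_dialogs=chunked_dialogs,
        language="zh",
        progress_callback=log_progress,
    )
    return result  # PilotWriteResult: extraction + graph build + dedup outputs
```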

View File

@@ -60,8 +60,6 @@ from app.core.memory.models.triplet_models import (
# User metadata models
from app.core.memory.models.metadata_models import (
UserMetadata,
UserMetadataProfile,
MetadataExtractionResponse,
MetadataFieldChange,
)
@@ -132,8 +130,6 @@ __all__ = [
"Entity",
"Triplet",
"TripletExtractionResponse",
"UserMetadata",
"UserMetadataProfile",
"MetadataExtractionResponse",
"MetadataFieldChange",
# Ontology models

View File

@@ -464,6 +464,16 @@ class ExtractedEntityNode(Node):
description="Whether this entity represents explicit/semantic memory (knowledge, concepts, definitions, theories, principles)"
)
# User Metadata Fields (populated by async metadata extraction after dedup)
core_facts: List[str] = Field(default_factory=list, description="Stable basic facts about the user")
traits: List[str] = Field(default_factory=list, description="Stable personality traits or behavioral tendencies")
relations: List[str] = Field(default_factory=list, description="Durable relationships with people/groups/entities")
goals: List[str] = Field(default_factory=list, description="Long-term goals or ongoing pursuits")
interests: List[str] = Field(default_factory=list, description="Stable interests, preferences, or hobbies")
beliefs_or_stances: List[str] = Field(default_factory=list, description="Stable beliefs, values, or stances")
anchors: List[str] = Field(default_factory=list, description="Personally meaningful objects or symbols")
events: List[str] = Field(default_factory=list, description="Durable personal experiences or milestones")
@field_validator('aliases', mode='before')
@classmethod
def validate_aliases_field(cls, v): # 字段验证器 自动清理和验证 aliases 字段

View File

@@ -2,6 +2,9 @@
Independent from triplet_models.py - these models are used by the
standalone metadata extraction pipeline (post-dedup async Celery task).
The field definitions align with the Jinja2 prompt template
``extract_user_metadata.jinja2``.
"""
from typing import ClassVar, List, Literal, Optional
@@ -9,55 +12,69 @@ from typing import List, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field
class UserMetadataProfile(BaseModel):
"""用户画像信息"""
class MetadataExtractionResponse(BaseModel):
"""LLM 元数据提取响应结构。
字段与 extract_user_metadata.jinja2 模板的输出 JSON 一一对应。
每个字段都是字符串数组,表示本次新增的元数据条目。
"""
model_config = ConfigDict(extra="ignore")
role: List[str] = Field(default_factory=list, description="用户职业或角色")
domain: List[str] = Field(default_factory=list, description="用户所在领域")
expertise: List[str] = Field(
default_factory=list, description="用户擅长的技能或工具"
aliases: List[str] = Field(
default_factory=list,
description="用户别名、昵称、称呼",
)
core_facts: List[str] = Field(
default_factory=list,
description="用户稳定的基础事实(身份、年龄、国籍、所在地等)",
)
traits: List[str] = Field(
default_factory=list,
description="用户稳定的人格特质、风格、行为倾向",
)
relations: List[str] = Field(
default_factory=list,
description="用户与他人/群体/宠物/重要对象之间的长期关系",
)
goals: List[str] = Field(
default_factory=list,
description="用户明确、稳定的长期目标或计划",
)
interests: List[str] = Field(
default_factory=list, description="用户关注的话题或领域标签"
)
class UserMetadata(BaseModel):
"""用户元数据顶层结构"""
model_config = ConfigDict(extra="ignore")
profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile)
class MetadataFieldChange(BaseModel):
"""单个元数据字段的变更操作"""
model_config = ConfigDict(extra="ignore")
field_path: str = Field(
description="字段路径,用点号分隔,如 'profile.role''profile.expertise'"
)
action: Literal["set", "remove"] = Field(
description="操作类型:'set' 表示新增或修改,'remove' 表示移除"
)
value: Optional[str] = Field(
default=None,
description="字段的新值action='set' 时必填)。标量字段直接填值,列表字段填单个要新增的元素"
)
class MetadataExtractionResponse(BaseModel):
"""元数据提取 LLM 响应结构(增量模式)"""
model_config = ConfigDict(extra="ignore")
metadata_changes: List[MetadataFieldChange] = Field(
default_factory=list,
description="元数据的增量变更列表,每项描述一个字段的新增、修改或移除操作",
description="用户稳定的兴趣、偏好、长期爱好",
)
aliases_to_add: List[str] = Field(
beliefs_or_stances: List[str] = Field(
default_factory=list,
description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)",
description="用户稳定的信念、价值立场",
)
aliases_to_remove: List[str] = Field(
default_factory=list, description="用户明确否认的别名(如'我不叫XX了'"
anchors: List[str] = Field(
default_factory=list,
description="对用户有长期意义的物品、收藏、纪念物",
)
events: List[str] = Field(
default_factory=list,
description="对用户画像有长期价值的个人经历、事件、里程碑",
)
# ── 便捷属性 ──
METADATA_FIELDS: ClassVar[List[str]] = [
"core_facts", "traits", "relations", "goals",
"interests", "beliefs_or_stances", "anchors", "events",
]
def has_any_metadata(self) -> bool:
"""是否提取到了任何元数据(不含 aliases"""
return any(
bool(getattr(self, field, []))
for field in self.METADATA_FIELDS
)
def to_metadata_dict(self) -> dict:
"""返回 8 个元数据字段的字典(不含 aliases用于 Neo4j 回写。"""
return {
field: getattr(self, field, [])
for field in self.METADATA_FIELDS
}
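
A short sketch of how the flattened response is consumed downstream (field values are illustrative):

```python
from app.core.memory.models.metadata_models import MetadataExtractionResponse

resp = MetadataExtractionResponse(
    aliases=["小张"],
    core_facts=["常驻上海"],
)
if resp.has_any_metadata():           # True: core_facts is non-empty; aliases ignored
    params = resp.to_metadata_dict()  # the 8 writeback fields, aliases excluded
    assert "aliases" not in params and params["traits"] == []
```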

View File

@@ -1,17 +1,20 @@
"""PilotWritePipeline — 试运行专用萃取流水线。
职责边界:
- 只执行“萃取相关”链路statement -> triplet -> graph_build -> 第一层去重消歧
- 只执行"萃取相关"链路statement -> triplet -> graph_build -> 第一层去重消歧
- 不负责 Neo4j 写入、聚类、摘要、缓存更新
- 自行管理客户端初始化和本体类型加载(与 WritePipeline 对齐)
依赖方向Facade → Pipeline → Engine → Repository单向不允许反向调用
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any, Awaitable, Callable, Dict, List, Optional
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional
from app.core.memory.models.message_models import DialogData
from app.core.memory.models.variate_config import ExtractionPipelineConfig
from app.core.memory.storage_services.extraction_engine.steps.dedup_step import (
DedupResult,
run_dedup,
@@ -24,6 +27,11 @@ from app.core.memory.storage_services.extraction_engine.steps.graph_build_step i
build_graph_nodes_and_edges,
)
if TYPE_CHECKING:
from app.schemas.memory_config_schema import MemoryConfig
logger = logging.getLogger(__name__)
@dataclass
class PilotWriteResult:
@@ -46,36 +54,54 @@ class PilotWriteResult:
class PilotWritePipeline:
"""重构后试运行专用流水线。"""
"""重构后试运行专用流水线。
构造函数只接收 memory_config,客户端初始化和本体加载在 run() 内部完成,
与 WritePipeline 保持一致的生命周期管理模式。
"""
def __init__(
self,
llm_client: Any,
embedder_client: Any,
pipeline_config: ExtractionPipelineConfig,
embedding_id: Optional[str],
memory_config: MemoryConfig,
end_user_id: str,
language: str = "zh",
ontology_types: Any = None,
progress_callback: Optional[
Callable[[str, str, Optional[Dict[str, Any]]], Awaitable[None]]
] = None,
) -> None:
self.llm_client = llm_client
self.embedder_client = embedder_client
self.pipeline_config = pipeline_config
self.embedding_id = embedding_id
"""
Args:
memory_config: 不可变的记忆配置对象(从数据库加载)
end_user_id: 终端用户 ID
language: 语言 ("zh" | "en")
progress_callback: 可选的进度回调
"""
self.memory_config = memory_config
self.end_user_id = end_user_id
self.language = language
self.ontology_types = ontology_types
self.progress_callback = progress_callback
# 延迟初始化的客户端
self._llm_client = None
self._embedder_client = None
async def run(self, dialog_data_list: List[DialogData]) -> PilotWriteResult:
"""执行试运行萃取链路。"""
"""执行试运行萃取链路。
内部完成客户端初始化 → 本体加载 → 萃取 → 图构建 → 去重。
"""
from app.core.memory.utils.config.config_utils import get_pipeline_config
self._init_clients()
pipeline_config = get_pipeline_config(self.memory_config)
ontology_types = self._load_ontology_types()
orchestrator = NewExtractionOrchestrator(
llm_client=self.llm_client,
embedder_client=self.embedder_client,
config=self.pipeline_config,
embedding_id=self.embedding_id,
ontology_types=self.ontology_types,
llm_client=self._llm_client,
embedder_client=self._embedder_client,
config=pipeline_config,
embedding_id=str(self.memory_config.embedding_model_id),
ontology_types=ontology_types,
language=self.language,
is_pilot_run=True,
progress_callback=self.progress_callback,
@@ -84,7 +110,7 @@ class PilotWritePipeline:
graph = await build_graph_nodes_and_edges(
dialog_data_list=extracted_dialogs,
embedder_client=self.embedder_client,
embedder_client=self._embedder_client,
progress_callback=self.progress_callback,
)
@@ -93,9 +119,9 @@ class PilotWritePipeline:
statement_entity_edges=graph.stmt_entity_edges,
entity_entity_edges=graph.entity_entity_edges,
dialog_data_list=extracted_dialogs,
pipeline_config=self.pipeline_config,
pipeline_config=pipeline_config,
connector=None, # pilot: no layer-2 db dedup
llm_client=self.llm_client,
llm_client=self._llm_client,
is_pilot_run=True,
progress_callback=self.progress_callback,
)
@@ -106,3 +132,50 @@ class PilotWritePipeline:
dedup=dedup,
)
# ──────────────────────────────────────────────
# 辅助方法
# ──────────────────────────────────────────────
def _init_clients(self) -> None:
"""从 MemoryConfig 构建 LLM 和 Embedding 客户端。"""
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.db import get_db_context
with get_db_context() as db:
factory = MemoryClientFactory(db)
self._llm_client = factory.get_llm_client_from_config(self.memory_config)
self._embedder_client = factory.get_embedder_client_from_config(
self.memory_config
)
logger.info("Pilot pipeline: LLM and embedding clients constructed")
def _load_ontology_types(self):
"""加载本体类型配置(如果配置了 scene_id"""
if not self.memory_config.scene_id:
return None
try:
from app.core.memory.ontology_services.ontology_type_loader import (
load_ontology_types_for_scene,
)
from app.db import get_db_context
with get_db_context() as db:
ontology_types = load_ontology_types_for_scene(
scene_id=self.memory_config.scene_id,
workspace_id=self.memory_config.workspace_id,
db=db,
)
if ontology_types:
logger.info(
f"Loaded {len(ontology_types.types)} ontology types "
f"for scene_id: {self.memory_config.scene_id}"
)
return ontology_types
except Exception as e:
logger.warning(
f"Failed to load ontology types for scene_id "
f"{self.memory_config.scene_id}: {e}",
exc_info=True,
)
return None
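
For contrast with the old constructor (which required pre-built clients), a sketch of the new lifecycle; `memory_config` and `dialogs` are assumed inputs:

```python
from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline

async def pilot(memory_config, end_user_id: str, dialogs):
    # Clients, pipeline_config and ontology types are now resolved inside run().
    pipeline = PilotWritePipeline(
        memory_config=memory_config,
        end_user_id=end_user_id,
        language="zh",
    )
    return await pipeline.run(dialogs)
```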

View File

@@ -186,9 +186,12 @@ class WritePipeline:
self._init_clients()
self._init_neo4j_connector()
# 初始化 Snapshot(提前创建,供预处理阶段的剪枝使用)
from app.core.memory.utils.debug.pipeline_snapshot import PipelineSnapshot
self._snapshot = PipelineSnapshot("new")
# 初始化快照记录器(提前创建,供预处理阶段的剪枝使用)
from app.core.memory.utils.debug.write_snapshot_recorder import (
WriteSnapshotRecorder,
)
self._recorder = WriteSnapshotRecorder("new")
# Step 1: 预处理 - 消息分块 + AI消息语义剪枝
async with bear.step(1, 5, "预处理", "消息分块") as s:
@@ -197,7 +200,9 @@ class WritePipeline:
# Step 2: 萃取 - 知识提取
async with bear.step(2, 5, "萃取", "知识提取") as s:
extraction_result = await self._extract(chunked_dialogs, is_pilot_run)
extraction_result = await self._extract(
chunked_dialogs, is_pilot_run
)
stats = extraction_result.stats
s.metadata(
entities=stats["entity_count"],
@@ -224,6 +229,9 @@ class WritePipeline:
# Step 3.5: 异步情绪提取(fire-and-forget,需在 _store 之后,确保 Statement 节点已存在)
await self._extract_emotion(getattr(self, "_emotion_statements", []))
# Step 3.6: 异步元数据提取(fire-and-forget,需在 _store 之后,确保 Entity 节点已存在)
await self._extract_metadata(extraction_result)
# Step 4: 聚类 - 增量更新社区(异步,不阻塞)
async with bear.step(4, 5, "聚类", "增量更新社区") as s:
await self._cluster(extraction_result)
@@ -264,7 +272,8 @@ class WritePipeline:
"""
from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs
snapshot = getattr(self, "_snapshot", None)
recorder = getattr(self, "_recorder", None)
snapshot = recorder.snapshot if recorder else None
return await get_chunked_dialogs(
chunker_strategy=self.memory_config.chunker_strategy,
@@ -308,14 +317,16 @@ class WritePipeline:
)
from app.core.memory.utils.config.config_utils import get_pipeline_config
from app.core.memory.utils.debug.pipeline_snapshot import PipelineSnapshot
from app.core.memory.utils.debug.write_snapshot_recorder import (
WriteSnapshotRecorder,
)
pipeline_config = get_pipeline_config(self.memory_config)
ontology_types = self._load_ontology_types()
# 复用 run() 中已创建的 snapshot(剪枝阶段已使用同一实例)
snapshot = getattr(self, "_snapshot", None) or PipelineSnapshot("new")
self._snapshot = snapshot
# 复用 run() 中已创建的 recorder(剪枝阶段已使用同一实例)
recorder = getattr(self, "_recorder", None) or WriteSnapshotRecorder("new")
self._recorder = recorder
# ── 新编排器:LLM 萃取 + 数据赋值 ──
new_orchestrator = NewExtractionOrchestrator(
@@ -335,52 +346,8 @@ class WritePipeline:
# 注意:实际 dispatch 在 _store 之后,确保 Statement 节点已写入 Neo4j
self._emotion_statements = new_orchestrator.emotion_statements
# ── Snapshot: 各阶段萃取结果 ── TODO 乐力齐 重构流水线切换生产环境稳定后修改
stage_outputs = new_orchestrator.last_stage_outputs
if stage_outputs:
stmt_results = stage_outputs.get("statement_results", {})
stmt_snapshot = []
for _did, chunk_stmts in stmt_results.items():
for _cid, stmts in chunk_stmts.items():
for s in stmts:
stmt_snapshot.append(s.model_dump())
snapshot.save_stage("2_statement_outputs", stmt_snapshot)
triplet_results = stage_outputs.get("triplet_results", {})
triplet_snapshot = {}
for _did, stmt_triplets in triplet_results.items():
for stmt_id, t_out in stmt_triplets.items():
triplet_snapshot[stmt_id] = t_out.model_dump()
snapshot.save_stage("3_triplet_outputs", triplet_snapshot)
emotion_results = stage_outputs.get("emotion_results", {})
emotion_snapshot = {}
for stmt_id, emo in emotion_results.items():
if hasattr(emo, "model_dump"):
emotion_snapshot[stmt_id] = emo.model_dump()
snapshot.save_stage("4_emotion_outputs", emotion_snapshot)
emb_output = stage_outputs.get("embedding_output")
if emb_output and hasattr(emb_output, "model_dump"):
emb_data = emb_output.model_dump()
for key in (
"statement_embeddings",
"chunk_embeddings",
"entity_embeddings",
):
if key in emb_data and isinstance(emb_data[key], dict):
emb_data[key] = {
k: v[:5] if isinstance(v, list) else v
for k, v in emb_data[key].items()
}
if "dialog_embeddings" in emb_data and isinstance(
emb_data["dialog_embeddings"], list
):
emb_data["dialog_embeddings"] = [
v[:5] if isinstance(v, list) else v
for v in emb_data["dialog_embeddings"]
]
snapshot.save_stage("5_embedding_outputs", emb_data)
# ── Snapshot: 各阶段萃取结果 ──
recorder.record_stage_outputs(new_orchestrator.last_stage_outputs)
# step2: 构建图节点和边
graph = await build_graph_nodes_and_edges(
@@ -389,34 +356,8 @@ class WritePipeline:
progress_callback=self.progress_callback,
)
# region Snapshot: 图节点和边(去重前),Snapshot 有关的内容在重构流水线切换生产环境之后修改
snapshot.save_stage(
"6_nodes_edges_before_dedup",
{
"dialogue_nodes_count": len(graph.dialogue_nodes),
"chunk_nodes_count": len(graph.chunk_nodes),
"statement_nodes_count": len(graph.statement_nodes),
"entity_nodes": [
{
"id": e.id,
"name": e.name,
"entity_type": e.entity_type,
"description": e.description,
}
for e in graph.entity_nodes
],
"entity_entity_edges": [
{
"source": e.source,
"target": e.target,
"relation_type": e.relation_type,
"statement": e.statement,
}
for e in graph.entity_entity_edges
],
"stmt_entity_edges_count": len(graph.stmt_entity_edges),
},
)
# Snapshot: 图节点和边(去重前)
recorder.record_graph_before_dedup(graph)
# step3: 两阶段去重消歧
dedup_result = await run_dedup(
@@ -432,29 +373,7 @@ class WritePipeline:
)
# Snapshot: 去重后
snapshot.save_stage(
"7_after_dedup",
{
"entity_nodes": [
{
"id": e.id,
"name": e.name,
"entity_type": e.entity_type,
"description": e.description,
}
for e in dedup_result.entity_nodes
],
"entity_entity_edges": [
{
"source": e.source,
"target": e.target,
"relation_type": e.relation_type,
"statement": e.statement,
}
for e in dedup_result.entity_entity_edges
],
},
)
recorder.record_dedup_result(dedup_result)
# step4: 构造最终结果
result = ExtractionResult(
@@ -474,7 +393,7 @@ class WritePipeline:
dialog_data_list=dialog_data_list,
)
snapshot.save_summary(result.stats) # TODO 乐力齐 snapshot需要改
recorder.record_summary(result.stats)
return result
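
The inline snapshot blocks removed above collapse into four recorder calls; the interface implied by these call sites (the real class lives in write_snapshot_recorder.py and is not shown in this diff) is roughly:

```python
# Sketch reconstructed from the call sites in this diff, not the actual file.
class WriteSnapshotRecorder:
    enabled: bool                 # gates snapshot_dir propagation to workers
    snapshot_dir: str | None
    snapshot: object              # underlying PipelineSnapshot, reused by pruning

    def __init__(self, pipeline_name: str) -> None: ...
    def record_stage_outputs(self, stage_outputs: dict) -> None: ...
    def record_graph_before_dedup(self, graph) -> None: ...
    def record_dedup_result(self, dedup_result) -> None: ...
    def record_summary(self, stats: dict) -> None: ...
```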
# ──────────────────────────────────────────────
@@ -551,7 +470,10 @@ class WritePipeline:
同时在内存中同步更新 ExtractionResult.entity_nodes,保持内存与 Neo4j 一致。
失败不中断主流程。
"""
from app.repositories.neo4j.cypher_queries import MERGE_ALIAS_BELONGS_TO
from app.repositories.neo4j.cypher_queries import (
MERGE_ALIAS_BELONGS_TO,
REDIRECT_ALIAS_EDGES,
)
ALIAS_PREDICATE = "别名属于"
@@ -571,12 +493,17 @@ class WritePipeline:
# ── 1. 在内存中同步更新 entity_nodes ──
entity_map = {e.id: e for e in result.entity_nodes}
# 构建 alias_id → target_id 映射(别名节点 → 用户节点)
alias_to_target: dict[str, str] = {}
for edge in alias_edges:
source_node = entity_map.get(edge.source)
target_node = entity_map.get(edge.target)
if not source_node or not target_node:
continue
alias_to_target[edge.source] = edge.target
# 将 source.name 追加到 target.aliases(去重,忽略大小写)
source_name = (source_node.name or "").strip()
if source_name:
@@ -595,11 +522,36 @@ class WritePipeline:
f"{tgt_desc}{src_desc}" if tgt_desc else src_desc
)
# ── 1.1 内存中重定向指向别名节点的边到用户节点 ──
alias_ids = set(alias_to_target.keys())
redirected_ee_count = 0
redirected_se_count = 0
# 重定向 entity_entity_edges(排除"别名属于"边本身)
for edge in result.entity_entity_edges:
rel_type = getattr(edge, "relation_type", "")
if rel_type == ALIAS_PREDICATE:
continue
if edge.source in alias_ids:
edge.source = alias_to_target[edge.source]
redirected_ee_count += 1
if edge.target in alias_ids:
edge.target = alias_to_target[edge.target]
redirected_ee_count += 1
# 重定向 stmt_entity_edges(陈述句 → 实体边)
for edge in result.stmt_entity_edges:
if edge.target in alias_ids:
edge.target = alias_to_target[edge.target]
redirected_se_count += 1
logger.info(
f"[AliasMerge] 内存同步完成,处理 {len(alias_edges)}'别名属于'"
f"[AliasMerge] 内存同步完成,处理 {len(alias_edges)}'别名属于'"
f"重定向 entity_entity 边 {redirected_ee_count} 次,"
f"重定向 stmt_entity 边 {redirected_se_count}"
)
# ── 2. 写入 Neo4j ──
# ── 2. 写入 Neo4j:别名属性归并 ──
records = await self._neo4j_connector.execute_query(
MERGE_ALIAS_BELONGS_TO,
end_user_id=self.end_user_id,
@@ -607,6 +559,16 @@ class WritePipeline:
merged_count = len(records) if records else 0
logger.info(f"[AliasMerge] Neo4j 别名归并完成,影响 {merged_count} 条记录")
# ── 3. 写入 Neo4j:重定向指向别名节点的边到用户节点 ──
redirect_records = await self._neo4j_connector.execute_query(
REDIRECT_ALIAS_EDGES,
end_user_id=self.end_user_id,
)
redirect_count = len(redirect_records) if redirect_records else 0
logger.info(
f"[AliasMerge] Neo4j 边重定向完成,影响 {redirect_count} 条记录"
)
except Exception as e:
logger.warning(
f"[AliasMerge] 别名归并失败(不影响主流程): {e}", exc_info=True
@@ -691,10 +653,10 @@ class WritePipeline:
return
# 快照目录:仅在 PIPELINE_SNAPSHOT_ENABLED=true 时非空,供 worker 端落盘
snapshot = getattr(self, "_snapshot", None)
recorder = getattr(self, "_recorder", None)
snapshot_dir = (
snapshot.directory
if snapshot is not None and getattr(snapshot, "enabled", False)
recorder.snapshot_dir
if recorder is not None and recorder.enabled
else None
)
@@ -723,6 +685,67 @@ class WritePipeline:
exc_info=True,
)
# ──────────────────────────────────────────────
# Step 3.6: 异步元数据提取
# fire-and-forget 提交 Celery 任务,不阻塞主流程
# ──────────────────────────────────────────────
async def _extract_metadata(self, result: ExtractionResult) -> None:
"""提交异步元数据提取 Celery 任务。
从去重后的用户实体 description 中提取结构化元数据,
异步回写到 Neo4j ExtractedEntity 节点。失败不影响主流程。
"""
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
collect_user_entities_for_metadata,
)
user_entities = collect_user_entities_for_metadata(result.entity_nodes)
if not user_entities:
return
llm_model_id = (
str(self.memory_config.llm_model_id)
if self.memory_config.llm_model_id
else None
)
if not llm_model_id:
logger.warning("[Metadata] 无法提交元数据提取任务llm_model_id 为空")
return
# 快照目录
recorder = getattr(self, "_recorder", None)
snapshot_dir = (
recorder.snapshot_dir
if recorder is not None and recorder.enabled
else None
)
try:
from app.celery_app import celery_app
task_result = celery_app.send_task(
"app.tasks.extract_metadata_batch",
kwargs={
"user_entities": user_entities,
"llm_model_id": llm_model_id,
"language": self.language,
"snapshot_dir": snapshot_dir,
},
)
logger.info(
f"[Metadata] 异步元数据提取任务已提交 - "
f"task_id = {task_result.id}, "
f"entity_count = {len(user_entities)}, "
f"snapshot_dir = {snapshot_dir}"
)
except Exception as e:
logger.error(
f"[Metadata] 提交元数据提取任务失败(不影响主流程): {e}",
exc_info=True,
)
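
The worker-side task is not part of this diff; a hypothetical skeleton matching the send_task call above would look like:

```python
# Hypothetical consumer; only the task name and kwargs come from this diff.
from app.celery_app import celery_app

@celery_app.task(name="app.tasks.extract_metadata_batch")
def extract_metadata_batch(user_entities, llm_model_id, language, snapshot_dir=None):
    # Per entity: read existing metadata (ENTITY_METADATA_QUERY), run
    # MetadataExtractionStep, then append via ENTITY_METADATA_UPDATE.
    ...
```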
# ──────────────────────────────────────────────
# Step 5: 摘要
# + entity_description + meta_data 部分在此提取

View File

@@ -117,12 +117,18 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):
except Exception:
pass
# 描述与事实摘要(保留更长者)
# 描述合并(去重拼接,分号分隔)
try:
desc_a = getattr(canonical, "description", "") or ""
desc_b = getattr(ent, "description", "") or ""
if len(desc_b) > len(desc_a):
canonical.description = desc_b
desc_a = (getattr(canonical, "description", "") or "").strip()
desc_b = (getattr(ent, "description", "") or "").strip()
if desc_b and desc_b != desc_a:
if desc_a:
# 将已有 description 按分号拆分,检查新 description 是否已存在
existing_parts = {p.strip() for p in desc_a.replace("；", ";").split(";") if p.strip()}
if desc_b not in existing_parts:
canonical.description = f"{desc_a};{desc_b}"
else:
canonical.description = desc_b
# 合并事实摘要:统一保留一个“实体: name”行,来源行去重、保序
# TODO: fact_summary 功能暂时禁用,待后续开发完善后启用
# fact_a = getattr(canonical, "fact_summary", "") or ""
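
A worked example of the new merge behaviour (standalone re-implementation for illustration only):

```python
def merge_desc(desc_a: str, desc_b: str) -> str:
    # Mirrors the semicolon-dedup logic above.
    desc_a, desc_b = (desc_a or "").strip(), (desc_b or "").strip()
    if not desc_b or desc_b == desc_a:
        return desc_a
    if not desc_a:
        return desc_b
    parts = {p.strip() for p in desc_a.replace("；", ";").split(";") if p.strip()}
    return desc_a if desc_b in parts else f"{desc_a};{desc_b}"

assert merge_desc("后端工程师", "后端工程师") == "后端工程师"      # exact duplicate dropped
assert merge_desc("后端工程师;养猫", "养猫") == "后端工程师;养猫"   # existing part dropped
assert merge_desc("后端工程师", "养猫") == "后端工程师;养猫"        # new part appended
```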

View File

@@ -311,53 +311,8 @@ class ExtractionOrchestrator:
dialog_data_list,
)
# 步骤 7: 触发异步元数据和别名提取(仅正式模式)
if not is_pilot_run:
try:
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
MetadataExtractor,
)
metadata_extractor = MetadataExtractor(
llm_client=self.llm_client, language=self.language
)
user_statements = (
metadata_extractor.collect_user_related_statements(
entity_nodes, statement_nodes, statement_entity_edges
)
)
if user_statements:
end_user_id = (
dialog_data_list[0].end_user_id
if dialog_data_list
else None
)
config_id = (
dialog_data_list[0].config_id
if dialog_data_list
and hasattr(dialog_data_list[0], "config_id")
else None
)
if end_user_id:
from app.tasks import extract_user_metadata_task
extract_user_metadata_task.delay(
end_user_id=str(end_user_id),
statements=user_statements,
config_id=str(config_id) if config_id else None,
language=self.language,
)
logger.info(
f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement"
)
else:
logger.info("未找到用户相关 statement跳过元数据提取")
except Exception as e:
logger.error(
f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True
)
# 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行
# 步骤 7: 元数据提取已迁移到新流水线(WritePipeline._extract_metadata),
# 旧编排器不再触发异步元数据提取任务。
logger.info(f"知识提取流水线运行完成({mode_str}")
return (

View File

@@ -25,7 +25,7 @@ from app.core.memory.models.variate_config import ExtractionPipelineConfig
from .steps.base import ExtractionStep, StepContext
from .steps.embedding_step import EmbeddingStep
from .steps.sidecar_factory import SidecarStepFactory, SidecarTiming
from .sidecar_factory import SidecarStepFactory, SidecarTiming
from .steps.statement_temporal_step import StatementTemporalExtractionStep
from .steps.triplet_step import TripletExtractionStep
from .steps.schema import (

View File

@@ -142,7 +142,7 @@ async def generate_title_and_type_for_summary(
f"已归一化为 '{episodic_type}'"
)
logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
logger.debug(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
return (title, episodic_type)
except json.JSONDecodeError:
@@ -197,7 +197,7 @@ async def _process_chunk_summary(
llm_client=llm_client,
language=language
)
logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
logger.debug(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
except Exception as e:
logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
# Continue without title and type

View File

@@ -1,176 +1,69 @@
"""
Metadata extractor module.
Metadata extractor utilities.
Collects user-related statements from post-dedup graph data and
extracts user metadata via an independent LLM call.
Provides helper functions for identifying user entities from post-dedup
graph data. The actual LLM extraction logic lives in MetadataExtractionStep.
"""
import logging
from typing import List, Optional
from typing import Dict, List
from app.core.memory.models.graph_models import (
ExtractedEntityNode,
StatementEntityEdge,
StatementNode,
)
from app.core.memory.models.graph_models import ExtractedEntityNode
logger = logging.getLogger(__name__)
# Reuse the same user-entity detection logic from dedup module
_USER_NAMES = {"用户", "我", "user", "i"}
_CANONICAL_USER_TYPE = "用户"
# 用户实体判定常量
USER_NAMES = {"用户", "我", "user", "i"}
CANONICAL_USER_TYPE = "用户"
def _is_user_entity(ent: ExtractedEntityNode) -> bool:
"""判断实体是否为用户实体"""
name = (getattr(ent, "name", "") or "").strip().lower()
etype = (getattr(ent, "entity_type", "") or "").strip()
return name in _USER_NAMES or etype == _CANONICAL_USER_TYPE
def is_user_entity(entity: ExtractedEntityNode) -> bool:
"""判断实体是否为用户实体"""
name = (getattr(entity, "name", "") or "").strip().lower()
etype = (getattr(entity, "entity_type", "") or "").strip()
return name in USER_NAMES or etype == CANONICAL_USER_TYPE
class MetadataExtractor:
"""Extracts user metadata from post-dedup graph data via independent LLM call."""
def collect_user_entities_for_metadata(
entity_nodes: List[ExtractedEntityNode],
) -> List[Dict]:
"""从去重后的实体列表中筛选用户实体,构造元数据提取的输入。
def __init__(self, llm_client, language: Optional[str] = None):
self.llm_client = llm_client
self.language = language
将每个用户实体的 description 按分号拆分为列表,
作为 Celery 异步元数据提取任务的输入。
@staticmethod
def detect_language(statements: List[str]) -> str:
"""根据 statement 文本内容检测语言。
如果文本中包含中文字符则返回 "zh",否则返回 "en"
"""
import re
Args:
entity_nodes: 去重后的实体节点列表
combined = " ".join(statements)
if re.search(r"[\u4e00-\u9fff]", combined):
return "zh"
return "en"
Returns:
用户实体字典列表,每项包含 entity_id、entity_name、descriptions
"""
user_entities = []
for entity in entity_nodes:
if not is_user_entity(entity):
continue
def collect_user_related_statements(
self,
entity_nodes: List[ExtractedEntityNode],
statement_nodes: List[StatementNode],
statement_entity_edges: List[StatementEntityEdge],
) -> List[str]:
"""
从去重后的数据中筛选与用户直接相关且由用户发言的 statement 文本。
desc = (getattr(entity, "description", "") or "").strip()
if not desc:
continue
筛选逻辑:
1. 用户实体 → StatementEntityEdge → statement(直接关联)
2. 只保留 speaker="user" 的 statement(过滤 assistant 回复的噪声)
Returns:
用户发言的 statement 文本列表
"""
# Find user entity IDs
user_entity_ids = set()
for ent in entity_nodes:
if _is_user_entity(ent):
user_entity_ids.add(ent.id)
if not user_entity_ids:
logger.debug("未找到用户实体节点,跳过 statement 收集")
return []
# 用户实体 → StatementEntityEdge → statement
target_stmt_ids = set()
for edge in statement_entity_edges:
if edge.target in user_entity_ids:
target_stmt_ids.add(edge.source)
# Collect: only speaker="user" statements, preserving order
result = []
seen = set()
total_associated = 0
skipped_non_user = 0
for stmt_node in statement_nodes:
if stmt_node.id in target_stmt_ids and stmt_node.id not in seen:
total_associated += 1
speaker = getattr(stmt_node, "speaker", None) or "unknown"
if speaker == "user":
text = (stmt_node.statement or "").strip()
if text:
result.append(text)
else:
skipped_non_user += 1
seen.add(stmt_node.id)
# 将分号分隔的 description 拆分为列表
descriptions = [
d.strip() for d in desc.replace("；", ";").split(";")
if d.strip()
]
if descriptions:
user_entities.append({
"entity_id": entity.id,
"entity_name": entity.name,
"descriptions": descriptions,
})
if user_entities:
logger.info(
f"收集到 {len(result)} 用户发言 statement "
f"(直接关联: {total_associated}, speaker=user: {len(result)}, "
f"跳过非user: {skipped_non_user})"
f"收集到 {len(user_entities)} 用户实体用于元数据提取"
)
if result:
for i, text in enumerate(result):
logger.info(f" [user statement {i + 1}] {text}")
if total_associated > 0 and len(result) == 0:
logger.warning(
f"{total_associated} 条直接关联 statement 但全部被 speaker 过滤,"
f"可能本次写入不包含 user 消息"
)
return result
else:
logger.debug("未找到用户实体,跳过元数据提取")
async def extract_metadata(
self,
statements: List[str],
existing_metadata: Optional[dict] = None,
existing_aliases: Optional[List[str]] = None,
) -> Optional[tuple]:
"""
对筛选后的 statement 列表调用 LLM 提取元数据增量变更和用户别名。
Args:
statements: 用户发言的 statement 文本列表
existing_metadata: 数据库已有的元数据(可选)
existing_aliases: 数据库已有的用户别名列表(可选)
Returns:
(List[MetadataFieldChange], List[str], List[str]) tuple:
(metadata_changes, aliases_to_add, aliases_to_remove) on success, None on failure
"""
if not statements:
return None
try:
from app.core.memory.utils.prompt.prompt_utils import prompt_env
if self.language:
detected_language = self.language
logger.info(f"元数据提取使用显式指定语言: {detected_language}")
else:
detected_language = self.detect_language(statements)
logger.info(f"元数据提取语言自动检测结果: {detected_language}")
template = prompt_env.get_template("extract_user_metadata.jinja2")
prompt = template.render(
statements=statements,
language=detected_language,
existing_metadata=existing_metadata,
existing_aliases=existing_aliases,
json_schema="",
)
from app.core.memory.models.metadata_models import (
MetadataExtractionResponse,
)
response = await self.llm_client.response_structured(
messages=[{"role": "user", "content": prompt}],
response_model=MetadataExtractionResponse,
)
if response:
changes = response.metadata_changes if response.metadata_changes else []
to_add = response.aliases_to_add if response.aliases_to_add else []
to_remove = (
response.aliases_to_remove if response.aliases_to_remove else []
)
return changes, to_add, to_remove
logger.warning("LLM 返回的响应为空")
return None
except Exception as e:
logger.error(f"元数据提取 LLM 调用失败: {e}", exc_info=True)
return None
return user_entities
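
A sketch of the collector's input/output contract; a duck-typed stand-in is used here because the function only reads id/name/entity_type/description (real callers pass ExtractedEntityNode):

```python
from types import SimpleNamespace
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
    collect_user_entities_for_metadata,
)

user = SimpleNamespace(id="e1", name="用户", entity_type="用户",
                       description="后端工程师;养了一只猫")
payload = collect_user_entities_for_metadata([user])
# → [{"entity_id": "e1", "entity_name": "用户",
#     "descriptions": ["后端工程师", "养了一只猫"]}]
```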

View File

@@ -51,7 +51,7 @@ class OntologyExtractor:
self.validator = OntologyValidator()
self.owl_validator = OWLValidator()
logger.info("OntologyExtractor initialized")
logger.debug("OntologyExtractor initialized")
async def extract_ontology_classes(
self,

View File

@@ -9,7 +9,7 @@ import logging
from enum import Enum
from typing import Any, Dict, List, Tuple, Type
from .base import ExtractionStep, StepContext
from .steps.base import ExtractionStep, StepContext
logger = logging.getLogger(__name__)

View File

@@ -4,7 +4,7 @@ Importing this package triggers @register decorator self-registration
for all sidecar (non-critical) steps via SidecarStepFactory.
"""
from .sidecar_factory import SidecarStepFactory, SidecarTiming # noqa: F401
from ..sidecar_factory import SidecarStepFactory, SidecarTiming # noqa: F401
# Step implementations — importing triggers @register self-registration.
from .statement_temporal_step import StatementTemporalExtractionStep # noqa: F401

View File

@@ -12,7 +12,7 @@ from app.core.memory.models.emotion_models import EmotionExtraction
from app.core.memory.utils.prompt.prompt_utils import render_emotion_extraction_prompt
from .base import ExtractionStep, StepContext
from .sidecar_factory import SidecarStepFactory, SidecarTiming
from ..sidecar_factory import SidecarStepFactory, SidecarTiming
from .schema import EmotionStepInput, EmotionStepOutput
logger = logging.getLogger(__name__)

View File

@@ -308,6 +308,7 @@ async def build_graph_nodes_and_edges(
object_entity_id = entity_idx_to_id.get(triplet.object_id)
if subject_entity_id and object_entity_id:
_tv = getattr(statement, "temporal_validity", None)
entity_entity_edges.append(
EntityEntityEdge(
source=subject_entity_id,
@@ -320,6 +321,8 @@ async def build_graph_nodes_and_edges(
run_id=dialog_data.run_id,
created_at=dialog_data.created_at,
expired_at=dialog_data.expired_at,
valid_at=_tv.valid_at if _tv else None,
invalid_at=_tv.invalid_at if _tv else None,
)
)

View File

@@ -0,0 +1,89 @@
"""MetadataExtractionStep — 用户实体元数据提取 step。
从用户实体的 description 中提取结构化元数据(core_facts、traits、relations 等),
通过 Celery 异步任务在去重消歧完成后执行,结果回写到 Neo4j ExtractedEntity 节点。
不注册为 SidecarStepFactory 的自动旁路(因为它在去重后异步执行,不在主萃取流程中),
而是由 Celery 任务直接实例化调用。
"""
import json
import logging
from typing import Any
from .base import ExtractionStep, StepContext
from .schema import MetadataStepInput, MetadataStepOutput
logger = logging.getLogger(__name__)
class MetadataExtractionStep(ExtractionStep[MetadataStepInput, MetadataStepOutput]):
"""从用户实体 description 中提取结构化元数据。
非 critical step — 失败返回空默认值,不中断流程。
"""
def __init__(self, context: StepContext) -> None:
super().__init__(context)
@property
def name(self) -> str:
return "metadata_extraction"
@property
def is_critical(self) -> bool:
return False
@property
def max_retries(self) -> int:
return 1
async def render_prompt(self, input_data: MetadataStepInput) -> str:
"""使用 Jinja2 模板渲染元数据提取 prompt。"""
from app.core.memory.utils.prompt.prompt_utils import prompt_env
template = prompt_env.get_template("extract_user_metadata.jinja2")
input_json = json.dumps(
{
"description": input_data.descriptions,
"existing_metadata": input_data.existing_metadata,
},
ensure_ascii=False,
indent=2,
)
return template.render(
language=self.language,
input_json=input_json,
)
async def call_llm(self, prompt: Any) -> Any:
"""调用 LLM 进行结构化输出。"""
from app.core.memory.models.metadata_models import MetadataExtractionResponse
messages = [{"role": "user", "content": prompt}]
return await self.llm_client.response_structured(
messages, MetadataExtractionResponse
)
async def parse_response(
self, raw_response: Any, input_data: MetadataStepInput
) -> MetadataStepOutput:
"""将 LLM 响应解析为 MetadataStepOutput。"""
if raw_response is None:
return self.get_default_output()
return MetadataStepOutput(
core_facts=getattr(raw_response, "core_facts", []) or [],
traits=getattr(raw_response, "traits", []) or [],
relations=getattr(raw_response, "relations", []) or [],
goals=getattr(raw_response, "goals", []) or [],
interests=getattr(raw_response, "interests", []) or [],
beliefs_or_stances=getattr(raw_response, "beliefs_or_stances", []) or [],
anchors=getattr(raw_response, "anchors", []) or [],
events=getattr(raw_response, "events", []) or [],
)
def get_default_output(self) -> MetadataStepOutput:
return MetadataStepOutput()
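
How the Celery task might drive the step directly — a sketch; StepContext construction and the base-class entry point are assumed, and names outside this diff are illustrative:

```python
from app.core.memory.storage_services.extraction_engine.steps.schema import (
    MetadataStepInput,
)

async def extract_for_entity(step, entity: dict, existing: dict):
    # entity: one item from collect_user_entities_for_metadata's output.
    inp = MetadataStepInput(
        entity_id=entity["entity_id"],
        entity_name=entity["entity_name"],
        descriptions=entity["descriptions"],
        existing_metadata=existing,
    )
    prompt = await step.render_prompt(inp)
    raw = await step.call_llm(prompt)
    out = await step.parse_response(raw, inp)  # falls back to empty defaults
    return out if out.has_any() else None
```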

View File

@@ -19,6 +19,8 @@ from .extraction_step_schema import (
from .sidecar_step_schema import (
EmotionStepInput,
EmotionStepOutput,
MetadataStepInput,
MetadataStepOutput,
)
__all__ = [
@@ -39,4 +41,7 @@ __all__ = [
# Sidecar — Emotion
"EmotionStepInput",
"EmotionStepOutput",
# Sidecar — Metadata
"MetadataStepInput",
"MetadataStepOutput",
]

View File

@@ -24,3 +24,39 @@ class EmotionStepOutput(BaseModel):
emotion_type: str = "neutral"
emotion_intensity: float = 0.0
emotion_keywords: List[str] = Field(default_factory=list)
# ── Metadata extraction (async post-dedup) ──
class MetadataStepInput(BaseModel):
"""Input for MetadataExtractionStep."""
entity_id: str
entity_name: str
descriptions: List[str] = Field(
default_factory=list,
description="用户实体的 description 列表(可能由分号分隔拆分而来)",
)
existing_metadata: dict = Field(
default_factory=dict,
description="Neo4j 中已有的元数据,用于增量去重",
)
class MetadataStepOutput(BaseModel):
"""Output of MetadataExtractionStep."""
core_facts: List[str] = Field(default_factory=list)
traits: List[str] = Field(default_factory=list)
relations: List[str] = Field(default_factory=list)
goals: List[str] = Field(default_factory=list)
interests: List[str] = Field(default_factory=list)
beliefs_or_stances: List[str] = Field(default_factory=list)
anchors: List[str] = Field(default_factory=list)
events: List[str] = Field(default_factory=list)
def has_any(self) -> bool:
"""是否提取到了任何元数据。"""
return any([
self.core_facts, self.traits, self.relations, self.goals,
self.interests, self.beliefs_or_stances, self.anchors, self.events,
])

View File

@@ -1,140 +1,616 @@
===Task===
Extract user metadata changes from the following conversation statements spoken by the user.
{% if language == "zh" %}
**"三度原则"判断标准:**
- 复用度:该信息是否会被多个功能模块使用?
- 约束度:该信息是否会影响系统行为?
- 时效性:该信息是长期稳定的还是临时的?仅提取长期稳定信息。
你是一个用户画像 metadata 增量提取助手。你的任务是根据输入的用户 `description` 列表,提取值得长期保留、适合挂在“用户节点”下的新增 metadata。
**提取规则:**
- **只提取关于"用户本人"的画像信息**,忽略用户提到的第三方人物(如朋友、同事、家人)的信息
- 仅提取文本中明确提到的信息,不要推测
- **输出语言必须与输入文本的语言一致**(输入中文则输出中文值,输入英文则输出英文值)
你会同时收到:
**增量模式(重要):**
你只需要输出**本次对话引起的变更操作**,不要输出完整的元数据。每个变更是一个对象,包含:
- `field_path`:字段路径,用点号分隔(如 `profile.role`、`profile.expertise`)
- `action`:操作类型
* `set`:新增或修改一个字段的值
* `remove`:移除一个字段的值
- `value`:字段的新值(`action="set"` 时必填,`action="remove"` 时填要移除的元素值)
* 所有字段均为列表类型,每个元素一条变更记录
- `description`: 一组待分析的描述字符串
- `existing_metadata`: 用户当前已经存在的 metadata
**判断规则:**
- 用户提到新信息 → `action="set"`,填入新值
- 用户明确否定已有信息(如"我不再做老师了"、"我已经不学Python了")→ `action="remove"`,`value` 填要移除的元素值
- 如果本次对话没有任何可提取的变更,返回空的 `metadata_changes` 数组 `[]`
- **不要为未被提及的字段生成任何变更操作**
你的目标不是重建完整 metadata,而是只输出“新增内容”。
{% if existing_metadata %}
**已有元数据(仅供参考,用于判断是否需要变更):**
请对比已有数据和用户最新发言,只输出差异部分的变更操作。
- 如果用户说的信息和已有数据一致,不需要输出变更
- 如果用户否定了已有数据中的某个值,输出 `remove` 操作
- 如果用户提到了新信息,输出 `set` 操作
{% endif %}
- 只能输出从 `description` 中能够支持的新增 metadata
- 不要重复输出已经出现在 `existing_metadata` 里的内容
- 不允许修改、重写、删除或纠正已有 metadata
- 所有字段一律输出为字符串数组
{% else %}
You are an assistant for incremental user metadata extraction. Your task is to extract durable, user-node-level new metadata from the input `description` list.
**字段说明:**
- profile.role:用户的职业或角色(列表),如 教师、医生、后端工程师,一个人可以有多个角色
- profile.domain:用户所在领域(列表),如 教育、医疗、软件开发,一个人可以涉及多个领域
- profile.expertise:用户擅长的技能或工具(列表),如 Python、心理咨询、高中物理
- profile.interests:用户主动表达兴趣的话题或领域标签(列表)
You will receive:
**用户别名变更(增量模式):**
- **aliases_to_add**:本次新发现的用户别名,包括:
* 用户主动自我介绍:如"我叫张三"、"我的名字是XX"、"我的网名是XX"
* 他人对用户的称呼:如"同事叫我陈哥"、"大家叫我小张"、"领导叫我老陈"
* 只提取原文中逐字出现的名字,严禁推测或创造
* 禁止提取:用户给 AI 取的名字、第三方人物自身的名字、"用户"/"我" 等占位词
* 如果没有新别名,返回空数组 `[]`
- **aliases_to_remove**:用户明确否认的别名,包括:
* 用户说"我不叫XX了"、"别叫我XX"、"我改名了不叫XX" → 将 XX 放入此数组
* **严格限制**:只将用户原文中**逐字提到**的被否认名字放入,不要推断关联的其他别名
* 如果没有要移除的别名,返回空数组 `[]`
{% if existing_aliases %}
- 已有别名:{{ existing_aliases | tojson }}(仅供参考,不需要在输出中重复)
{% endif %}
{% else %}
**"Three-Degree Principle" criteria:**
- Reusability: Will this information be used by multiple functional modules?
- Constraint: Will this information affect system behavior?
- Timeliness: Is this information long-term stable or temporary? Only extract long-term stable information.
- `description`: a list of descriptions to analyze
- `existing_metadata`: the user's existing metadata
**Extraction rules:**
- **Only extract profile information about the user themselves**, ignore information about third parties (friends, colleagues, family) mentioned by the user
- Only extract information explicitly mentioned in the text, do not speculate
- **Output language must match the input text language**
Your goal is not to rebuild the full metadata. You must output only new metadata:
**Incremental mode (important):**
You should only output **the change operations caused by this conversation**, not the complete metadata. Each change is an object containing:
- `field_path`: Field path separated by dots (e.g. `profile.role`, `profile.expertise`)
- `action`: Operation type
* `set`: Add or update a field value
* `remove`: Remove a field value
- `value`: The new value for the field (required when `action="set"`, for `action="remove"` fill in the element value to remove)
* All fields are list types, one change record per element
- Output only metadata supported by `description`
- Do not repeat anything already present in `existing_metadata`
- Do not modify, rewrite, delete, or correct existing metadata
- Every field must be an array of strings
{% endif %}
**Decision rules:**
- User mentions new information → `action="set"`, fill in the new value
- User explicitly negates existing info (e.g. "I'm no longer a teacher", "I stopped learning Python") → `action="remove"`, `value` is the element to remove
- If this conversation has no extractable changes, return an empty `metadata_changes` array `[]`
- **Do NOT generate any change operations for fields not mentioned in the conversation**
===Inputs===
{% if language == "zh" %}
输入 JSON 包含以下字段:
{% if existing_metadata %}
**Existing metadata (for reference only, to determine if changes are needed):**
Compare existing data with the user's latest statements, and only output change operations for the differences.
- If the user's statement matches existing data, no change is needed
- If the user negates a value in existing data, output a `remove` operation
- If the user mentions new information, output a `set` operation
{% endif %}
- `description`: 字符串数组,表示关于用户的一组描述
- `existing_metadata`: 现有 metadata 对象,字段固定为:
- `aliases`
- `core_facts`
- `traits`
- `relations`
- `goals`
- `interests`
- `beliefs_or_stances`
- `anchors`
- `events`
{% else %}
The input JSON contains:
- `description`: an array of strings describing the user
- `existing_metadata`: an existing metadata object with these fixed fields:
- `aliases`
- `core_facts`
- `traits`
- `relations`
- `goals`
- `interests`
- `beliefs_or_stances`
- `anchors`
- `events`
{% endif %}
**Field descriptions:**
- profile.role: User's occupation or role (list), e.g. teacher, doctor, software engineer. A person can have multiple roles
- profile.domain: User's domain (list), e.g. education, healthcare, software development. A person can span multiple domains
- profile.expertise: User's skills or tools (list), e.g. Python, counseling, physics
- profile.interests: Topics or domain tags the user actively expressed interest in (list)
Input JSON:
**User alias changes (incremental mode):**
- **aliases_to_add**: Newly discovered user aliases from this conversation, including:
* User self-introductions: e.g. "I'm John", "My name is XX", "My username is XX"
* How others address the user: e.g. "My colleagues call me Johnny", "People call me Mike"
* Only extract names that appear VERBATIM in the text — never infer or fabricate
* Do NOT extract: names the user gives to the AI, third-party people's own names, placeholder words like "User"/"I"
* If no new aliases, return empty array `[]`
- **aliases_to_remove**: Aliases the user explicitly denies, including:
* User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array
* **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases
* If no aliases to remove, return empty array `[]`
{% if existing_aliases %}
- Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output)
{% endif %}
{% endif %}
===User Statements===
{% for stmt in statements %}
- {{ stmt }}
{% endfor %}
{% if existing_metadata %}
===Existing User Metadata===
```json
{{ existing_metadata | tojson }}
{{ input_json | default("{}") }}
```
===Field Definitions===
{% if language == "zh" %}
- `aliases`
- 用户的别名、昵称、称呼、英文名、稳定使用的另一个名字
- `core_facts`
- 用户相对稳定的基础事实,如身份、年龄、国籍、所在地、关系状态、家庭状态、长期背景
- `traits`
- 用户相对稳定的人格特质、风格、气质、行为倾向
- `relations`
- 用户与他人/群体/宠物/重要对象之间值得长期记忆的关系
- 保持字符串格式,可包含多个片段,常见格式如 `对象 | 关系/身份 | 补充信息`
- `goals`
- 用户明确、稳定、值得长期保留的人生目标、长期计划、持续追求
- `interests`
- 用户稳定的兴趣、偏好、长期爱好
- `beliefs_or_stances`
- 用户稳定的信念、价值立场、政治/宗教/社会议题立场
- `anchors`
- 对用户有长期意义的物品、收藏、纪念物、象征物
- 保持字符串格式,可包含多个片段,常见格式如 `对象 | 来源/关联 | 意义`
- `events`
- 对用户画像有长期价值的个人经历、事件、里程碑
- 保持字符串格式,可包含多个片段,常见格式如 `事件 | 时间 | 补充说明`
{% else %}
- `aliases`
- aliases, nicknames, stable alternative names, English names, or regular forms of address
- `core_facts`
- stable basic facts such as identity, age, nationality, residence, relationship status, family status, or long-term background
- `traits`
- stable personality traits, style, temperament, or behavioral tendencies
- `relations`
- durable relationships between the user and people/groups/pets/important entities
- keep string format; common pattern: `entity | relation/identity | extra info`
- `goals`
- explicit, stable, long-term goals or ongoing pursuits worth remembering
- `interests`
- stable interests, preferences, or hobbies
- `beliefs_or_stances`
- stable beliefs, values, political/religious/social stances
- `anchors`
- personally meaningful objects, collections, keepsakes, or symbols
- keep string format; common pattern: `object | source/association | meaning`
- `events`
- durable personal experiences, milestones, or events worth preserving
- keep string format; common pattern: `event | time | extra note`
{% endif %}
===Core Principles===
{% if language == "zh" %}
1. 只提取新增内容
- 如果某条信息已经在 `existing_metadata` 中出现,不能再次输出
- 即使 `description` 只是换了一种说法表达已有信息,也不要重复输出
- 如果只是对已有信息做轻微改写、近义改写、语序调整,也视为重复
2. 不修改已有内容
- 不要纠正已有 metadata 的措辞
- 不要补全已有 metadata 的结构
- 不要把已有 metadata 中的短字符串改写成更长版本后再输出
- 不要因为 `description` 出现了更精确表达,就把已有内容“升级后重新输出”
3. 只保留对用户画像有长期价值的信息
- 优先提取稳定身份、长期偏好、重要关系、重大目标、长期立场、重要锚点、关键事件
- 不要提取纯闲聊、瞬时感受、一次性很弱的细节
- 短暂情绪通常不单独提取,除非它是某个重要事件说明的一部分
4. 所有字段都必须是字符串数组
- 不允许输出对象数组
- 不允许输出嵌套结构
- 不允许把 `events` 拆成 event/time/note 对象
- 不允许把 `relations` 拆成 subject/relation/object 对象
5. 可以保留多段信息在一个字符串里
- `relations`、`anchors`、`events` 可以使用 `|` 连接多个片段
- 只有在确实有助于保留结构时才这样做
- 不必强行补满固定片段数,宁可简洁准确
6. 证据边界
- 只能依据 `description` 提取新增 metadata
- `existing_metadata` 只用于去重和分类参考,不是新增内容来源
- 不要从常识、推测或世界知识补充额外信息
{% else %}
1. Extract only new content
- If something already appears in `existing_metadata`, do not output it again
- If a description merely paraphrases existing metadata, do not output it
- Minor wording changes, synonym swaps, or reordered phrasing still count as duplicates
2. Do not modify existing content
- Do not correct wording in existing metadata
- Do not expand existing metadata and re-output it
- Do not upgrade an existing item into a more detailed version and emit it as new
3. Keep only durable user-profile information
- Prioritize stable identity, long-term preferences, important relationships, major goals, durable stances, meaningful anchors, and key events
- Exclude casual chatter, fleeting states, and weak one-off details
- Temporary emotions should usually not be extracted unless they are part of an important event description
4. Every field must be an array of strings
- No object arrays
- No nested structure
- Do not split `events` into event/time/note objects
- Do not split `relations` into structured triples
5. Multi-part strings are allowed
- `relations`, `anchors`, and `events` may use `|` to join parts
- Do this only when it helps preserve useful structure
- Do not force a fixed number of parts
6. Evidence boundary
- Extract new metadata only from `description`
- Use `existing_metadata` only for deduplication and category reference
- Do not add unsupported information from world knowledge or inference beyond the text
{% endif %}
===Deduplication Rules===
{% if language == "zh" %}
- 先理解 `description` 想表达的含义,再与 `existing_metadata` 做语义去重
- 若以下任一情况成立,则视为“已存在”,不要输出:
- 完全相同
- 近义表达
- 更长或更短但语义相同
- 只是把已有多段字符串拆开或重新组合
- 只是把已有事件/关系中的时间或备注略作改写
- 去重标准以“是否新增了值得保留的新事实”为准,而不是字面是否完全一致
去重示例:
- 已有 `single`,新描述说 `not in a relationship`,不要输出
- 已有 `from Sweden`,新描述说 `originally from Sweden`,不要输出
- 已有 `art`,新描述说 `likes art a lot`,通常不要输出
- 已有 `Oscar | pet guinea pig`,新描述说 `her guinea pig Oscar`,不要输出
{% else %}
- First understand the meaning of the description, then deduplicate semantically against `existing_metadata`
- Treat an item as already existing if any of these holds:
- exact match
- close paraphrase
- longer or shorter wording with the same meaning
- just a split or recombination of an existing multi-part string
- a lightly reworded time/note variant of an existing event or relation
- The test is whether the item adds a genuinely new durable fact, not whether the wording is different
{% endif %}
===Extraction Guidance By Field===
{% if language == "zh" %}
`aliases`
- 只收稳定名字,不收临时调侃
- 职业、身份、评价词不算 alias
`core_facts`
- 放稳定基础事实
- 不要放短暂状态、一次性动作、弱情绪
`traits`
- 只收相对稳定的人格或行为风格
- 不要因为一次行为就推断 trait
`relations`
- 只保留长期关系、有记忆价值的关系
- 可以写成 `对象 | 关系/身份 | 补充信息`
- 不要收纯一次性互动
`goals`
- 只收长期目标
- 不要把一时愿望、泛化口号、普通期待当作 goal
`interests`
- 只收稳定兴趣
- 短期尝试一次某事,通常不算 interest
`beliefs_or_stances`
- 收稳定信念、价值观、政治/宗教/社会议题立场
- 不要收普通瞬时意见
`anchors`
- 收具有象征意义、纪念意义、长期陪伴意义的对象
- 可写来源与意义
`events`
- 只收对用户画像有长期价值的事件或里程碑
- 优先保留时间信息和事件意义
- 普通日常小事通常不收,除非它明显揭示重要关系、目标推进或身份背景
{% else %}
`aliases`
- only stable names, not playful one-off labels
- occupations, identities, and evaluations are not aliases
`core_facts`
- keep stable background facts
- exclude temporary states, one-off actions, and weak emotions
`traits`
- only relatively stable traits or behavioral style
- do not infer a trait from one isolated action
`relations`
- keep durable, memory-worthy relationships
- may use `entity | relation/identity | extra info`
- exclude one-off interactions
`goals`
- only long-term goals
- do not treat temporary wishes or generic aspirations as goals
`interests`
- only stable interests
- a one-time attempt usually does not qualify
`beliefs_or_stances`
- keep stable beliefs, values, or social/political/religious stances
- exclude ordinary fleeting opinions
`anchors`
- keep symbolic, commemorative, or personally meaningful objects
- source and meaning may be included
`events`
- keep only events or milestones with durable profile value
- preserve time and significance when useful
- exclude ordinary daily trivia unless it clearly advances an important goal, relationship, or identity arc
{% endif %}
===Output Hard Constraints===
{% if language == "zh" %}
- 只输出新增 metadata,不要输出完整 metadata
- 结果必须包含全部 9 个字段
- 每个字段都必须是数组
- 即使某字段没有新增内容,也必须输出空数组
- 每个数组元素必须是字符串
- 不要输出 `null`
- 不要输出解释文字
- 不要输出 markdown code fence
- 不要输出字段之外的任何额外键
- 如果没有任何新增 metadata也必须返回所有字段都为空数组的 JSON
{% else %}
- Output only new metadata, not the full metadata
- The result must include all 9 fields
- Every field must be an array
- Use empty arrays when there is no new content
- Every array element must be a string
- Do not output `null`
- Do not output explanation text
- Do not wrap the result in markdown code fences
- Do not output any keys beyond the required fields
- If there is no new metadata, still return the full JSON shape with empty arrays
{% endif %}
===Examples===
{% if language == "zh" %}
示例 1
Input:
- description:
- "She recently started volunteering for a trans youth hotline."
- existing_metadata:
- goals: ["pursue counseling / mental health work for transgender people"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": [
"started volunteering for a trans youth hotline"
]
}
示例 2
Input:
- description:
- "She is originally from Sweden."
- "She is not dating anyone right now."
- existing_metadata:
- core_facts: ["from Sweden", "single"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": []
}
示例 3
Input:
- description:
- "Her sister Mia encouraged her to apply."
- existing_metadata:
- relations: ["grandma | grandmother | from Sweden"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [
"Mia | sister"
],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": []
}
示例 4
Input:
- description:
- "She keeps a journal from her first year after moving."
- existing_metadata:
- anchors: []
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [
"journal | from first year after moving"
],
"events": []
}
示例 5
Input:
- description:
- "Last month she attended a workshop on trauma-informed care and felt it clarified her future direction."
- existing_metadata:
- goals: ["pursue counseling / mental health work for transgender people"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": [
"attended workshop on trauma-informed care | last month | clarified future direction"
]
}
{% else %}
Example 1
Input:
- description:
- "She recently started volunteering for a trans youth hotline."
- existing_metadata:
- goals: ["pursue counseling / mental health work for transgender people"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": [
"started volunteering for a trans youth hotline"
]
}
Example 2
Input:
- description:
- "She is originally from Sweden."
- "She is not dating anyone right now."
- existing_metadata:
- core_facts: ["from Sweden", "single"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": []
}
Example 3
Input:
- description:
- "Her sister Mia encouraged her to apply."
- existing_metadata:
- relations: ["grandma | grandmother | from Sweden"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [
"Mia | sister"
],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": []
}
Example 4
Input:
- description:
- "She keeps a journal from her first year after moving."
- existing_metadata:
- anchors: []
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [
"journal | from first year after moving"
],
"events": []
}
Example 5
Input:
- description:
- "Last month she attended a workshop on trauma-informed care and felt it clarified her future direction."
- existing_metadata:
- goals: ["pursue counseling / mental health work for transgender people"]
Output:
{
"aliases": [],
"core_facts": [],
"traits": [],
"relations": [],
"goals": [],
"interests": [],
"beliefs_or_stances": [],
"anchors": [],
"events": [
"attended workshop on trauma-informed care | last month | clarified future direction"
]
}
{% endif %}
===Output Format===
Return a JSON object with the following structure:
{% if language == "zh" %}
输出必须是严格可解析的 JSON 对象,结构固定如下:
{% else %}
Return a strict JSON object with this exact structure:
{% endif %}
```json
{
"metadata_changes": [
{"field_path": "profile.role", "action": "set", "value": "后端工程师"},
{"field_path": "profile.expertise", "action": "set", "value": "Python"},
{"field_path": "profile.expertise", "action": "remove", "value": "Java"}
],
"aliases_to_add": [],
"aliases_to_remove": []
"aliases": ["string"],
"core_facts": ["string"],
"traits": ["string"],
"relations": ["string"],
"goals": ["string"],
"interests": ["string"],
"beliefs_or_stances": ["string"],
"anchors": ["string"],
"events": ["string"]
}
```
{{ json_schema }}
{% if language == "zh" %}
JSON 要求:
- 使用标准 ASCII 双引号 `"`
- 不要使用中文引号
- 不要在 JSON 外输出任何文字
- 字符串内如果包含双引号,必须转义为 `\"`
- 不要遗漏字段
- 不要输出尾逗号
{% else %}
JSON requirements:
- Use standard ASCII double quotes `"`
- No smart quotes
- Output JSON only
- Escape internal quotes as `\"`
- Do not omit any field
- Do not emit trailing commas
{% endif %}
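
A sketch of rendering the rewritten template the same way MetadataExtractionStep.render_prompt does:

```python
import json
from app.core.memory.utils.prompt.prompt_utils import prompt_env

input_json = json.dumps(
    {
        "description": ["She keeps a journal from her first year after moving."],
        "existing_metadata": {"anchors": []},
    },
    ensure_ascii=False,
    indent=2,
)
template = prompt_env.get_template("extract_user_metadata.jinja2")
prompt = template.render(language="en", input_json=input_json)
```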

View File

@@ -139,6 +139,65 @@ SET e.name = CASE WHEN entity.name IS NOT NULL AND entity.name <> '' THEN entity
RETURN e.id AS uuid
"""
# ── 元数据增量回写:将 LLM 提取的元数据追加到用户实体节点 ──
ENTITY_METADATA_UPDATE = """
MATCH (e:ExtractedEntity {id: $entity_id})
SET e.core_facts = CASE
WHEN $core_facts IS NOT NULL AND size($core_facts) > 0
THEN reduce(acc = coalesce(e.core_facts, []), item IN $core_facts |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.core_facts, []) END,
e.traits = CASE
WHEN $traits IS NOT NULL AND size($traits) > 0
THEN reduce(acc = coalesce(e.traits, []), item IN $traits |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.traits, []) END,
e.relations = CASE
WHEN $relations IS NOT NULL AND size($relations) > 0
THEN reduce(acc = coalesce(e.relations, []), item IN $relations |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.relations, []) END,
e.goals = CASE
WHEN $goals IS NOT NULL AND size($goals) > 0
THEN reduce(acc = coalesce(e.goals, []), item IN $goals |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.goals, []) END,
e.interests = CASE
WHEN $interests IS NOT NULL AND size($interests) > 0
THEN reduce(acc = coalesce(e.interests, []), item IN $interests |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.interests, []) END,
e.beliefs_or_stances = CASE
WHEN $beliefs_or_stances IS NOT NULL AND size($beliefs_or_stances) > 0
THEN reduce(acc = coalesce(e.beliefs_or_stances, []), item IN $beliefs_or_stances |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.beliefs_or_stances, []) END,
e.anchors = CASE
WHEN $anchors IS NOT NULL AND size($anchors) > 0
THEN reduce(acc = coalesce(e.anchors, []), item IN $anchors |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.anchors, []) END,
e.events = CASE
WHEN $events IS NOT NULL AND size($events) > 0
THEN reduce(acc = coalesce(e.events, []), item IN $events |
CASE WHEN item IN acc THEN acc ELSE acc + item END)
ELSE coalesce(e.events, []) END
RETURN e.id AS uuid
"""
# ── Query a user entity's existing metadata (for dedup during incremental extraction) ──
ENTITY_METADATA_QUERY = """
MATCH (e:ExtractedEntity {id: $entity_id})
RETURN e.core_facts AS core_facts,
e.traits AS traits,
e.relations AS relations,
e.goals AS goals,
e.interests AS interests,
e.beliefs_or_stances AS beliefs_or_stances,
e.anchors AS anchors,
e.events AS events
"""
# Add back ENTITY_RELATIONSHIP_SAVE to be used by graph_saver.save_entities_and_relationships
ENTITY_RELATIONSHIP_SAVE = """
UNWIND $relationships AS rel
@@ -1136,6 +1195,56 @@ SET target.aliases = new_aliases,
RETURN source.name AS merged_alias, target.name AS target_name, new_aliases AS updated_aliases
"""
# Edge redirection: redirect all other edges attached to an alias node (the source of a
# '别名属于' [alias-of] relationship) so they point at the user node (the target) instead.
# Two kinds of edges are handled:
# 1. EXTRACTED_RELATIONSHIP: other entity → alias node, or alias node → other entity
# 2. STATEMENT_ENTITY: statement → alias node
# For every edge to redirect, create a new edge on the user node (copying all properties), then delete the old edge.
REDIRECT_ALIAS_EDGES = """
// Find every alias→user mapping
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar:EXTRACTED_RELATIONSHIP]->(user:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar.predicate = '别名属于'
WITH collect({alias_id: elementId(alias), user_id: elementId(user), alias_eid: alias.id, user_eid: user.id}) AS mappings
// 1. Redirect EXTRACTED_RELATIONSHIP edges where the alias node is the target
UNWIND mappings AS m
MATCH (other)-[r:EXTRACTED_RELATIONSHIP]->(alias:ExtractedEntity {end_user_id: $end_user_id})
WHERE alias.id = m.alias_eid
AND r.predicate <> '别名属于'
AND other.id <> m.user_eid
WITH m, other, r, alias
MATCH (user:ExtractedEntity {id: m.user_eid, end_user_id: $end_user_id})
CREATE (other)-[nr:EXTRACTED_RELATIONSHIP]->(user)
SET nr = properties(r)
DELETE r
WITH count(*) AS redirected_incoming
// 2. Redirect EXTRACTED_RELATIONSHIP edges where the alias node is the source
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar2:EXTRACTED_RELATIONSHIP]->(user2:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar2.predicate = '别名属于'
WITH alias, user2, redirected_incoming
MATCH (alias)-[r:EXTRACTED_RELATIONSHIP]->(other)
WHERE r.predicate <> '别名属于'
AND other.id <> user2.id
WITH user2, other, r, redirected_incoming
CREATE (user2)-[nr:EXTRACTED_RELATIONSHIP]->(other)
SET nr = properties(r)
DELETE r
WITH redirected_incoming, count(*) AS redirected_outgoing
// 3. Redirect STATEMENT_ENTITY edges: statement → alias node
MATCH (alias:ExtractedEntity {end_user_id: $end_user_id})-[ar3:EXTRACTED_RELATIONSHIP]->(user3:ExtractedEntity {end_user_id: $end_user_id})
WHERE ar3.predicate = '别名属于'
WITH alias, user3, redirected_incoming, redirected_outgoing
MATCH (stmt)-[r:STATEMENT_ENTITY]->(alias)
WITH user3, stmt, r, redirected_incoming, redirected_outgoing
CREATE (stmt)-[nr:STATEMENT_ENTITY]->(user3)
SET nr = properties(r)
DELETE r
RETURN redirected_incoming, redirected_outgoing, count(*) AS redirected_stmt
"""
CHECK_COMMUNITY_IS_COMPLETE_WITH_EMBEDDING = """
MATCH (c:Community {community_id: $community_id, end_user_id: $end_user_id})
RETURN (

View File

@@ -12,7 +12,6 @@ from typing import Awaitable, Callable, Optional
from app.core.config import settings
from app.core.logging_config import get_memory_logger, log_time
from app.core.memory.pipelines.pilot_write_pipeline import PilotWritePipeline
from app.core.memory.models.message_models import (
ConversationContext,
ConversationMessage,
@@ -306,14 +305,11 @@ async def run_pilot_extraction(
logger.warning(f"Failed to load ontology types: {e}", exc_info=True)
if use_refactored:
pilot_pipeline = PilotWritePipeline(
llm_client=llm_client,
embedder_client=embedder_client,
pipeline_config=get_pipeline_config(memory_config),
progress_callback=progress_callback,
embedding_id=str(memory_config.embedding_model_id),
language=language,
ontology_types=ontology_types,
from app.core.memory.memory_service import MemoryService
memory_service = MemoryService(
memory_config=memory_config,
end_user_id=str(memory_config.workspace_id),
)
log_time("Pilot Pipeline Initialization", time.time() - step_start, log_file)
@@ -325,7 +321,11 @@ async def run_pilot_extraction(
if progress_callback:
await progress_callback("knowledge_extraction", "正在知识抽取...")
pilot_result = await pilot_pipeline.run(chunked_dialogs)
pilot_result = await memory_service.pilot_write(
chunked_dialogs=chunked_dialogs,
language=language,
progress_callback=progress_callback,
)
dialog_data_list = pilot_result.dialog_data_list
graph = pilot_result.graph
chunk_nodes = graph.chunk_nodes

View File

@@ -1564,9 +1564,201 @@ def extract_emotion_batch_task(
_shutdown_loop_gracefully(loop)
@celery_app.task(
bind=True,
name="app.tasks.extract_metadata_batch",
max_retries=2,
default_retry_delay=30,
)
def extract_metadata_batch_task(
self,
user_entities: List[Dict[str, Any]],
llm_model_id: str,
language: str = "zh",
snapshot_dir: Optional[str] = None,
) -> Dict[str, Any]:
"""Celery task: 用户实体元数据提取 + Neo4j 回写。
在主写入流水线完成后异步执行。从用户实体的 description 中提取
结构化元数据core_facts、traits、relations 等),增量回写到 Neo4j。
Args:
user_entities: 用户实体列表,每项包含:
- entity_id: 实体 ID
- entity_name: 实体名称
- descriptions: description 文本列表
llm_model_id: LLM 模型 UUID 字符串
language: 语言 ("zh" / "en")
snapshot_dir: 可选的快照目录路径(调试模式下使用)
"""
task_id = self.request.id
total = len(user_entities)
logger.info(
f"[Metadata] 开始用户元数据提取: "
f"entities={total}, llm_model_id={llm_model_id}, "
f"language={language}, task_id={task_id}"
)
start_time = time.time()
if not user_entities:
return {"status": "SUCCESS", "total": 0, "extracted": 0, "failed": 0, "task_id": task_id}
async def _run() -> Dict[str, Any]:
from app.core.memory.models.variate_config import ExtractionPipelineConfig
from app.core.memory.storage_services.extraction_engine.steps.base import StepContext
from app.core.memory.storage_services.extraction_engine.steps.metadata_step import MetadataExtractionStep
from app.core.memory.storage_services.extraction_engine.steps.schema import (
MetadataStepInput,
)
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.db import get_db_context
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.repositories.neo4j.cypher_queries import ENTITY_METADATA_UPDATE, ENTITY_METADATA_QUERY
# Build LLM client
with get_db_context() as db:
factory = MemoryClientFactory(db)
llm_client = factory.get_llm_client(llm_model_id)
pipeline_config = ExtractionPipelineConfig()
context = StepContext(
llm_client=llm_client,
language=language,
config=pipeline_config,
)
step = MetadataExtractionStep(context)
extracted = 0
failed = 0
snapshot_outputs: Dict[str, Any] = {} if snapshot_dir else None # type: ignore[assignment]
connector = Neo4jConnector()
try:
for entity_dict in user_entities:
entity_id = entity_dict["entity_id"]
entity_name = entity_dict.get("entity_name", "")
descriptions = entity_dict.get("descriptions", [])
if not descriptions:
logger.debug(f"[Metadata] 跳过无 description 的实体: {entity_id}")
continue
try:
# Fetch existing metadata for incremental dedup
existing_metadata = {}
try:
records = await connector.execute_query(
ENTITY_METADATA_QUERY, entity_id=entity_id
)
if records:
rec = records[0]
for field in (
"core_facts", "traits", "relations", "goals",
"interests", "beliefs_or_stances", "anchors", "events",
):
val = rec.get(field)
existing_metadata[field] = val if val else []
except Exception as e:
logger.warning(f"[Metadata] 查询已有元数据失败: {e}")
inp = MetadataStepInput(
entity_id=entity_id,
entity_name=entity_name,
descriptions=descriptions,
existing_metadata=existing_metadata,
)
result = await step.run(inp)
if result.has_any():
# Write back to Neo4j
await connector.execute_query(
ENTITY_METADATA_UPDATE,
entity_id=entity_id,
core_facts=result.core_facts,
traits=result.traits,
relations=result.relations,
goals=result.goals,
interests=result.interests,
beliefs_or_stances=result.beliefs_or_stances,
anchors=result.anchors,
events=result.events,
)
extracted += 1
logger.info(
f"[Metadata] 实体 {entity_name}({entity_id}) 元数据提取并回写成功"
)
else:
logger.debug(
f"[Metadata] 实体 {entity_name}({entity_id}) 无新增元数据"
)
if snapshot_outputs is not None:
snapshot_outputs[entity_id] = {
"entity_name": entity_name,
"descriptions": descriptions,
"extracted_metadata": result.model_dump(),
}
except Exception as e:
failed += 1
if snapshot_outputs is not None:
snapshot_outputs[entity_id] = {"error": str(e)}
logger.warning(
f"[Metadata] 实体 {entity_id} 元数据提取失败: {e}"
)
finally:
await connector.close()
# Persist snapshots to disk
if snapshot_outputs is not None and snapshot_dir:
try:
from pathlib import Path as _Path
import json as _json
_dir = _Path(snapshot_dir)
_dir.mkdir(parents=True, exist_ok=True)
_path = _dir / "8_metadata_outputs.json"
with open(_path, "w", encoding="utf-8") as _f:
_json.dump(snapshot_outputs, _f, ensure_ascii=False, indent=2, default=str)
logger.info(
f"[Metadata][Snapshot] 已落盘 {len(snapshot_outputs)} 条元数据结果 → {_path}"
)
except Exception as _e:
logger.warning(
f"[Metadata][Snapshot] 快照落盘失败(不影响主流程): {_e}"
)
return {"extracted": extracted, "failed": failed}
loop = None
try:
loop = set_asyncio_event_loop()
result = loop.run_until_complete(_run())
elapsed = time.time() - start_time
logger.info(
f"[Metadata] 任务完成: 提取={result['extracted']}, "
f"失败={result['failed']}, 耗时={elapsed:.2f}s, task_id={task_id}"
)
return {
"status": "SUCCESS",
"total": total,
**result,
"elapsed_time": elapsed,
"task_id": task_id,
}
except Exception as e:
elapsed = time.time() - start_time
logger.error(
f"[Metadata] 任务失败: {e}, 耗时={elapsed:.2f}s",
exc_info=True,
)
raise self.retry(exc=e)
finally:
if loop:
_shutdown_loop_gracefully(loop)
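As a usage sketch, the fire-and-forget dispatch from the write pipeline (Step 3.6 per the commit message) would look roughly like this; the entity and config names here are illustrative, and only the task signature is taken from the code above:
```python
# Hypothetical dispatch from WritePipeline._extract_metadata; argument shapes
# follow extract_metadata_batch_task's signature.
extract_metadata_batch_task.delay(
    user_entities=[
        {
            "entity_id": entity.id,      # deduped user-entity ID
            "entity_name": entity.name,
            "descriptions": [entity.description] if entity.description else [],
        }
        for entity in deduped_user_entities  # assumed: list of user entity nodes
    ],
    llm_model_id=str(memory_config.llm_id),  # config field as used by the removed task below
    language="zh",
    snapshot_dir=None,  # set to a directory in debug mode to persist 8_metadata_outputs.json
)
```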
# unused task
# @celery_app.task(name="app.core.memory.agent.health.check_read_service")
# def check_read_service_task() -> Dict[str, str]:
# """Call read_service and write latest status to Redis.
# Returns status data dict that gets written to Redis.
@@ -3222,299 +3414,4 @@ def init_community_clustering_for_users(self, end_user_ids: List[str], workspace
# ─── User Metadata Extraction Task ───────────────────────────────────────────
def _update_timestamps(existing: dict, new: dict, updated_at: dict, now: str, prefix: str = "") -> None:
"""对比新旧元数据,更新变更字段的 _updated_at 时间戳。"""
for key, new_val in new.items():
if key == "_updated_at":
continue
path = f"{prefix}.{key}" if prefix else key
old_val = existing.get(key)
if isinstance(new_val, dict) and isinstance(old_val, dict):
_update_timestamps(old_val, new_val, updated_at, now, prefix=path)
elif old_val != new_val:
updated_at[path] = now
@celery_app.task(
bind=True,
name='app.tasks.extract_user_metadata',
ignore_result=False,
max_retries=0,
acks_late=True,
time_limit=300,
soft_time_limit=240,
)
def extract_user_metadata_task(
self,
end_user_id: str,
statements: List[str],
config_id: Optional[str] = None,
language: str = "zh",
) -> Dict[str, Any]:
"""异步提取用户元数据并写入数据库。
在去重消歧完成后由编排器触发,使用独立 LLM 调用提取元数据。
LLM 配置优先使用 config_id 对应的应用配置,失败时回退到工作空间默认配置。
Args:
end_user_id: 终端用户 ID
statements: 用户相关的 statement 文本列表
config_id: 应用配置 ID可选
language: 语言类型 ("zh" 中文, "en" 英文)
Returns:
包含任务执行结果的字典
"""
start_time = time.time()
logger.info(
f"[CELERY METADATA] Starting metadata extraction - end_user_id={end_user_id}, "
f"statements_count={len(statements)}, config_id={config_id}, language={language}"
)
async def _run() -> Dict[str, Any]:
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import MetadataExtractor
from app.repositories.end_user_info_repository import EndUserInfoRepository
from app.repositories.end_user_repository import EndUserRepository
from app.services.memory_config_service import MemoryConfigService
# 1. 获取 LLM 配置(应用配置 → 工作空间配置兜底)并创建 LLM client
with get_db_context() as db:
end_user_uuid = uuid.UUID(end_user_id)
# 获取 workspace_id from end_user
end_user = EndUserRepository(db).get_by_id(end_user_uuid)
if not end_user:
return {"status": "FAILURE", "error": f"End user not found: {end_user_id}"}
workspace_id = end_user.workspace_id
config_service = MemoryConfigService(db)
memory_config = config_service.get_config_with_fallback(
memory_config_id=uuid.UUID(config_id) if config_id else None,
workspace_id=workspace_id,
)
if not memory_config:
return {"status": "FAILURE", "error": "No LLM config available (app + workspace fallback failed)"}
# 2. 创建 LLM client
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
factory = MemoryClientFactory(db)
if not memory_config.llm_id:
return {"status": "FAILURE", "error": "Memory config has no LLM model configured"}
llm_client = factory.get_llm_client(memory_config.llm_id)
# 2.5 读取已有元数据和别名,传给 extractor 作为上下文
existing_metadata = None
existing_aliases = None
try:
info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
if info:
if info.meta_data:
existing_metadata = info.meta_data
existing_aliases = info.aliases if info.aliases else []
logger.info(f"[CELERY METADATA] 已读取已有元数据和别名aliases={existing_aliases}")
except Exception as e:
logger.warning(f"[CELERY METADATA] 读取已有数据失败(继续无上下文提取): {e}")
# 3. 提取元数据和别名(传入已有数据作为上下文)
extractor = MetadataExtractor(llm_client=llm_client, language=language)
extract_result = await extractor.extract_metadata(
statements,
existing_metadata=existing_metadata,
existing_aliases=existing_aliases,
)
if not extract_result:
logger.info(f"[CELERY METADATA] No metadata extracted for end_user_id={end_user_id}")
return {"status": "SUCCESS", "result": "no_metadata_extracted"}
metadata_changes, aliases_to_add, aliases_to_remove = extract_result
logger.info(
f"[CELERY METADATA] LLM 元数据变更: {[c.model_dump() for c in metadata_changes]}, "
f"别名新增: {aliases_to_add}, 移除: {aliases_to_remove}"
)
from datetime import datetime as dt, timezone as tz
now = dt.now(tz.utc).isoformat()
# 过滤别名中的占位名称,执行增量增删
_PLACEHOLDER_NAMES = {"用户", "", "user", "i"}
def _filter_aliases(aliases_list):
seen = set()
result = []
for a in aliases_list:
a_stripped = a.strip()
if a_stripped and a_stripped.lower() not in _PLACEHOLDER_NAMES and a_stripped.lower() not in seen:
result.append(a_stripped)
seen.add(a_stripped.lower())
return result
filtered_add = _filter_aliases(aliases_to_add)
filtered_remove = _filter_aliases(aliases_to_remove)
remove_lower = {a.lower() for a in filtered_remove}
with get_db_context() as db:
end_user_uuid = uuid.UUID(end_user_id)
info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
end_user = EndUserRepository(db).get_by_id(end_user_uuid)
if info:
# 4. 元数据增量更新(按 LLM 输出的变更操作逐条执行,所有字段均为列表类型)
if metadata_changes:
# 深拷贝,确保 SQLAlchemy 能检测到变更
import copy
existing_meta = copy.deepcopy(info.meta_data) if info.meta_data else {}
updated_at = dict(existing_meta.get("_updated_at", {}))
for change in metadata_changes:
field_path = change.field_path
action = change.action
value = change.value
if not value or not value.strip():
continue
# 定位到目标字段的父级节点
parts = field_path.split(".")
target = existing_meta
for part in parts[:-1]:
target = target.setdefault(part, {})
leaf = parts[-1]
current_list = target.get(leaf, [])
if action == "set":
if value not in current_list:
# 新值插入列表头部,保证按时间从新到旧排序
current_list.insert(0, value)
target[leaf] = current_list
logger.info(f"[CELERY METADATA] set {field_path} = {value}")
elif action == "remove":
if value in current_list:
current_list.remove(value)
target[leaf] = current_list
logger.info(f"[CELERY METADATA] remove {value} from {field_path}")
updated_at[field_path] = now
existing_meta["_updated_at"] = updated_at
# 赋值深拷贝后的新对象SQLAlchemy 会检测到字段变更并写入
info.meta_data = existing_meta
logger.info(f"[CELERY METADATA] 增量更新元数据完成: {json.dumps(existing_meta, ensure_ascii=False)}")
# 别名增量增删:(已有 - remove) + add
old_aliases = info.aliases if info.aliases else []
# 先移除
merged = [a for a in old_aliases if a.strip().lower() not in remove_lower]
# 再追加(去重)
existing_lower = {a.strip().lower() for a in merged}
for a in filtered_add:
if a.lower() not in existing_lower:
merged.append(a)
existing_lower.add(a.lower())
if merged != old_aliases:
info.aliases = merged
# other_name 更新逻辑
if merged and (
not info.other_name
or info.other_name.strip().lower() in _PLACEHOLDER_NAMES
or info.other_name.strip().lower() in remove_lower
):
info.other_name = merged[0]
if end_user and merged and (
not end_user.other_name
or end_user.other_name.strip().lower() in _PLACEHOLDER_NAMES
or end_user.other_name.strip().lower() in remove_lower
):
end_user.other_name = merged[0]
logger.info(
f"[CELERY METADATA] 别名增量更新: {old_aliases} - {filtered_remove} + {filtered_add}{merged}"
)
else:
# 没有 end_user_info 记录,创建一条
from app.models.end_user_info_model import EndUserInfo
initial_aliases = filtered_add # 新记录只有 add没有 remove
first_alias = initial_aliases[0] if initial_aliases else ""
# 从变更操作构建初始元数据(所有字段均为列表类型)
initial_meta = {}
for change in metadata_changes:
if change.action == "set" and change.value is not None and change.value.strip():
parts = change.field_path.split(".")
target = initial_meta
for part in parts[:-1]:
target = target.setdefault(part, {})
leaf = parts[-1]
current_list = target.get(leaf, [])
if change.value not in current_list:
# 新值插入列表头部,保证按时间从新到旧排序
current_list.insert(0, change.value)
target[leaf] = current_list
if first_alias or initial_meta:
new_info = EndUserInfo(
end_user_id=end_user_uuid,
other_name=first_alias or "",
aliases=initial_aliases,
meta_data=initial_meta if initial_meta else None,
)
db.add(new_info)
if end_user and first_alias and (
not end_user.other_name or end_user.other_name.strip().lower() in _PLACEHOLDER_NAMES
):
end_user.other_name = first_alias
logger.info(f"[CELERY METADATA] 创建 end_user_info: other_name={first_alias}, aliases={initial_aliases}")
else:
return {"status": "SUCCESS", "result": "no_data_to_write"}
db.commit()
# 同步 PgSQL aliases 到 Neo4j 用户实体PgSQL 为权威源)
final_aliases = info.aliases if info else initial_aliases
if final_aliases:
try:
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
neo4j_connector = Neo4jConnector()
cypher = """
MATCH (e:ExtractedEntity)
WHERE e.end_user_id = $end_user_id AND e.name IN ['用户', '', 'User', 'I']
SET e.aliases = $aliases
"""
await neo4j_connector.execute_query(
cypher, end_user_id=end_user_id, aliases=final_aliases
)
await neo4j_connector.close()
logger.info(f"[CELERY METADATA] Neo4j 用户实体 aliases 已同步: {final_aliases}")
except Exception as neo4j_err:
logger.warning(f"[CELERY METADATA] Neo4j aliases 同步失败(不影响主流程): {neo4j_err}")
return {"status": "SUCCESS", "result": "metadata_and_aliases_written"}
loop = None
try:
loop = set_asyncio_event_loop()
result = loop.run_until_complete(_run())
elapsed = time.time() - start_time
result["elapsed_time"] = elapsed
result["task_id"] = self.request.id
logger.info(f"[CELERY METADATA] Task completed - elapsed={elapsed:.2f}s, result={result.get('result')}")
return result
except Exception as e:
elapsed = time.time() - start_time
logger.error(f"[CELERY METADATA] Task failed - elapsed={elapsed:.2f}s, error={e}", exc_info=True)
return {
"status": "FAILURE",
"error": str(e),
"elapsed_time": elapsed,
"task_id": self.request.id,
}
finally:
if loop:
_shutdown_loop_gracefully(loop)
# unused task