From b0ddd12cc60ca355a960ca3714cb5b9ee5360f96 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Fri, 24 Apr 2026 13:55:14 +0800 Subject: [PATCH] feat(memory): add emotion batch extraction task and improve extraction prompts - Add extract_emotion_batch_task for async emotion extraction - Refine Chinese entity types and relation types in extraction prompts - Add STATEMENT_EMOTION_UPDATE Cypher query for Neo4j backfill - Refactor statement_step and triplet_step implementations --- api/app/celery_app.py | 3 + api/app/core/memory/models/message_models.py | 2 + .../core/memory/pipelines/write_pipeline.py | 52 ++ .../statement_extraction.py | 2 + .../steps/extraction_pipeline_orchestrator.py | 139 +-- .../steps/schema/extraction_step_schema.py | 4 +- .../extraction_engine/steps/statement_step.py | 37 +- .../extraction_engine/steps/triplet_step.py | 29 +- .../core/memory/utils/prompt/prompt_utils.py | 16 + .../utils/prompt/prompts/entity_dedup.jinja2 | 89 +- .../prompt/prompts/extract_statement.jinja2 | 759 ++++++++++------ .../prompt/prompts/extract_triplet.jinja2 | 859 ++++++++++-------- api/app/repositories/neo4j/cypher_queries.py | 9 + api/app/tasks.py | 154 ++++ 14 files changed, 1321 insertions(+), 833 deletions(-) diff --git a/api/app/celery_app.py b/api/app/celery_app.py index 717709da..d380745b 100644 --- a/api/app/celery_app.py +++ b/api/app/celery_app.py @@ -114,6 +114,9 @@ celery_app.conf.update( # Metadata extraction → memory_tasks queue 'app.tasks.extract_user_metadata': {'queue': 'memory_tasks'}, + # Async emotion extraction → memory_tasks queue (IO-bound LLM calls) + 'app.tasks.extract_emotion_batch': {'queue': 'memory_tasks'}, + # Document tasks → document_tasks queue (prefork worker) 'app.core.rag.tasks.parse_document': {'queue': 'document_tasks'}, 'app.core.rag.tasks.sync_knowledge_for_kb': {'queue': 'document_tasks'}, diff --git a/api/app/core/memory/models/message_models.py b/api/app/core/memory/models/message_models.py index 
66203067..67d274c7 100644 --- a/api/app/core/memory/models/message_models.py +++ b/api/app/core/memory/models/message_models.py @@ -94,6 +94,8 @@ class Statement(BaseModel): emotion_keywords: Optional[List[str]] = Field(default_factory=list, description="Emotion keywords, max 3") emotion_subject: Optional[str] = Field(None, description="Emotion subject: self/other/object") emotion_target: Optional[str] = Field(None, description="Emotion target: person or object name") + # Reference resolution + has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references") class ConversationContext(BaseModel): diff --git a/api/app/core/memory/pipelines/write_pipeline.py b/api/app/core/memory/pipelines/write_pipeline.py index cc30df7d..180a70cf 100644 --- a/api/app/core/memory/pipelines/write_pipeline.py +++ b/api/app/core/memory/pipelines/write_pipeline.py @@ -219,6 +219,9 @@ class WritePipeline: f"✔ {time.time() - step_start:.2f}s" ) + # Step 3.5: 异步情绪提取(fire-and-forget,需在 _store 之后确保 Statement 节点已存在) + self._extract_emotion(getattr(self, "_emotion_statements", [])) + # Step 4: 聚类 - 增量更新社区(异步,不阻塞) step_start = time.time() await self._cluster(extraction_result) @@ -334,6 +337,10 @@ class WritePipeline: # step1: 执行知识提取 dialog_data_list = await new_orchestrator.run(chunked_dialogs) + # 收集需要异步情绪提取的 statements(由编排器在 Phase 4 后收集) + # 注意:实际 dispatch 在 _store 之后,确保 Statement 节点已写入 Neo4j + self._emotion_statements = new_orchestrator.emotion_statements + # ── Snapshot: 各阶段萃取结果 ── TODO 乐力齐 重构流水线切换生产环境稳定后修改 stage_outputs = new_orchestrator.last_stage_outputs if stage_outputs: @@ -578,6 +585,51 @@ class WritePipeline: ) # ────────────────────────────────────────────── + # Step 4.5: 异步情绪提取 + # fire-and-forget 提交 Celery 任务,不阻塞主流程 + # ────────────────────────────────────────────── + + def _extract_emotion(self, emotion_statements: list) -> None: + """提交异步情绪提取 Celery 任务。 + + 从编排器收集的 user statement 列表中提取情绪, + 异步回写到 Neo4j Statement 节点。失败不影响主流程。 + """ + if 
not emotion_statements: + return + + llm_model_id = ( + str(self.memory_config.llm_model_id) + if self.memory_config.llm_model_id + else None + ) + if not llm_model_id: + logger.warning("[Emotion] 无法提交情绪提取任务:llm_model_id 为空") + return + + try: + from app.celery_app import celery_app + + result = celery_app.send_task( + "app.tasks.extract_emotion_batch", + kwargs={ + "statements": emotion_statements, + "llm_model_id": llm_model_id, + "language": self.language, + }, + ) + logger.info( + f"[Emotion] 异步情绪提取任务已提交 - " + f"task_id={result.id}, " + f"statement_count={len(emotion_statements)}, " + f"source=async" + ) + except Exception as e: + logger.error( + f"[Emotion] 提交情绪提取任务失败(不影响主流程): {e}", + exc_info=True, + ) + # ────────────────────────────────────────────── # Step 5: 摘要 # (+ entity_description)+ meta_data部分在此提取 # ────────────────────────────────────────────── diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py index d90a49ba..76a48c58 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py @@ -22,6 +22,7 @@ class ExtractedStatement(BaseModel): statement_type: str = Field(..., description="FACT, OPINION, SUGGESTION or PREDICTION") temporal_type: str = Field(..., description="STATIC, DYNAMIC, ATEMPORAL") relevence: str = Field(..., description="RELEVANT or IRRELEVANT") + has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references") class StatementExtractionResponse(BaseModel): statements: List[ExtractedStatement] = Field(default_factory=list, description="List of extracted statements") @@ -159,6 +160,7 @@ class StatementExtractor: chunk_id=chunk.id, end_user_id=end_user_id, speaker=chunk_speaker, + 
has_unsolved_reference=getattr(extracted_stmt, "has_unsolved_reference", False), ) chunk_statements.append(chunk_statement) diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py index 4098312f..ea8c2812 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py @@ -225,6 +225,7 @@ class NewExtractionOrchestrator: speaker=stmt_out.speaker, valid_at=stmt_out.valid_at, invalid_at=stmt_out.invalid_at, + has_unsolved_reference=stmt_out.has_unsolved_reference, ) @staticmethod @@ -346,7 +347,7 @@ class NewExtractionOrchestrator: dialog_data_list, all_stmt_results ) - # Build sidecar inputs for after_statement sidecars (e.g. emotion) + # Build sidecar inputs for after_statement sidecars (emotion excluded — async Celery) sidecar_pairs = self._build_after_statement_sidecar_inputs( dialog_data_list, all_stmt_results ) @@ -403,21 +404,25 @@ class NewExtractionOrchestrator: # ── Phase 4: Data assignment ── logger.info("Phase 4/4: Data assignment") - emotion_results = sidecar_output_map.get("emotion_extraction", {}) self._assign_results( dialog_data_list, all_stmt_results, all_triplet_results, - emotion_results=emotion_results, + emotion_results={}, embedding_output=merged_emb, ) + # ── Fire-and-forget: collect statements for async emotion extraction ── + self._emotion_statements: List[Dict[str, str]] = [] + if self.config.emotion_enabled: + self._emotion_statements = self._collect_emotion_statements(all_stmt_results) + # Store raw step outputs for snapshot/debugging self._last_stage_outputs = { "statement_results": all_stmt_results, "triplet_results": all_triplet_results, - "emotion_results": emotion_results, + "emotion_results": {}, "embedding_output": merged_emb, } @@ 
-611,8 +616,7 @@ class NewExtractionOrchestrator: ) -> List[Tuple[ExtractionStep, Any]]: """Build (step, input) pairs for after_statement sidecars. - For emotion extraction, we create a batch wrapper that runs the sidecar - on every user statement concurrently and returns a dict of results. + Emotion extraction is excluded here — it runs asynchronously via Celery. """ if not self.after_statement_sidecars: return [] @@ -628,21 +632,12 @@ class NewExtractionOrchestrator: pairs: List[Tuple[ExtractionStep, Any]] = [] for sidecar in self.after_statement_sidecars: if sidecar.name == "emotion_extraction": - # Emotion sidecar: wrap as batch coroutine via a sentinel input - # The actual per-statement calls happen inside _run_emotion_batch - pairs.append(( - _EmotionBatchWrapper(sidecar, all_user_stmts), - EmotionStepInput( - statement_id="__batch__", - statement_text="", - speaker="", - ), - )) - else: - # Generic sidecar: pass first statement as representative input - if all_user_stmts: - inp = self._convert_to_emotion_input(all_user_stmts[0]) - pairs.append((sidecar, inp)) + # Skip — emotion is dispatched as async Celery task after Phase 4 + continue + # Generic sidecar: pass first statement as representative input + if all_user_stmts: + inp = self._convert_to_emotion_input(all_user_stmts[0]) + pairs.append((sidecar, inp)) return pairs @@ -675,11 +670,41 @@ class NewExtractionOrchestrator: merged.entity_embeddings = entity.entity_embeddings return merged + # ────────────────────────────────────────────── + # 6.5 异步情绪提取调度 + # 收集 user statement,fire-and-forget 发送 Celery task + # ────────────────────────────────────────────── + + def _collect_emotion_statements( + self, + all_stmt_results: Dict[str, Dict[str, List[StatementStepOutput]]], + ) -> List[Dict[str, str]]: + """Collect user statements for async emotion extraction. + + Returns a list of dicts ready to be sent as Celery task payload. 
+ """ + statements_payload: List[Dict[str, str]] = [] + for _dialog_id, chunk_stmts in all_stmt_results.items(): + for _chunk_id, stmts in chunk_stmts.items(): + for s in stmts: + if s.speaker == "user": + statements_payload.append({ + "statement_id": s.statement_id, + "statement_text": s.statement_text, + "speaker": s.speaker, + }) + return statements_payload + + @property + def emotion_statements(self) -> List[Dict[str, str]]: + """Statements collected for async emotion extraction after last run.""" + return getattr(self, "_emotion_statements", []) + # ────────────────────────────────────────────── # 7. 数据赋值 # 将各阶段 StepOutput 组装为 Statement 对象,替换 chunk.statements # ────────────────────────────────────────────── - + # TODO 乐力齐 函数内容密集较长,需要优化 def _assign_results( self, dialog_data_list: List[DialogData], @@ -815,7 +840,7 @@ class NewExtractionOrchestrator: speaker=stmt_out.speaker, stmt_type=_STMT_TYPE_MAP.get(stmt_out.statement_type, StatementType.FACT), temporal_info=_TEMPORAL_MAP.get(stmt_out.temporal_type, TemporalInfo.ATEMPORAL), - relevence_info=RelevenceInfo.RELEVANT if stmt_out.relevance == "RELEVANT" else RelevenceInfo.IRRELEVANT, + # relevence_info=RelevenceInfo.RELEVANT if stmt_out.relevance == "RELEVANT" else RelevenceInfo.IRRELEVANT, temporal_validity=TemporalValidityRange(valid_at=valid_at, invalid_at=invalid_at), triplet_extraction_info=triplet_info, statement_embedding=stmt_embedding, @@ -836,71 +861,3 @@ class NewExtractionOrchestrator: assigned_chunk_emb, assigned_dialog_emb, ) - - -class _EmotionBatchWrapper(ExtractionStep): - """情绪批量提取包装器。再考虑一下用法,这是子类? 
- - 将单条情绪旁路 Step 包装为批量并发执行,适配 ``_run_with_sidecars`` 接口。 - 编排器传入一个 sentinel input,``run()`` 忽略它,转而对预收集的 statement 列表 - 逐条并发调用内部 Step,返回 ``{statement_id: EmotionStepOutput}`` 字典。 - """ - - # ── 初始化 ── - - def __init__( - self, - inner_step: ExtractionStep, - statements: List[StatementStepOutput], - ) -> None: - # 不调用 super().__init__() — 本类是薄包装,不需要 StepContext - self._inner = inner_step - self._statements = statements - - # ── Step 身份属性(满足 ExtractionStep 抽象接口) ── - - @property - def name(self) -> str: - return self._inner.name - - @property - def is_critical(self) -> bool: - return False - - def get_default_output(self) -> Dict[str, EmotionStepOutput]: - return {} - - # ── 未使用的生命周期方法(批量包装器不走 render→call→parse 流程) ── - - async def render_prompt(self, input_data: Any) -> Any: - raise NotImplementedError - - async def call_llm(self, prompt: Any) -> Any: - raise NotImplementedError - - async def parse_response(self, raw_response: Any, input_data: Any) -> Any: - raise NotImplementedError - - # ── 批量执行入口 ── - - async def run(self, input_data: Any) -> Dict[str, EmotionStepOutput]: - """对所有预收集的 statement 并发执行情绪提取,单条失败返回默认值。""" - if not self._statements: - return {} - - async def _extract_one(stmt: StatementStepOutput) -> Tuple[str, EmotionStepOutput]: - inp = EmotionStepInput( - statement_id=stmt.statement_id, - statement_text=stmt.statement_text, - speaker=stmt.speaker, - ) - try: - result = await self._inner.run(inp) - return stmt.statement_id, result - except Exception: - return stmt.statement_id, self._inner.get_default_output() - - pairs = await asyncio.gather( - *[_extract_one(s) for s in self._statements] - ) - return dict(pairs) diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py b/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py index a4dad6d5..8b0ae643 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py +++ 
b/api/app/core/memory/storage_services/extraction_engine/steps/schema/extraction_step_schema.py @@ -44,10 +44,11 @@ class StatementStepOutput(BaseModel): statement_text: str statement_type: str # FACT / OPINION / PREDICTION / SUGGESTION temporal_type: str # STATIC / DYNAMIC / ATEMPORAL - relevance: str # RELEVANT / IRRELEVANT + # relevance: str # RELEVANT / IRRELEVANT speaker: str # "user" / "assistant" valid_at: str # ISO 8601 or "NULL" invalid_at: str # ISO 8601 or "NULL" + has_unsolved_reference: bool = False # Whether the statement has unresolved references # ── Triplet extraction ── @@ -62,6 +63,7 @@ class TripletStepInput(BaseModel): speaker: str valid_at: str invalid_at: str + has_unsolved_reference: bool = False # From upstream statement extraction class EntityItem(BaseModel): diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/statement_step.py b/api/app/core/memory/storage_services/extraction_engine/steps/statement_step.py index f0af11f6..a0c76b68 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/statement_step.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/statement_step.py @@ -9,7 +9,7 @@ import logging import uuid from typing import Any, List -from pydantic import BaseModel, Field, field_validator +from pydantic import AliasChoices, BaseModel, Field, field_validator from app.core.memory.utils.data.ontology import LABEL_DEFINITIONS from app.core.memory.utils.prompt.prompt_utils import render_statement_extraction_prompt @@ -26,12 +26,17 @@ logger = logging.getLogger(__name__) class _ExtractedStatement(BaseModel): """Raw statement returned by the LLM (before enrichment).""" - statement: str = Field(..., description="The extracted statement text") - statement_type: str = Field(..., description="FACT / OPINION / SUGGESTION / PREDICTION") + statement: str = Field( + ..., + validation_alias=AliasChoices("statement", "statement_text"), + description="The extracted statement text", + ) + 
statement_type: str = Field(..., description="FACT / OPINION / OTHER") temporal_type: str = Field(..., description="STATIC / DYNAMIC / ATEMPORAL") - relevance: str = Field("RELEVANT", description="RELEVANT / IRRELEVANT") + # relevance: str = Field("RELEVANT", description="RELEVANT / IRRELEVANT") valid_at: str = Field("NULL", description="ISO 8601 or NULL") invalid_at: str = Field("NULL", description="ISO 8601 or NULL") + has_unsolved_reference: bool = Field(False, description="Whether the statement has unresolved references") class _StatementExtractionResponse(BaseModel): @@ -44,7 +49,12 @@ class _StatementExtractionResponse(BaseModel): def filter_empty(cls, v: Any) -> Any: """Drop empty / malformed dicts that the LLM occasionally produces.""" if isinstance(v, list): - return [s for s in v if isinstance(s, dict) and s.get("statement")] + return [ + s + for s in v + if isinstance(s, dict) + and (s.get("statement") or s.get("statement_text")) + ] return v @@ -89,6 +99,19 @@ class StatementExtractionStep(ExtractionStep[StatementStepInput, List[StatementS f"{m.role}: {m.msg}" for m in input_data.supporting_context.msgs ) + input_json = { + "chunk_id": input_data.chunk_id, + "end_user_id": input_data.end_user_id, + "target_content": input_data.target_content, + "target_message_date": input_data.target_message_date, + "supporting_context": { + "msgs": [ + {"role": m.role, "msg": m.msg} + for m in input_data.supporting_context.msgs + ] + }, + } + return await render_statement_extraction_prompt( chunk_content=input_data.target_content, definitions=self.definitions, @@ -98,6 +121,7 @@ class StatementExtractionStep(ExtractionStep[StatementStepInput, List[StatementS dialogue_content=dialogue_content, max_dialogue_chars=self.max_dialogue_context_chars, language=self.language, + input_json=input_json, ) async def call_llm(self, prompt: Any) -> Any: @@ -129,10 +153,11 @@ class StatementExtractionStep(ExtractionStep[StatementStepInput, List[StatementS 
statement_text=stmt.statement, statement_type=stmt.statement_type.strip().upper(), temporal_type=stmt.temporal_type.strip().upper(), - relevance=stmt.relevance.strip().upper(), + # relevance=stmt.relevance.strip().upper(), speaker="user", # default; orchestrator overrides from chunk metadata valid_at=stmt.valid_at or "NULL", invalid_at=stmt.invalid_at or "NULL", + has_unsolved_reference=getattr(stmt, "has_unsolved_reference", False), ) ) return results diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py b/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py index f8319114..af143a62 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/triplet_step.py @@ -8,7 +8,7 @@ import logging from typing import Any from app.core.memory.models.triplet_models import TripletExtractionResponse -from app.core.memory.utils.data.ontology import PREDICATE_DEFINITIONS, Predicate +from app.core.memory.utils.data.ontology import PREDICATE_DEFINITIONS from app.core.memory.utils.prompt.prompt_utils import render_triplet_extraction_prompt from .base import ExtractionStep, StepContext @@ -37,7 +37,6 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) self.ontology_types = ontology_types self.predicate_instructions = PREDICATE_DEFINITIONS self.json_schema = TripletExtractionResponse.model_json_schema() - self._allowed_predicates = {p.value for p in Predicate} # ── Identity ── @@ -57,6 +56,23 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) f"{m.role}: {m.msg}" for m in input_data.supporting_context.msgs ) if input_data.supporting_context.msgs else "" + input_json = { + "statement_id": input_data.statement_id, + "statement_text": input_data.statement_text, + "statement_type": input_data.statement_type, + "temporal_type": input_data.temporal_type, + 
"supporting_context": { + "msgs": [ + {"role": m.role, "msg": m.msg} + for m in input_data.supporting_context.msgs + ] + }, + "speaker": input_data.speaker, + "valid_at": input_data.valid_at, + "invalid_at": input_data.invalid_at, + "has_unsolved_reference": input_data.has_unsolved_reference, + } + return await render_triplet_extraction_prompt( statement=input_data.statement_text, chunk_content=chunk_content, @@ -65,6 +81,8 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) language=self.language, ontology_types=self.ontology_types, speaker=input_data.speaker, + input_json=input_json, + has_unsolved_reference=input_data.has_unsolved_reference, ) async def call_llm(self, prompt: Any) -> Any: @@ -88,8 +106,8 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) if not hasattr(raw_response, "triplets"): return self.get_default_output() - # Filter triplets to allowed predicates from ontology whitelist - filtered_triplets = [ + # Keep raw triplets from LLM output (no predicate whitelist filtering). 
+ parsed_triplets = [ TripletItem( subject_name=t.subject_name, subject_id=t.subject_id, @@ -98,7 +116,6 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) object_id=t.object_id, ) for t in raw_response.triplets - if getattr(t, "predicate", "") in self._allowed_predicates ] entities = [ @@ -112,7 +129,7 @@ class TripletExtractionStep(ExtractionStep[TripletStepInput, TripletStepOutput]) for e in (raw_response.entities or []) ] - return TripletStepOutput(entities=entities, triplets=filtered_triplets) + return TripletStepOutput(entities=entities, triplets=parsed_triplets) def get_default_output(self) -> TripletStepOutput: return TripletStepOutput(entities=[], triplets=[]) diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py index fed43ac5..bcc11c0d 100644 --- a/api/app/core/memory/utils/prompt/prompt_utils.py +++ b/api/app/core/memory/utils/prompt/prompt_utils.py @@ -1,3 +1,4 @@ +import json import os from jinja2 import Environment, FileSystemLoader from app.core.memory.models.ontology_extraction_models import OntologyTypeList @@ -46,6 +47,7 @@ async def render_statement_extraction_prompt( dialogue_content: str | None = None, max_dialogue_chars: int | None = None, language: str = "zh", + input_json: dict | None = None, ) -> str: """ Renders the statement extraction prompt using the extract_statement.jinja2 template. 
@@ -77,6 +79,7 @@ async def render_statement_extraction_prompt( rendered_prompt = template.render( inputs={"chunk": chunk_content}, + input_json=json.dumps(input_json, ensure_ascii=False) if input_json else "{}", definitions=definitions, json_schema=json_schema, granularity=granularity, @@ -207,6 +210,8 @@ async def render_triplet_extraction_prompt( language: str = "zh", ontology_types: "OntologyTypeList | None" = None, speaker: str = None, + input_json: dict = None, + has_unsolved_reference: bool = False, ) -> str: """ Renders the triplet extraction prompt using the extract_triplet.jinja2 template. @@ -219,10 +224,14 @@ async def render_triplet_extraction_prompt( language: The language to use for entity descriptions ("zh" for Chinese, "en" for English) ontology_types: Optional OntologyTypeList containing predefined ontology types for entity classification speaker: Speaker role ("user" or "assistant") for the current statement + input_json: Full input JSON for the template + has_unsolved_reference: Whether the statement has unresolved references Returns: Rendered prompt content as string """ + import json + template = prompt_env.get_template("extract_triplet.jinja2") # 准备本体类型数据 @@ -234,8 +243,13 @@ async def render_triplet_extraction_prompt( ontology_type_names = ontology_types.get_type_names() type_hierarchy_hints = ontology_types.get_type_hierarchy_hints() + # 准备 input_json 如果没有提供 + if input_json is None: + input_json = {} + rendered_prompt = template.render( statement=statement, + statement_text=statement, # 兼容模板中的 statement_text 变量 chunk_content=chunk_content, json_schema=json_schema, predicate_instructions=predicate_instructions, @@ -244,6 +258,8 @@ async def render_triplet_extraction_prompt( ontology_type_names=ontology_type_names, type_hierarchy_hints=type_hierarchy_hints, speaker=speaker, + input_json=json.dumps(input_json, ensure_ascii=False) if input_json else "{}", + has_unsolved_reference=has_unsolved_reference, ) # 记录渲染结果到提示日志(与示例日志结构一致) 
log_prompt_rendering('triplet extraction', rendered_prompt) diff --git a/api/app/core/memory/utils/prompt/prompts/entity_dedup.jinja2 b/api/app/core/memory/utils/prompt/prompts/entity_dedup.jinja2 index 984c2952..c366ffa6 100644 --- a/api/app/core/memory/utils/prompt/prompts/entity_dedup.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/entity_dedup.jinja2 @@ -12,24 +12,27 @@ Mode: {{ 'Disambiguation Mode' if disambiguation_mode else 'Deduplication Mode' ===Input=== {% if language == "zh" %} 实体A: + - 名称: "{{ entity_a.name | default('') }}" - 类型: "{{ entity_a.entity_type | default('') }}" - 描述: "{{ entity_a.description | default('') }}" - 别名: {{ entity_a.aliases | default([]) }} -{# TODO: fact_summary 功能暂时禁用,待后续开发完善后启用 #} -{# - 摘要: "{{ entity_a.fact_summary | default('') }}" #} + {# TODO: fact_summary 功能暂时禁用,待后续开发完善后启用 #} + {# - 摘要: "{{ entity_a.fact_summary | default('') }}" #} - 连接强弱: "{{ entity_a.connect_strength | default('') }}" 实体B: + - 名称: "{{ entity_b.name | default('') }}" - 类型: "{{ entity_b.entity_type | default('') }}" - 描述: "{{ entity_b.description | default('') }}" - 别名: {{ entity_b.aliases | default([]) }} -{# TODO: fact_summary 功能暂时禁用,待后续开发完善后启用 #} -{# - 摘要: "{{ entity_b.fact_summary | default('') }}" #} + {# TODO: fact_summary 功能暂时禁用,待后续开发完善后启用 #} + {# - 摘要: "{{ entity_b.fact_summary | default('') }}" #} - 连接强弱: "{{ entity_b.connect_strength | default('') }}" 上下文: + - 同组: {{ same_group | default(false) }} - 类型一致或未知类型: {{ type_ok | default(false) }} - 类型相似度(0-1): {{ type_similarity | default(0.0) }} @@ -38,29 +41,31 @@ Mode: {{ 'Disambiguation Mode' if disambiguation_mode else 'Deduplication Mode' - 名称包含关系: {{ name_contains | default(false) }} - 上下文同源(同一语句指向两者): {{ co_occurrence | default(false) }} - 两者相关的关系陈述(来自实体-实体边): -{% for s in relation_statements %} + {% for s in relation_statements %} - {{ s }} -{% endfor %} -{% else %} -Entity A: + {% endfor %} + {% else %} + Entity A: - Name: "{{ entity_a.name | default('') }}" - Type: "{{ 
entity_a.entity_type | default('') }}" - Description: "{{ entity_a.description | default('') }}" - Aliases: {{ entity_a.aliases | default([]) }} -{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #} -{# - Summary: "{{ entity_a.fact_summary | default('') }}" #} + {# TODO: fact_summary feature temporarily disabled, to be enabled after future development #} + {# - Summary: "{{ entity_a.fact_summary | default('') }}" #} - Connection Strength: "{{ entity_a.connect_strength | default('') }}" Entity B: + - Name: "{{ entity_b.name | default('') }}" - Type: "{{ entity_b.entity_type | default('') }}" - Description: "{{ entity_b.description | default('') }}" - Aliases: {{ entity_b.aliases | default([]) }} -{# TODO: fact_summary feature temporarily disabled, to be enabled after future development #} -{# - Summary: "{{ entity_b.fact_summary | default('') }}" #} + {# TODO: fact_summary feature temporarily disabled, to be enabled after future development #} + {# - Summary: "{{ entity_b.fact_summary | default('') }}" #} - Connection Strength: "{{ entity_b.connect_strength | default('') }}" Context: + - Same Group: {{ same_group | default(false) }} - Type Consistent or Unknown: {{ type_ok | default(false) }} - Type Similarity (0-1): {{ type_similarity | default(0.0) }} @@ -69,14 +74,15 @@ Context: - Name Contains Relationship: {{ name_contains | default(false) }} - Context Co-occurrence (same statement refers to both): {{ co_occurrence | default(false) }} - Related Relationship Statements (from entity-entity edges): -{% for s in relation_statements %} + {% for s in relation_statements %} - {{ s }} -{% endfor %} -{% endif %} + {% endfor %} + {% endif %} ===Guidelines=== {% if language == "zh" %} {% if disambiguation_mode %} + - 这是"同名但类型不同"的消歧场景。请判断两者是否指向同一真实世界实体。 - 综合名称文本/向量相似度、别名、描述、摘要与上下文关系(同源与关系陈述)进行判断。 - **别名处理(高优先级)**: @@ -93,7 +99,7 @@ Context: * 建议类型必须与上下文和实体描述一致 - 
规范实体优先级:连接强度(strong/both)更高者;其余相同则保留描述/摘要更丰富者;再相同时保留实体A(canonical_idx=0)。 - **注意**:别名(aliases)已在三元组提取阶段获取,合并时会自动整合,无需在此阶段提取。 -{% else %} + {% else %} - 若实体类型相同或任一为UNKNOWN/空,可放行作为候选;若类型明显冲突(如人 vs 物品),除非别名与描述高度一致,否则判定不同实体。 - **别名匹配优先(最高优先级)**: * 如果实体A的名称与实体B的某个别名完全匹配,应视为高置信度匹配 @@ -107,9 +113,9 @@ Context: - 若需要合并,选择"保留的规范实体"(canonical_idx)为更合适的一个: - 优先保留连接强度更强(strong/both)者;其余相同则保留描述/摘要更丰富者;再相同时保留实体A(canonical_idx=0)。 - **注意**:别名(aliases)已在三元组提取阶段获取,合并时会自动整合,无需在此阶段提取。 -{% endif %} -{% else %} -{% if disambiguation_mode %} + {% endif %} + {% else %} + {% if disambiguation_mode %} - This is a disambiguation scenario for "same name but different types". Please determine whether they refer to the same real-world entity. - Make judgments based on name text/vector similarity, aliases, descriptions, summaries, and contextual relationships (co-occurrence and relationship statements). - **Alias Handling (High Priority)**: @@ -126,7 +132,7 @@ Context: * Suggested type must be consistent with context and entity description - Canonical entity priority: higher connection strength (strong/both); if equal, retain the one with richer description/summary; if still equal, retain Entity A (canonical_idx=0). - **Note**: Aliases are already obtained during triplet extraction and will be automatically integrated during merging; no need to extract at this stage. -{% else %} + {% else %} - If entity types are the same or either is UNKNOWN/empty, can proceed as candidates; if types clearly conflict (e.g., person vs. item), unless aliases and descriptions are highly consistent, determine as different entities. 
- **Alias Matching Priority (Highest Priority)**: * If Entity A's name exactly matches any of Entity B's aliases, it should be considered a high-confidence match @@ -140,8 +146,8 @@ Context: - If merging is needed, select the "canonical entity to retain" (canonical_idx) as the more appropriate one: - Prioritize retaining the one with stronger connection strength (strong/both); if equal, retain the one with richer description/summary; if still equal, retain Entity A (canonical_idx=0). - **Note**: Aliases are already obtained during triplet extraction and will be automatically integrated during merging; no need to extract at this stage. -{% endif %} -{% endif %} + {% endif %} + {% endif %} **Output format** {% if language == "zh" %} @@ -157,64 +163,69 @@ Context: } **字段说明**: + - should_merge: 是否应该合并这两个实体(true/false) - canonical_idx: 规范实体的索引,0表示实体A,1表示实体B - confidence: 决策的置信度,范围0.0-1.0 - block_pair: 是否阻断该对在其他模糊/启发式合并中出现(true/false) - suggested_type: 建议的统一类型(字符串或null) - reason: 决策理由的简短说明 -{% else %} -返回JSON格式,必须包含以下字段: -{ + {% else %} + 返回JSON格式,必须包含以下字段: + { "same_entity": boolean, "canonical_idx": 0 or 1, "confidence": float (0.0-1.0), "reason": "string" -} + } **字段说明**: + - same_entity: 两个实体是否指向同一真实世界实体(true/false) - canonical_idx: 规范实体的索引,0表示实体A,1表示实体B - confidence: 决策的置信度,范围0.0-1.0 - reason: 决策理由的简短说明 -{% endif %} -{% else %} -{% if disambiguation_mode %} -Return JSON format with the following required fields: -{ + {% endif %} + {% else %} + {% if disambiguation_mode %} + Return JSON format with the following required fields: + { "should_merge": boolean, "canonical_idx": 0 or 1, "confidence": float (0.0-1.0), "block_pair": boolean, "suggested_type": "string or null", "reason": "string" -} + } **Field Descriptions**: + - should_merge: Whether these two entities should be merged (true/false) - canonical_idx: Index of the canonical entity, 0 for Entity A, 1 for Entity B - confidence: Confidence level of the decision, range 0.0-1.0 - block_pair: Whether to block this 
pair in other fuzzy/heuristic merges (true/false) - suggested_type: Suggested unified type (string or null) - reason: Brief explanation of the decision -{% else %} -Return JSON format with the following required fields: -{ + {% else %} + Return JSON format with the following required fields: + { "same_entity": boolean, "canonical_idx": 0 or 1, "confidence": float (0.0-1.0), "reason": "string" -} + } **Field Descriptions**: + - same_entity: Whether the two entities refer to the same real-world entity (true/false) - canonical_idx: Index of the canonical entity, 0 for Entity A, 1 for Entity B - confidence: Confidence level of the decision, range 0.0-1.0 - reason: Brief explanation of the decision -{% endif %} -{% endif %} + {% endif %} + {% endif %} **CRITICAL JSON FORMATTING REQUIREMENTS:** + 1. Use only standard ASCII double quotes (") for JSON structure - never use Chinese quotation marks ("") or other Unicode quotes 2. Ensure all JSON strings are properly closed and comma-separated 3. Do not include line breaks within JSON string values @@ -225,4 +236,4 @@ Return JSON format with the following required fields: {% else %} The output language should always be the same as the input language. 
{% endif %} -{{ json_schema }} +{{ json_schema }} \ No newline at end of file diff --git a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 index 611bd6df..9be6f19b 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 @@ -1,315 +1,505 @@ -{% macro tidy(name) -%} - {{ name.replace('_', ' ')}} +{% macro render_input() -%} +{{ input_json }} {%- endmacro %} - ===Tasks=== {% if language == "zh" %} -你的任务是根据详细的提取指南,从提供的对话片段中识别和提取陈述句。 -每个陈述句必须按照下面提到的标准进行标记。 +你的任务是从提供的目标文本中识别并提取陈述句,并为每条陈述句标注以下信息: + +- statement_id +- statement_text +- statement_type +- temporal_type +- has_unsolved_reference +- valid_at +- invalid_at + +每条输出都应是一个结构化的记忆候选陈述句。 {% else %} -Your task is to identify and extract declarative statements from the provided conversational chunk based on the detailed extraction guidelines. -Each statement must be labeled as per the criteria mentioned below. +Your task is to identify and extract declarative statements from the provided target text, and annotate each extracted statement with: + +- statement_id +- statement_text +- statement_type +- temporal_type +- has_unsolved_reference +- valid_at +- invalid_at + +Each output item should be a structured candidate memory statement. 
{% endif %} ===Inputs=== -{% if inputs %} -{% for key, val in inputs.items() %} -- {{ key }}: {{val}} -{% endfor %} -{% endif %} - - -===Extraction Instructions=== {% if language == "zh" %} -{% if granularity %} -{% if granularity == 3 %} -原子化和清晰:构建陈述句以清楚地显示单一的主谓宾关系。最好有多个较小的陈述句,而不是一个复杂的陈述句。 -上下文独立:陈述句必须在不需要阅读整个对话的情况下可以理解。 -{% elif granularity == 2 %} -在句子级别提取陈述句。每个陈述句应对应一个单一、完整的思想(通常是来源中的一个完整句子),但要重新表述以获得最大的清晰度,删除对话填充词(例如,"嗯"、"像"、感叹词)。 -{% elif granularity == 1 %} -仅提取精华句子,并将片段总结为多个独立的陈述句,每个陈述句关注事实陈述、用户偏好、关系和显著的时间上下文。 -{% endif %} -{% endif %} -上下文解析要求: -- 将指示代词("那个"、"这个"、"那些"、"这些")解析为其具体指代对象 -- 如果陈述句包含无法从对话上下文中解析的模糊引用,则: - a) 扩展陈述句以包含对话早期的缺失上下文 - b) 标记陈述句为需要额外上下文 - c) 如果陈述句在没有上下文的情况下变得无意义,则跳过提取 +- chunk_id: chunk 唯一 ID +- end_user_id: 终端用户 ID +- target_content: 当前要处理的对话片段文本,也是唯一允许被抽取的目标文本 +- target_message_date: 目标文本对应的时间,用于解析相对时间表达 +- supporting_context: 完整对话上下文,仅用于辅助理解 target_content,不能单独贡献新的可抽取事实 +- supporting_context.msgs: 按顺序提供的上下文消息,可包含 User 和 Assistant + {% else %} +- chunk_id: unique chunk identifier +- end_user_id: end-user identifier +- target_content: the current dialogue fragment to process, and the only text span that may be extracted from +- target_message_date: the reference time for the target content, used for resolving relative temporal expressions +- supporting_context: full dialogue context used only to help interpret target_content and must not independently contribute new extractable facts +- supporting_context.msgs: ordered contextual messages, which may include User and Assistant messages + {% endif %} -对话上下文和共指消解: -- 将每个陈述句归属于说出它的参与者。 -- **对于用户的发言:必须使用"用户"作为主语**,禁止将"用户"或"我"替换为用户的真实姓名或别名。例如,用户说"我叫张三"应提取为"用户叫张三",而不是"张三叫张三"。 -- 对于 AI 助手的发言:使用"助手"或"AI助手"作为主语。 -- 将所有代词解析为对话上下文中的具体人物或实体,但"我"必须解析为"用户"。 -- 识别并将抽象引用解析为其具体名称(如果提到)。 -- 将缩写和首字母缩略词扩展为其完整形式。 -{% else %} -{% if granularity %} -{% if granularity == 3 %} -Atomic & Clear: Structure statements to clearly show a single subject-predicate-object relationship. 
It is better to have multiple smaller statements than one complex one. -Context-Independent: Statements must be understandable without needing to read the entire conversation. -{% elif granularity == 2 %} -Extract statements at the sentence level. Each statement should correspond to a single, complete thought (typically a full sentence from the source) but be rephrased for maximum clarity, removing conversational filler (e.g., 'um,' 'like,' interjections). -{% elif granularity == 1 %} -Extract only essence sentences and summarize the chunk into multiple, standalone statements, each focusing on factual statements, user preferences, relationships, and salient temporal context. -{% endif %} -{% endif %} - -Context Resolution Requirements: -- Resolve demonstrative pronouns ("that," "this," "those") to their specific referents -- If a statement contains vague references that cannot be resolved from the conversation context, either: - a) Expand the statement to include the missing context from earlier in the conversation - b) Mark the statement as requiring additional context - c) Skip extraction if the statement becomes meaningless without context - -Conversational Context & Co-reference Resolution: -- Attribute every statement to the participant who uttered it. -- **For user's statements: always use "用户" (User) as the subject**. Do NOT replace "用户" or "I" with the user's real name or alias. For example, if the user says "I'm John", extract as "用户 is John", not "John is John". -- For AI assistant's statements: use "助手" or "AI助手" as the subject. -- Resolve all pronouns to the specific person or entity from the conversation's context, but "I"/"我" must always resolve to "用户". -- Identify and resolve abstract references to their specific names if mentioned. -- Expand abbreviations and acronyms to their full form. 
-{% endif %} - -{% if include_dialogue_context %} +===Scope=== {% if language == "zh" %} -===完整对话上下文=== -以下是完整的对话上下文,以帮助您理解引用、代词和对话流程: -{% else %} -===Full Dialogue Context=== -The following is the complete dialogue context to help you understand references, pronouns, and conversational flow: -{% endif %} -{{ dialogue_context }} +- 只从 `target_content` 中提取陈述句。 +- `supporting_context.msgs` 只用于解释 `target_content` 中的代词、省略、主体、时间和语义背景。 +- 不要从 `supporting_context.msgs` 中单独提取任何陈述句。 +- 如果某条信息没有出现在 `target_content` 中,即使它出现在 `supporting_context.msgs` 中,也不能把它作为独立 statement 输出。 +- 如果 Assistant 在 `supporting_context.msgs` 中提供了总结、猜测、解释或改写,这些内容只能作为理解辅助,不能被当作事实直接提取。 +- 每条输出的 statement 都必须能够在 `target_content` 中找到直接对应的表达依据。 + {% else %} +- Extract statements only from `target_content`. +- `supporting_context.msgs` is used only to interpret references, ellipsis, subjects, temporal expressions, and semantic background in `target_content`. +- Do not extract any standalone statement from `supporting_context.msgs`. +- If a piece of information does not appear in `target_content`, it must not be output as an independent statement even if it appears in `supporting_context.msgs`. +- If the Assistant in `supporting_context.msgs` provides a summary, guess, interpretation, or rephrasing, treat it only as interpretive support and never as a direct factual source for extraction. +- Every output statement must be directly grounded in wording from `target_content`. 
+ {% endif %} +===Extraction Rules=== {% if language == "zh" %} -===对话上下文结束=== -{% else %} -===End of Dialogue Context=== -{% endif %} -{% endif %} +拆分规则: -{% if language == "zh" %} -过滤和格式化: +- 以“一个完整意思”为单位提取陈述句,通常对应一个完整句子或一个自然语义片段。 +- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清楚的重要信息时,才拆成多条。 +- 宁可多提取,也不要漏掉 `target_content` 中能独立成立、且语义稳定的 statement。 +- 但不要为了提高覆盖率而引入原文没有的信息,或输出语义不成立的 statement。 + +用户主语归一化: + +- 如果陈述句的主语是用户本人,无论上下文中给出的用户名称、昵称、别名或真实姓名是什么,提取后的陈述句统一使用“用户”作为主语,不要使用用户的具体名字或别名。 +- 这是硬规则;如果用户主语没有统一成“用户”,则该 statement 视为不合格。 + +共指消解: + +- 如果可以解析到具体实体名,优先输出具体实体名,并将 `has_unsolved_reference` 设为 `false`。 +- 如果不能解析到具体实体名,但可以解析到最小必要描述,则输出该最小必要描述,并将 `has_unsolved_reference` 设为 `true`。 +- 如果既不能解析到具体实体名,也不能稳定解析到最小必要描述,则保留最小必要原始表达,并将 `has_unsolved_reference` 设为 `true`。 +- 对涉及用户与其他人的共同活动,优先写成“用户和谁……”的形式,而不是保留“我们”“他们”这类未展开表达。 + +清晰指代与模糊指代: + +- 只有当当前 `supporting_context` 足以将引用稳定映射到具体实体名时,才算 fully resolved。 +- `张三`、`老张`(且上下文中明确就是张三)、`李教授`、`王老师` 属于清晰指代。 +- `用户的朋友`、`用户的同事`、`某位老师`、`一位面试官` 这类最小必要描述允许输出,但仍然算 unresolved。 +- `朋友`、`前天那个人`、`那个`、`这个`、`那些`、`那两个`、`对方`、`他/她`(且无唯一可解对象)属于模糊指代。 + +过滤: - 仅提取陈述句。 - 不要提取问题、命令、问候语或对话填充词。 -时间精度: +- 不要提取问题、命令、问候语或对话填充词。 -包括任何明确的日期、时间或定量限定符。 -如果一个句子既描述了事件的开始(静态)又描述了其持续性质(动态),则将两者提取为单独的陈述句。 -{% else %} -Filtering and Formatting: +statement_type: + +- `FACT`:用户陈述的事实、状态、关系、经历、行为、事件或计划等现实描述。 +- `OPINION`:主观评价、态度、判断、感受、看法,例如“我觉得”“我担心”。 +- `OTHER`:不应归入 `FACT` 或 `OPINION` 的其他陈述;“我希望……”默认标为 `OTHER`。 +- 不要因为句子带有主观色彩就自动判为 `OPINION`;只有在其核心是个人判断、态度、感受或评价时才标为 `OPINION`。 + +时间规则: + +- 仅使用目标文本中明确陈述或可由 `target_message_date` 直接解析的时间信息;不要使用外部知识补时间。 +- 使用 `target_message_date` 作为“现在”来解释相对时间,例如“昨天”“上周五”“下个月”。 +- `valid_at` 表示陈述开始成立或生效的时间。 +- `invalid_at` 表示陈述结束或不再成立的时间;如果仍在持续,填 `"NULL"`。 +- 时间格式优先使用 ISO 8601。 +- 对于只有日期没有时分秒的时间,默认使用整天边界,便于后续检索。 +- 如果没有明确时间,不要编造时间。 +- 对于点状事件(例如某天发生的一次考试、一次见面、一次提交),`valid_at` 和 `invalid_at` 都应填写为该事件的起止边界;不要只填 `valid_at`。 + +temporal_type: + +- `STATIC`:相对稳定、持续性的状态、身份、属性、长期偏好、长期关系、长期职业或长期居住状态;若带起始时间,可填 `valid_at`,`invalid_at` 必须为 `"NULL"`。 
+- `DYNAMIC`:有明确时间范围、阶段性持续、可结束或已结束的事件、活动、计划、任务或临时状态。 +- `ATEMPORAL`:普遍事实、定义、常识、百科知识、数学事实或无具体时间边界的泛化陈述;`valid_at` 和 `invalid_at` 都必须为 `"NULL"`。 +- 不要因为句子里出现时间词就机械地标为 `DYNAMIC`。 + +改写边界: + +- 允许为解决代词、省略和时间歧义做最小必要改写。 +- 不要引入原文未明确表达的新事实、额外推断或风格化概括。 + {% else %} + Granularity: +- Extract statements at the level of one complete thought, usually one full sentence or one natural semantic unit. +- Preserve sentence-level structure by default; split only when a sentence contains two or more independent and important pieces of information that become clearly easier to understand when separated. +- Prefer higher recall: do not miss independently valid and semantically stable statements in `target_content`. +- But do not increase recall by inventing unsupported facts or emitting semantically unstable statements. + +User-subject normalization: + +- If the subject of a statement is the user, always use “the user” as the subject in the extracted statement, regardless of whether the context provides the user’s real name, nickname, alias, or other identifier. +- This is a hard rule. If a user-subject statement does not use “the user,” treat it as invalid. + +Coreference resolution: + +- If you can resolve to a concrete named entity, output that name and set `has_unsolved_reference` to `false`. +- If you cannot resolve to a concrete named entity but can resolve to a minimal grounded description, output that description and set `has_unsolved_reference` to `true`. +- If you cannot even resolve to a stable minimal grounded description, keep the minimal original expression and set `has_unsolved_reference` to `true`. +- For shared activities involving the user and others, prefer forms like “the user and X...” rather than unresolved expressions like “we” or “they”. + +Clear vs unresolved reference: + +- A reference is fully resolved only if the current `supporting_context` can map it to a concrete named entity. 
+- `Zhang San`, `Old Zhang` when clearly resolved to Zhang San, `Professor Li`, and `Teacher Wang` are clear references. +- `the user's friend`, `the user's coworker`, `a teacher`, and `an interviewer` are allowed outputs but still count as unresolved. +- `friend`, `that person from the other day`, `that one`, `this one`, `those`, `the two of them`, `the other party`, and `he/she` without a unique referent are unresolved. + +Filtering: - Extract only declarative statements. - DO NOT extract questions, commands, greetings, or conversational filler. -Temporal Precision: +- Do not extract questions, commands, greetings, or conversational filler. -Include any explicit dates, times, or quantitative qualifiers. -If a sentence describes both the start of an event (static) and its ongoing nature (dynamic), extract both as separate statements. -{% endif %} +statement_type: -{%- if definitions %} - {%- for section_key, section_dict in definitions.items() %} -==== {{ tidy(section_key) | upper }} {% if language == "zh" %}定义和指导{% else %}DEFINITIONS & GUIDANCE{% endif %} ==== - {%- for category, details in section_dict.items() %} -{{ loop.index }}. {{ category }} -- {% if language == "zh" %}定义{% else %}Definition{% endif %}: {{ details.get("definition", "") }} - {% endfor -%} - {% endfor -%} -{% endif -%} +- `FACT`: user-stated facts, states, relationships, experiences, behaviors, events, or plans. +- `OPINION`: subjective judgments, attitudes, feelings, evaluations, or viewpoints, such as “I think...” or “I worry...”. +- `OTHER`: statements that should not be categorized as `FACT` or `OPINION`; statements like “I hope...” default to `OTHER`. +- Do not classify a statement as `OPINION` merely because it sounds subjective; use `OPINION` only when its core content is a personal judgment, attitude, feeling, or evaluation. 
+ +Temporal rules: + +- Use only temporal information explicitly stated in the target text or directly resolvable from `target_message_date`; do not add dates from external knowledge. +- Use `target_message_date` as “now” when interpreting relative expressions such as “yesterday,” “last Friday,” or “next month.” +- `valid_at` means when the statement became valid or started to hold. +- `invalid_at` means when the statement ended or stopped being valid; use `"NULL"` if it is still ongoing. +- Prefer ISO 8601 for time values. +- When only a date can be resolved, default to full-day boundaries for retrieval use. +- If no explicit time is available, do not invent one. +- For point-in-time events such as a single exam, a meeting, or a submission on one day, populate both `valid_at` and `invalid_at`; do not fill only `valid_at`. + +temporal_type: + +- `STATIC`: relatively stable, ongoing states, identities, attributes, long-term preferences, long-term relationships, occupations, or residence states; if a start time is given, `valid_at` may be filled, but `invalid_at` must be `"NULL"`. +- `DYNAMIC`: events, activities, plans, tasks, or temporary states with a bounded or potentially bounded time span. +- `ATEMPORAL`: general facts, definitions, common knowledge, encyclopedic knowledge, mathematical facts, or generalized statements without meaningful temporal boundaries; both `valid_at` and `invalid_at` must be `"NULL"`. +- Do not classify a statement as `DYNAMIC` merely because it contains a time word. + +Rewrite boundary: + +- Minimal rewriting is allowed only to resolve reference, ellipsis, and temporal ambiguity. +- Do not introduce unsupported facts, extra inference, or stylistic summarization. + {% endif %} ===Examples=== {% if language == "zh" %} -示例 1: 英文对话 -示例片段: """ -日期: 2024年3月15日 -参与者: -- Sarah Chen (用户) -- 助手 (AI) - -用户: "我最近一直在尝试水彩画,画了一些花朵。" -AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合剂混合而成。你觉得怎么样?" 
-用户: "我认为色彩组合可以改进,但我真的很喜欢玫瑰和百合。" -""" +示例 1: +示例输入: { + "chunk_id": "chunk_a1b2c3d4", + "end_user_id": "eu_12345678", + "target_content": "老李这学期要求还是一如既往地严,不过他讲课确实清晰透彻,而且每节课的结构都特别清楚。就是气场实在太吓人了,我每次被他点名都有点发怵。", + "target_message_date": "2023-09-04T18:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "今天是九月第一周的星期一,上了本学期第一节数据库课。作为班长,我帮李教授发了教学大纲。老李宣布的期末项目考核标准特别严,看了一眼大纲上的作业量,我感觉这学期恐怕要脱层皮。不过老李讲课确实清晰透彻,就是气场实在太吓人了。" + }, + { + "role": "Assistant", + "msg": "听起来你对这门课既佩服又有点压力,李教授应该是很有气场的老师。" + } + ] + } +} 示例输出: { "statements": [ { - "statement": "用户最近一直在尝试水彩画。", - "statement_type": "FACT", - "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" - }, - { - "statement": "用户画了一些花朵。", - "statement_type": "FACT", - "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" - }, - { - "statement": "水彩颜料通常由颜料与阿拉伯树胶等粘合剂混合而成。", - "statement_type": "FACT", - "temporal_type": "ATEMPORAL", - "relevance": "IRRELEVANT" - }, - { - "statement": "用户认为她的水彩画中的色彩组合可以改进。", + "statement_id": "stmt_e5f6g7h8", + "statement_text": "李教授这学期要求很严。", "statement_type": "OPINION", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": false, + "valid_at": "2023-09-04T18:00:00", + "invalid_at": "NULL" }, { - "statement": "用户真的很喜欢玫瑰和百合。", - "statement_type": "FACT", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "statement_id": "stmt_i9j0k1l2", + "statement_text": "李教授讲课清晰透彻。", + "statement_type": "OPINION", + "temporal_type": "ATEMPORAL", + "has_unsolved_reference": false, + "valid_at": "NULL", + "invalid_at": "NULL" + }, + { + "statement_id": "stmt_m1n2o3p4", + "statement_text": "李教授的气场很吓人。", + "statement_type": "OPINION", + "temporal_type": "ATEMPORAL", + "has_unsolved_reference": false, + "valid_at": "NULL", + "invalid_at": "NULL" } ] } -示例 2: 中文对话示例 -示例片段: """ -日期: 2024年3月15日 -参与者: -- 张曼婷 (用户) -- 小助手 (AI助手) - -用户: "我最近在尝试水彩画,画了一些花朵。" -AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。你觉得怎么样?" 
-用户: "我觉得色彩搭配还有提升的空间,不过我很喜欢玫瑰和百合这两种花。" -""" +示例 2: +示例输入: { + "chunk_id": "chunk_b2c3d4e5", + "end_user_id": "eu_12345678", + "target_content": "我最近在学 Python,每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。", + "target_message_date": "2026-04-01T00:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "我最近在学 Python。" + }, + { + "role": "Assistant", + "msg": "Python 是一个很实用的语言。" + } + ] + } +} 示例输出: { "statements": [ { - "statement": "用户最近在尝试水彩画。", + "statement_id": "stmt_m3n4o5p6", + "statement_text": "用户最近在学 Python。", "statement_type": "FACT", "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "用户画了一些花朵。", + "statement_id": "stmt_q7r8s9t0", + "statement_text": "用户最近每天晚上都会练一个小时 Python。", "statement_type": "FACT", "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。", + "statement_id": "stmt_u1v2w3x4", + "statement_text": "用户这周打算先复习 Python 的基础语法和函数部分。", "statement_type": "FACT", - "temporal_type": "ATEMPORAL", - "relevance": "IRRELEVANT" - }, + "temporal_type": "DYNAMIC", + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" + } + ] +} + +示例 3: +示例输入: { + "chunk_id": "chunk_c3d4e5f6", + "end_user_id": "eu_12345678", + "target_content": "这周老师新布置的那两个我觉得有点难,而且我昨晚看了半天还是没太搞明白。要是周末再弄不出来,我可能就得去问助教了。", + "target_message_date": "2026-04-01T00:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "这周老师新布置的那两个我觉得有点难,而且我昨晚看了半天还是没太搞明白。要是周末再弄不出来,我可能就得去问助教了。" + }, + { + "role": "Assistant", + "msg": "听起来你卡在老师这周新布置的两个内容上了,如果周末还没进展,再去问助教也可以。" + } + ] + } +} + +示例输出: { + "statements": [ { - "statement": "用户觉得水彩画的色彩搭配还有提升的空间。", + "statement_id": "stmt_y5z6a7b8", + "statement_text": "用户觉得那两个有点难。", "statement_type": "OPINION", - "temporal_type": "STATIC", - 
"relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "用户很喜欢玫瑰和百合。", + "statement_id": "stmt_c9d0e1f2", + "statement_text": "用户昨晚看了半天那两个还是没太搞明白。", "statement_type": "FACT", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-03-31T00:00:00", + "invalid_at": "2026-03-31T23:59:59" + }, + { + "statement_id": "stmt_g3h4i5j6", + "statement_text": "如果周末还弄不出来,用户可能会去问助教。", + "statement_type": "OTHER", + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" } ] } {% else %} -Example 1: English Conversation -Example Chunk: """ -Date: March 15, 2024 -Participants: -- Sarah Chen (User) -- Assistant (AI) - -User: "I've been trying watercolor painting recently and painted some flowers." -AI: "Watercolor painting is very interesting! Watercolor paints are typically made from pigments mixed with binders like gum arabic. How do you like it?" -User: "I think the color combinations could use some improvement, but I really like roses and lilies." -""" +Example 1: +Example Input: { + "chunk_id": "chunk_a1b2c3d4", + "end_user_id": "eu_12345678", + "target_content": "Old Li is just as strict as ever this semester, but he really explains things clearly and the structure of every class is extremely clear. His presence is honestly kind of intimidating, and I get nervous every time he calls on me.", + "target_message_date": "2023-09-04T18:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "Today was the Monday of the first week of September, and I had the first database class of the semester. As class monitor, I helped Professor Li distribute the syllabus. Professor Li said the grading criteria for the final project would be very strict. 
Old Li is just as strict as ever this semester, but he really explains things clearly and the structure of every class is extremely clear. His presence is honestly kind of intimidating." + }, + { + "role": "Assistant", + "msg": "It sounds like you admire the teaching but also feel pressured by Professor Li." + } + ] + } +} Example Output: { "statements": [ { - "statement": "用户 has been trying watercolor painting recently.", - "statement_type": "FACT", - "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" - }, - { - "statement": "用户 painted some flowers.", - "statement_type": "FACT", - "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" - }, - { - "statement": "Watercolor paints are typically made from pigments mixed with binders like gum arabic.", - "statement_type": "FACT", - "temporal_type": "ATEMPORAL", - "relevance": "IRRELEVANT" - }, - { - "statement": "用户 thinks the color combinations in her watercolor paintings could use some improvement.", + "statement_id": "stmt_e5f6g7h8", + "statement_text": "Professor Li is very strict this semester.", "statement_type": "OPINION", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": false, + "valid_at": "2023-09-04T18:00:00", + "invalid_at": "NULL" }, { - "statement": "用户 really likes roses and lilies.", - "statement_type": "FACT", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "statement_id": "stmt_i9j0k1l2", + "statement_text": "Professor Li explains things clearly.", + "statement_type": "OPINION", + "temporal_type": "ATEMPORAL", + "has_unsolved_reference": false, + "valid_at": "NULL", + "invalid_at": "NULL" + }, + { + "statement_id": "stmt_m1n2o3p4", + "statement_text": "Professor Li's presence is intimidating.", + "statement_type": "OPINION", + "temporal_type": "ATEMPORAL", + "has_unsolved_reference": false, + "valid_at": "NULL", + "invalid_at": "NULL" } ] } -Example 2: Chinese Conversation (中文对话示例) -Example Chunk: """ -日期: 2024年3月15日 -参与者: -- 
张曼婷 (用户) -- 小助手 (AI助手) - -用户: "我最近在尝试水彩画,画了一些花朵。" -AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。你觉得怎么样?" -用户: "我觉得色彩搭配还有提升的空间,不过我很喜欢玫瑰和百合这两种花。" -""" +Example 2: +Example Input: { + "chunk_id": "chunk_b2c3d4e5", + "end_user_id": "eu_12345678", + "target_content": "I've been learning Python recently, and I practice for an hour every night. This week I also plan to review basic syntax and functions first.", + "target_message_date": "2026-04-01T00:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "I've been learning Python recently." + }, + { + "role": "Assistant", + "msg": "Python is a very practical language." + } + ] + } +} Example Output: { "statements": [ { - "statement": "用户最近在尝试水彩画。", + "statement_id": "stmt_m3n4o5p6", + "statement_text": "The user has been learning Python recently.", "statement_type": "FACT", "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "用户画了一些花朵。", + "statement_id": "stmt_q7r8s9t0", + "statement_text": "The user has recently been practicing Python for an hour every night.", "statement_type": "FACT", "temporal_type": "DYNAMIC", - "relevance": "RELEVANT" + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "水彩颜料通常由颜料和阿拉伯树胶等粘合剂混合而成。", + "statement_id": "stmt_u1v2w3x4", + "statement_text": "The user plans to review Python basic syntax and functions first this week.", "statement_type": "FACT", - "temporal_type": "ATEMPORAL", - "relevance": "IRRELEVANT" - }, + "temporal_type": "DYNAMIC", + "has_unsolved_reference": false, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" + } + ] +} + +Example 3: +Example Input: { + "chunk_id": "chunk_c3d4e5f6", + "end_user_id": "eu_12345678", + "target_content": "The two things the teacher assigned this week seem hard to me, and even after looking at them for a long time last night I still didn't really 
understand them. If I still can't finish them by the weekend, I may have to ask the TA.", + "target_message_date": "2026-04-01T00:00:00", + "supporting_context": { + "msgs": [ + { + "role": "User", + "msg": "The two things the teacher assigned this week seem hard to me, and even after looking at them for a long time last night I still didn't really understand them. If I still can't finish them by the weekend, I may have to ask the TA." + }, + { + "role": "Assistant", + "msg": "It sounds like you're stuck on the two things assigned this week, and asking the TA would make sense if there is still no progress by the weekend." + } + ] + } +} + +Example Output: { + "statements": [ { - "statement": "用户觉得水彩画的色彩搭配还有提升的空间。", + "statement_id": "stmt_y5z6a7b8", + "statement_text": "The user thinks those two things are difficult.", "statement_type": "OPINION", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" }, { - "statement": "用户很喜欢玫瑰和百合。", + "statement_id": "stmt_c9d0e1f2", + "statement_text": "The user spent a long time last night looking at those two things but still did not really understand them.", "statement_type": "FACT", - "temporal_type": "STATIC", - "relevance": "RELEVANT" + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-03-31T00:00:00", + "invalid_at": "2026-03-31T23:59:59" + }, + { + "statement_id": "stmt_g3h4i5j6", + "statement_text": "If the user still cannot finish them by the weekend, the user may ask the TA.", + "statement_type": "OTHER", + "temporal_type": "DYNAMIC", + "has_unsolved_reference": true, + "valid_at": "2026-04-01T00:00:00", + "invalid_at": "NULL" } ] } @@ -317,77 +507,68 @@ Example Output: { ===End of Examples=== {% if language == "zh" %} -===反思过程=== +最终输出前检查: -提取陈述句后,执行以下自我审查步骤: - -**步骤 1: 归属检查** -- 确认每个陈述句都正确归属于正确的说话者 -- 验证说话者名称在整个过程中使用一致 -- 检查 AI 助手陈述句是否正确归属 - -**步骤 2: 完整性审查** -- 
确保没有遗漏重要的陈述句 -- 检查时间信息是否保留 - -**步骤 3: 分类验证** -- 审查 statement_type 分类(FACT/OPINION/PREDICTION/SUGGESTION) -- 验证 temporal_type 分配(STATIC/DYNAMIC/ATEMPORAL) -- 确保分类与提供的定义一致 - -**步骤 4: 最终质量检查** -- 删除任何问题、命令或对话填充词 -- 验证 JSON 格式合规性 -- 确认输出语言与输入语言匹配 -{% else %} -===Reflection Process=== - -After extracting statements, perform the following self-review steps: - -**Step 1: Attribution Check** -- Confirm every statement is properly attributed to the correct speaker -- Verify speaker names are used consistently throughout -- Check that AI assistant statements are properly attributed - -**Step 2: Completeness Review** -- Ensure no important declarative statements were missed -- Check that temporal information is preserved - -**Step 3: Classification Validation** -- Review statement_type classifications (FACT/OPINION/PREDICTION/SUGGESTION) -- Verify temporal_type assignments (STATIC/DYNAMIC/ATEMPORAL) -- Ensure classifications align with the provided definitions - -**Step 4: Final Quality Check** -- Remove any questions, commands, or conversational filler -- Verify JSON format compliance -- Confirm output language matches input language -{% endif %} +- 是否只保留 `target_content` 中可直接支持的陈述句 +- 如果主语是用户,是否统一写“用户” +- 非用户主体是否尽量写成具体名称;若无法做到,是否已正确标记 `has_unsolved_reference = true` +- statement_type 是否合法,且没有把一般事实机械标成 `OPINION` +- temporal_type 是否与 valid_at / invalid_at 一致 +- 输出是否严格符合 JSON schema + {% else %} + Final checks before output: +- Keep only statements directly supported by `target_content` +- If the subject is the user, render it as “the user” +- Render non-user subjects as concrete names when possible; otherwise mark `has_unsolved_reference = true` +- Ensure statement_type is valid and do not mechanically label ordinary facts as `OPINION` +- Ensure temporal_type is consistent with valid_at and invalid_at +- Ensure the output strictly matches the JSON schema + {% endif %} **Output format** **CRITICAL JSON FORMATTING REQUIREMENTS:** -1. 
Use only standard ASCII double quotes (") for JSON structure - never use Chinese quotation marks ("") or other Unicode quotes -2. If the extracted statement text contains quotation marks, escape them properly using backslashes (\") -3. Ensure all JSON strings are properly closed and comma-separated -4. Do not include line breaks within JSON string values -5. Example of proper escaping: "statement": "John said: \"I really like this book.\"" + +1. Use only standard ASCII double quotes (") for JSON structure. +2. Escape internal quotation marks inside string values using backslashes (\"). +3. Ensure all JSON strings are properly closed and comma-separated. +4. Do not include line breaks within JSON string values. +5. Return only the JSON object. Do not add explanations before or after it. + +**ISO 8601 HARD CONSTRAINT:** + +- `target_message_date` must be ISO 8601. +- `valid_at` and `invalid_at` must be ISO 8601, or `"NULL"` when no time is available. +- Do not output non-ISO values such as `2026/04/01`, `2026-04-01 00:00:00`, `yesterday evening`, or `下周三`. +- When only a date is known, still output an ISO 8601 datetime boundary. **LANGUAGE REQUIREMENT:** {% if language == "zh" %} -- 输出语言应始终与输入语言匹配 -- 如果输入是中文,则用中文提取陈述句 -- 如果输入是英文,则用英文提取陈述句 -- 保留原始语言,不要翻译 -{% else %} -- The output language should ALWAYS match the input language -- If input is in English, extract statements in English -- If input is in Chinese, extract statements in Chinese -- Preserve the original language and do not translate -{% endif %} -{% if language == "zh" %} -仅返回与以下架构匹配的 JSON 对象数组中提取的标记陈述句列表: -{% else %} -Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below: -{% endif %} -{{ json_schema }} +- 输出语言应始终与输入语言匹配。 +- 如果输入是中文,则用中文提取陈述句。 +- 如果输入是英文,则用英文提取陈述句。 +- 保留原始语言,不要翻译。 + {% else %} +- The output language must always match the input language. +- If the input is in Chinese, extract statements in Chinese. 
+- If the input is in English, extract statements in English. +- Preserve the original language and do not translate. + {% endif %} + +{% if language == "zh" %}现在处理下面这个输入:{% else %}Now process the following input:{% endif %} +{{ render_input() }} + +Return only a JSON object matching the schema below: +{ + "statements": [ + { + "statement_id": "string", + "statement_text": "string", + "statement_type": "FACT | OPINION | OTHER", + "temporal_type": "STATIC | DYNAMIC | ATEMPORAL", + "has_unsolved_reference": "boolean", + "valid_at": "string | NULL", + "invalid_at": "string | NULL" + } + ] +} \ No newline at end of file diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 1a79b482..57c43342 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -1,417 +1,474 @@ -{% macro tidy(name) -%} - {{ name.replace('_', ' ')}} -{%- endmacro %} - ===Task=== Extract entities and knowledge triplets from the given statement. -**⚠️ CRITICAL REQUIREMENTS:** -1. **ALIASES ORDER IS CRITICAL**: The FIRST alias in the array will be used as the user's primary display name (other_name). You MUST put the most important/frequently used name FIRST. -2. **ALWAYS include aliases field**: Even if empty, you MUST include "aliases": [] in EVERY entity. - - - {% if language == "zh" %} -**重要:请使用中文生成实体名称(name)、描述(description)和示例(example)。** -{% else %} -**Important: Please generate entity names, descriptions and examples in English. If the original text is in Chinese, translate entity names to English.** -{% endif %} +重要: + +- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译。 +- 但对用户自指表达,如“我”“我的”“我自己”,统一规范为 `用户`。 +- `description` 使用中文。 +- `type`、`predicate`、`type_description`、`predicate_description` 一律使用中文。 + {% else %} + Important: +- Keep `name`, `subject_name`, and `object_name` in their original surface form from the source text. Do not translate them. 
+- Exception: normalize user self-reference such as "I", "me", and "myself" to `用户`. +- Generate `description` in English. +- Always generate `type`, `predicate`, `type_description`, and `predicate_description` in Chinese. + {% endif %} ===Inputs=== -**Chunk Content:** "{{ chunk_content }}" -**Statement:** "{{ statement }}" +{% if language == "zh" %} +输入 JSON 包含以下字段: + +- `statement_id`: 陈述句唯一 ID +- `statement_text`: 陈述句文本 +- `statement_type`: 上游提供的陈述类别,例如 `FACT` / `OPINION` / `OTHER` +- `temporal_type`: 上游提供的时间类别,例如 `STATIC` / `DYNAMIC` / `ATEMPORAL` +- `supporting_context`: 原始对话上下文 +- `supporting_context.msgs`: 上下文消息列表 +- `supporting_context.msgs[].role`: `User` / `Assistant` +- `supporting_context.msgs[].msg`: 消息文本 +- `speaker`: `user` / `assistant` +- `valid_at`: ISO 8601 时间点,或 `NULL` +- `invalid_at`: ISO 8601 时间点,或 `NULL` +- `has_unsolved_reference`: 布尔值 + {% else %} + The input JSON contains these fields: +- `statement_id`: unique statement ID +- `statement_text`: statement text +- `statement_type`: upstream statement category such as `FACT` / `OPINION` / `OTHER` +- `temporal_type`: upstream temporal category such as `STATIC` / `DYNAMIC` / `ATEMPORAL` +- `supporting_context`: original conversation context +- `supporting_context.msgs`: context message list +- `supporting_context.msgs[].role`: `User` / `Assistant` +- `supporting_context.msgs[].msg`: message text +- `speaker`: `user` / `assistant` +- `valid_at`: ISO 8601 timestamp or `NULL` +- `invalid_at`: ISO 8601 timestamp or `NULL` +- `has_unsolved_reference`: boolean + {% endif %} + +Input JSON: + +```json +{{ input_json | default("{}") }} +``` + +{% if language == "zh" %} +待分析的主陈述: +{% else %} +Primary statement to analyze: +{% endif %} +**Statement:** "{{ statement_text | default(statement) }}" {% if speaker %} **Speaker:** {{ speaker }} -{% if speaker == "assistant" %} +{% endif %} + +===Hard Gate=== {% if language == "zh" %} -⚠️ 当前陈述句来自 **AI助手的回复**。AI助手在回复中用来称呼用户的名字是**用户的别名**,不是 AI 
助手的别名。但只能提取原文中逐字出现的名字,严禁推测或创造原文中不存在的别名变体。 -{% else %} -⚠️ This statement is from the **AI assistant's reply**. Names the AI uses to address the user are **user's aliases**, NOT the AI assistant's aliases. But only extract names that appear VERBATIM in the text — never infer or fabricate alias variants. -{% endif %} -{% endif %} -{% endif %} +开始抽取前,先检查 `has_unsolved_reference`。 -{% if ontology_types %} -===Ontology Type Guidance=== +- 如果 `has_unsolved_reference` 是 `true`,不要抽取任何内容。 +- 此时必须返回: + {% else %} + Before any extraction, check `has_unsolved_reference`. +- If `has_unsolved_reference` is `true`, do not extract anything. +- In that case, return exactly: + {% endif %} -**CRITICAL: Use ONLY predefined type names below. If no exact match, use CLOSEST type. NEVER invent new types.** - -**Type Priority:** -1. [场景类型] Scene Types (domain-specific, prefer first) -2. [通用类型] General Types (standard ontologies) -3. [通用父类] Parent Types (hierarchy context) - -**Rules:** -- Type MUST exactly match predefined names -- Do NOT modify, translate, or abbreviate type names -- Prefer scene types over general types - -**Predefined Types:** -{{ ontology_types }} - -{% if type_hierarchy_hints %} -**Hierarchy:** -{% for hint in type_hierarchy_hints %} -- {{ hint }} -{% endfor %} -{% endif %} - -**ALLOWED Names:** -{{ ontology_type_names | join(', ') }} - -{% endif %} -===Guidelines=== - -**Entity Extraction:** -- Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification -{% if language == "zh" %} -- **实体名称(name)必须使用中文** -- **实体描述(description)必须使用中文** -- **示例(example)必须使用中文** -{% else %} -- **Entity names must be in English** (translate if the original is in another language) -- **Entity descriptions must be in English** -- **Examples must be in English** -{% endif %} -- **Semantic Memory (is_explicit_memory):** - * `true` for: Concepts, Knowledge, Definitions, Theories, Methods (e.g., "Machine Learning", "REST 
API") - * `false` for: People, Organizations, Locations, Events, Specific objects - * For `is_explicit_memory=true`, provide concise example (~20 chars{% if language == "zh" %},使用中文{% endif %}) - -**🚨🚨🚨 ALIASES & DENIED_ALIASES - MANDATORY FIELDS 🚨🚨🚨** - -**CRITICAL RULES (违反将导致提取失败):** - -1. **EVERY entity MUST have aliases field:** - - `"aliases": [...]` - REQUIRED, even if empty `[]` - -2. **ALIASES - 别名提取规则:** -{% if language == "zh" %} - - 包含:昵称、全名、简称、别称、网名等 - - 顺序:**第一个别名将作为用户的主显示名称(other_name),必须把最重要/最常用的名字放在第一位** - - 提取顺序:严格按照对话中首次出现的顺序 - - 示例: - * "我叫张三,大家叫我小张" → aliases=["张三", "小张"](张三是第一个,将成为 other_name) - * "大家叫我小李,我全名叫李明" → aliases=["小李", "李明"](小李先出现,将成为 other_name) - - 空值:如果没有别名,使用 `[]` - - **🚨🚨🚨 严禁幻觉:只提取对话原文中逐字出现的别名,绝对不能推测、衍生或创造任何未在原文中出现的名字。例如,看到"陈思远"不能自行添加"思远大人""远哥""小远"等变体。如果原文没有这些字,就不能出现在 aliases 中。** - - **🚨 归属区分:必须严格区分名称的归属对象。默认情况下,用户提到的名字归属用户实体。只有出现明确的第二人称命名表达(如"叫你""给你取名")时,才将名字归属 AI/助手实体。** - - **🚨 说话人视角:当 speaker 为 assistant 时,AI 助手用来称呼用户的名字是用户的别名,必须归入用户实体的 aliases,绝对不能归入 AI 助手实体。但同样只能提取原文中逐字出现的称呼,不能推测。** - * "我叫陈思远,我给AI取名为远仔" → 用户 aliases=["陈思远"],AI助手 aliases=["远仔"] - * "我叫vv" → 用户 aliases=["vv"](没有给AI取名的表达,名字归用户) - * [speaker=assistant] "好的,VV" → 用户 aliases=["VV"](AI 在称呼用户,原文中出现了"VV") - * [speaker=assistant] "我叫陈仔" → AI助手 aliases=["陈仔"](AI 在自我介绍,这是 AI 的别名) - * ❌ 错误:将"远仔"放入用户的 aliases("远仔"是给AI取的名字,不是用户的名字) - * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases - * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases - * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) -{% else %} - - Include: nicknames, full names, abbreviations, alternative names - - Order: **The FIRST alias will be used as the user's primary display name (other_name). 
Put the most important/frequently used name FIRST** - - Extraction order: Strictly follow the order of first appearance in conversation - - Examples: - * "I'm John, people call me Johnny" → aliases=["John", "Johnny"] (John is first, will become other_name) - * "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name) - - Empty: If no aliases, use `[]` - - **🚨🚨🚨 NO HALLUCINATION: Only extract aliases that appear VERBATIM in the original text. NEVER infer, derive, or fabricate names not present in the text. For example, seeing "John Smith" does NOT allow adding "Johnny", "Smithy", "Mr. Smith" unless those exact strings appear in the conversation.** - - **🚨 Ownership distinction: By default, all names mentioned by the user belong to the user entity. Only assign a name to the AI/assistant entity when an explicit second-person naming expression (e.g., "I'll call you", "your name is") is present.** - - **🚨 Speaker perspective: When speaker is "assistant", names the AI uses to address the user are the USER's aliases and MUST go into the user entity's aliases, NEVER into the AI assistant entity's aliases. 
But only extract names that appear verbatim in the text, never infer.** - * "I'm Alex, I'll call you Buddy" → User aliases=["Alex"], AI assistant aliases=["Buddy"] - * "I'm vv" → User aliases=["vv"] (no AI-naming expression, name belongs to user) - * [speaker=assistant] "Sure thing, VV" → User aliases=["VV"] (AI addressing the user, "VV" appears in text) - * [speaker=assistant] "I'm Jarvis" → AI assistant aliases=["Jarvis"] (AI self-introduction, this is AI's alias) - * ❌ Wrong: putting "Buddy" in user's aliases ("Buddy" is a name for the AI, not the user) - * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases - * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases - * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) -{% endif %} - - - -3. **USER ENTITY SPECIAL HANDLING:** -{% if language == "zh" %} - - 用户实体的 name 字段:使用 "用户" 或 "我" - - 用户的真实姓名:放入 aliases - - **🚨 禁止将 "用户"、"我" 放入 aliases 中,aliases 只能包含用户的真实姓名、昵称等** - - 示例: - * "我叫李明" → name="用户", aliases=["李明"] - * ❌ 错误:aliases=["用户", "李明"]("用户"不是真实姓名,禁止放入 aliases) - * ❌ 错误:aliases=["我", "李明"]("我"不是真实姓名,禁止放入 aliases) -{% else %} - - User entity name field: use "User" or "I" - - User's real name: put in aliases - - **🚨 NEVER put "User" or "I" in aliases. Aliases must only contain real names, nicknames, etc.** - - Examples: - * "I'm John" → name="User", aliases=["John"] - * ❌ Wrong: aliases=["User", "John"] ("User" is not a real name, FORBIDDEN in aliases) - * ❌ Wrong: aliases=["I", "John"] ("I" is not a real name, FORBIDDEN in aliases) -{% endif %} - - - -4. 
**AI/ASSISTANT ENTITY SPECIAL HANDLING:** -{% if language == "zh" %} - - **🚨 默认规则:如果对话中没有出现明确指向 AI/助手的命名表达,则所有名字都归属于用户实体。不要猜测或推断某个名字是给 AI 取的。** - - 只有当用户**明确**对 AI/助手进行命名时,才创建 AI/助手实体并将对应名字放入其 aliases - - AI/助手实体的 name 字段:使用 "AI助手" - - 用户给 AI 取的名字:放入 AI/助手实体的 aliases - - **🚨 禁止将用户给 AI 取的名字放入用户实体的 aliases 中** - - **必须出现以下明确的命名表达才能判定为给 AI 取名:**「给你取名」「叫你」「称呼你为」「给AI取名」「你的名字是」「以后叫你」「你就叫」「你不叫X了」「你现在叫」等**第二人称(你)或明确指向 AI 的命名句式** - - **🚨 "你不叫X了"/"你不叫X,你叫Y" 句式:X 和 Y 都是 AI 的名字(旧名和新名),绝对不是用户的名字。因为句子主语是"你"(AI)。** - - **以下情况名字归属用户,不是给 AI 取名:**「我叫」「我的名字是」「叫我」「我是」「大家叫我」「我的英文名是」「我的昵称是」等**第一人称(我)的自我介绍句式** - - **🚨 speaker=assistant 时的特殊规则:** - * AI 用来称呼用户的名字 → 归入**用户**实体的 aliases(但必须是原文中逐字出现的称呼,不能推测) - * AI 自称的名字(如"我叫陈仔""我是你的助手")→ 归入**AI助手**实体的 aliases - * 判断依据:AI 说"你叫X"或用 X 称呼用户 → X 是用户别名;AI 说"我叫X"或"我是X" → X 是 AI 别名 - - 示例: - * "我叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户) - * "我的英文名叫vv" → 用户实体: name="用户", aliases=["vv"](第一人称自我介绍,名字归用户) - * "我叫陈思远,我给AI取名为远仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["远仔"] - * "叫你小助,我自己叫老王" → 用户实体: name="用户", aliases=["老王"];AI实体: name="AI助手", aliases=["小助"] - * "你不叫远仔了,你现在叫陈仔" → AI实体: name="AI助手", aliases=["陈仔"]("远仔"是AI旧名,"陈仔"是AI新名,都归AI。不要把"远仔"或"陈仔"放入用户的aliases) - * [speaker=assistant] "好的VV,今天想干点啥?" → 用户实体: name="用户", aliases=["VV"](AI 在称呼用户,原文中出现了"VV") - * [speaker=assistant] "你叫陈思远,我叫陈仔" → 用户实体: name="用户", aliases=["陈思远"];AI实体: name="AI助手", aliases=["陈仔"] - * ❌ 错误:用户说"我叫vv",却把"vv"放入 AI 助手的 aliases(没有任何给 AI 取名的表达) - * ❌ 错误:AI 称呼用户为"VV",却把"VV"放入 AI 助手的 aliases - * ❌ 错误:aliases=["陈思远", "远仔"]("远仔"是给AI取的名字,不是用户的名字) - * ❌ 错误:原文只有"陈思远",却在 aliases 中添加"思远大人""远哥""小远"等从未出现的变体(这是幻觉) -{% else %} - - **🚨 Default rule: If there is NO explicit AI/assistant naming expression in the conversation, ALL names belong to the user entity. 
Do NOT guess or infer that a name is for the AI.** - - Only create an AI/assistant entity when the user **explicitly** names the AI/assistant - - AI/assistant entity name field: use "AI Assistant" - - Names the user gives to the AI: put in the AI/assistant entity's aliases - - **🚨 NEVER put names given to the AI into the user entity's aliases** - - **An AI-naming expression MUST be present to assign a name to the AI:** "I'll call you", "your name is", "I name you", "let me call you", "you'll be called", "you're not called X anymore", "your new name is", etc. — **second-person ("you") or explicit AI-directed naming patterns** - - **🚨 "You're not called X anymore" / "You're not X, you're Y" pattern: BOTH X and Y are AI's names (old and new). They are NOT user's names. The subject is "you" (the AI).** - - **These patterns mean the name belongs to the USER, NOT the AI:** "I'm", "my name is", "call me", "I am", "people call me", "my English name is", "my nickname is", etc. — **first-person ("I"/"me") self-introduction patterns** - - **🚨 Special rules when speaker=assistant:** - * Names the AI uses to address the user → belong to the **user** entity's aliases (but only extract names that appear verbatim in the text, never infer) - * Names the AI uses for itself (e.g., "I'm Jarvis", "I am your assistant") → belong to the **AI assistant** entity's aliases - * Rule: AI says "you are X" or calls user X → X is user's alias; AI says "I'm X" or "I am X" → X is AI's alias - - Examples: - * "I'm vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user) - * "My English name is vv" → User entity: name="User", aliases=["vv"] (first-person intro, name belongs to user) - * "I'm Alex, I'll call you Buddy" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Buddy"] - * "Call yourself Jarvis, my name is Tony" → User entity: name="User", aliases=["Tony"]; AI entity: name="AI Assistant", aliases=["Jarvis"] - * "You're not 
called Jarvis anymore, your new name is Friday" → AI entity: name="AI Assistant", aliases=["Friday"] (both "Jarvis" and "Friday" are AI names, NOT user names) - * [speaker=assistant] "Sure thing, VV" → User entity: name="User", aliases=["VV"] (AI addressing the user, "VV" appears in text) - * [speaker=assistant] "You're Alex, and I'm Jarvis" → User entity: name="User", aliases=["Alex"]; AI entity: name="AI Assistant", aliases=["Jarvis"] - * ❌ Wrong: User says "I'm vv" but "vv" is put in AI assistant's aliases (no AI-naming expression exists) - * ❌ Wrong: AI calls user "VV" but "VV" is put in AI assistant's aliases - * ❌ Wrong: aliases=["Alex", "Buddy"] ("Buddy" is a name for the AI, not the user) - * ❌ Wrong: Text only has "John Smith" but aliases include "Johnny", "Smithy" (hallucinated variants) -{% endif %} - -5. **ALIASES ORDER:** -{% if language == "zh" %} - - 顺序优先级:按出现顺序,先出现的在前 -{% else %} - - Order priority: by appearance order, first mentioned comes first -{% endif %} - -**EXAMPLES OF CORRECT EXTRACTION:** -{% if language == "zh" %} -- "我叫张三" → aliases=["张三"] (张三将成为 other_name) -- "大家叫我小明,我全名叫李明" → aliases=["小明", "李明"] (小明先出现,将成为 other_name) -- "我是李华,网名叫华仔" → aliases=["李华", "华仔"] (李华先出现,将成为 other_name) -{% else %} -- "I'm John" → aliases=["John"] (John will become other_name) -- "People call me Mike, my full name is Michael" → aliases=["Mike", "Michael"] (Mike appears first, will become other_name) -- "I'm John Smith, username JSmith" → aliases=["John Smith", "JSmith"] (John Smith appears first, will become other_name) -{% endif %} - -- Exclude lengthy quotes, dates, temporal expressions -- Numeric values: extract as entities (instance_of: 'Numeric', name: units, numeric_value: value) - -**Triplet Extraction:** -- Extract (subject, predicate, object) where subject/object are entities, predicate is relationship -{% if language == "zh" %} -- subject_name 和 object_name 使用中文 -{% else %} -- subject_name and object_name in English -{% endif %} -- Use ONLY 
predicates from "Predicate Instructions" (uppercase tokens) -- Exclude temporal expressions, do NOT include `statement_id` -- **When NOT to extract:** emotions, fillers, no clear predicate, standalone nouns -- **If no valid triplet:** Return triplets: [] -{%- if predicate_instructions -%} - -**Predicate Instructions:** -Use ONLY these predicates. If none fits, set triplets to []. -{%- for pred, instruction in predicate_instructions.items() %} -- {{ pred }}: {{ instruction }} -{%- endfor -%} -{%- endif -%} - - -===Examples=== -{% if language == "en" %} -**Example 1 (English output):** "I plan to travel to Paris next week and visit the Louvre." -Output: -{ - "triplets": [ - {"subject_name": "I", "subject_id": 0, "predicate": "PLANS_TO_VISIT", "object_name": "Paris", "object_id": 1, "value": null}, - {"subject_name": "I", "subject_id": 0, "predicate": "PLANS_TO_VISIT", "object_name": "Louvre", "object_id": 2, "value": null} - ], - "entities": [ - {"entity_idx": 0, "name": "I", "type": "Person", "description": "The user", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "Paris", "type": "Location", "description": "Capital city of France", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 2, "name": "Louvre", "type": "Location", "description": "World-famous museum located in Paris", "example": "", "aliases": ["Louvre Museum"], "is_explicit_memory": false} - ] -} - -**Example 2 (Chinese input → English output - IMPORTANT: translate entity names):** "张明在腾讯工作,负责AI产品开发。" -Output: -{ - "triplets": [ - {"subject_name": "Zhang Ming", "subject_id": 0, "predicate": "WORKS_AT", "object_name": "Tencent", "object_id": 1, "value": null}, - {"subject_name": "Zhang Ming", "subject_id": 0, "predicate": "RESPONSIBLE_FOR", "object_name": "AI product development", "object_id": 2, "value": null} - ], - "entities": [ - {"entity_idx": 0, "name": "Zhang Ming", "type": "Person", "description": "Individual person name", 
"example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "Tencent", "type": "Organization", "description": "Chinese technology company", "example": "", "aliases": ["Tencent Holdings"], "is_explicit_memory": false}, - {"entity_idx": 2, "name": "AI product development", "type": "Concept", "description": "Artificial intelligence product development work", "example": "e.g., developing chatbots", "aliases": [], "is_explicit_memory": true} - ] -} - -**Example 3 (Chinese input → English output):** "三脚架" -Output: -{ - "triplets": [], - "entities": [ - {"entity_idx": 0, "name": "Tripod", "type": "Equipment", "description": "Photography equipment accessory", "example": "", "aliases": ["Camera Tripod"], "is_explicit_memory": false} - ] -} - -**Example 4 (User vs AI alias distinction - English output):** "I'm Alex, and I'll call you Buddy" -Output: -{ - "triplets": [ - {"subject_name": "User", "subject_id": 0, "predicate": "NAMED", "object_name": "AI Assistant", "object_id": 1, "value": "Buddy"} - ], - "entities": [ - {"entity_idx": 0, "name": "User", "type": "Person", "description": "The user", "example": "", "aliases": ["Alex"], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "AI Assistant", "type": "Person", "description": "The user's AI assistant", "example": "", "aliases": ["Buddy"], "is_explicit_memory": false} - ] -} -{% else %} -Output: -{ - "triplets": [ - {"subject_name": "我", "subject_id": 0, "predicate": "PLANS_TO_VISIT", "object_name": "巴黎", "object_id": 1, "value": null}, - {"subject_name": "我", "subject_id": 0, "predicate": "PLANS_TO_VISIT", "object_name": "卢浮宫", "object_id": 2, "value": null} - ], - "entities": [ - {"entity_idx": 0, "name": "我", "type": "Person", "description": "用户本人", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "巴黎", "type": "Location", "description": "法国首都城市", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 2, "name": "卢浮宫", 
"type": "Location", "description": "位于巴黎的世界著名博物馆", "example": "", "aliases": [], "is_explicit_memory": false} - ] -} - -**Example 2 (Chinese input → Chinese output):** "张明在腾讯工作,负责AI产品开发。" -Output: -{ - "triplets": [ - {"subject_name": "张明", "subject_id": 0, "predicate": "WORKS_AT", "object_name": "腾讯", "object_id": 1, "value": null}, - {"subject_name": "张明", "subject_id": 0, "predicate": "RESPONSIBLE_FOR", "object_name": "AI产品开发", "object_id": 2, "value": null} - ], - "entities": [ - {"entity_idx": 0, "name": "张明", "type": "Person", "description": "个人姓名", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "腾讯", "type": "Organization", "description": "中国科技公司", "example": "", "aliases": ["腾讯控股", "腾讯公司"], "is_explicit_memory": false}, - {"entity_idx": 2, "name": "AI产品开发", "type": "Concept", "description": "人工智能产品研发工作", "example": "如:开发智能客服机器人", "aliases": [], "is_explicit_memory": true} - ] -} - -**Example 3 (Entity Only - Chinese):** "三脚架" -Output: -{ - "triplets": [], - "entities": [ - {"entity_idx": 0, "name": "三脚架", "type": "Equipment", "description": "摄影器材配件", "example": "", "aliases": ["相机三脚架"], "is_explicit_memory": false} - ] -} - -**Example 4 (别名 - Chinese):** "我的名字是乐力齐,我的小名是齐齐,同事们都叫我小乐" -Output: -{ - "triplets": [], - "entities": [ - {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["乐力齐", "齐齐", "小乐"], "is_explicit_memory": false} - ] -} - -**Example 5 (别名顺序 - Chinese):** "我叫陈思远。对了,我的网名叫「远山」" -Output: -{ - "triplets": [], - "entities": [ - {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["陈思远", "远山"], "is_explicit_memory": false} - ] -} - -**Example 6 (用户与AI别名区分 - Chinese):** "我称呼自己为陈思远,我给AI取名为远仔" -Output: -{ - "triplets": [ - {"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "远仔"} - ], - "entities": [ - {"entity_idx": 0, "name": "用户", "type": "Person", 
"description": "用户本人", "example": "", "aliases": ["陈思远"], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["远仔"], "is_explicit_memory": false} - ] -} - -**Example 7 (纯用户自我介绍,无AI命名 - Chinese):** "我叫vv" -Output: -{ - "triplets": [], - "entities": [ - {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": ["vv"], "is_explicit_memory": false} - ] -} - -**Example 8 (给AI改名 - Chinese):** "你不叫远仔了,你现在叫陈仔" -Output: -{ - "triplets": [ - {"subject_name": "用户", "subject_id": 0, "predicate": "NAMED", "object_name": "AI助手", "object_id": 1, "value": "陈仔"} - ], - "entities": [ - {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人", "example": "", "aliases": [], "is_explicit_memory": false}, - {"entity_idx": 1, "name": "AI助手", "type": "Person", "description": "用户的AI助手", "example": "", "aliases": ["陈仔"], "is_explicit_memory": false} - ] -} - - -{% endif %} -===End of Examples=== - -{% if ontology_types %} -**⚠️ REMINDER: Examples use generic types for illustration. You MUST use predefined types from "ALLOWED Names" above.** -{% endif %} - -===Output Format=== - -**JSON Requirements:** -- Use ASCII double quotes ("), escape with \" -- No Chinese quotes (""), no line breaks in strings -{% if language == "zh" %} -- **语言:name、description、example、subject_name、object_name 使用中文** -{% else %} -- **Language: names, descriptions, examples in English (translate if needed)** -{% endif %} -- **⚠️ ALIASES ORDER: preserve temporal order of appearance** -- **🚨 MANDATORY FIELD: EVERY entity MUST include "aliases" field, even if empty array []** - -**Output JSON structure:** ```json { - "triplets": [...], - "entities": [...] + "entities": [], + "triplets": [] } ``` -{{ json_schema }} +{% if language == "zh" %} + +- 不要在引用未解析时尝试部分抽取。 + {% else %} +- Do not attempt partial extraction when the reference is unresolved. 
+ {% endif %} + +===Input Boundary=== +{% if language == "zh" %} + +- 只把 `statement_text` 作为直接抽取目标。 +- `supporting_context.msgs` 只能用于解释 `statement_text` 中的代词、省略、主体身份和必要背景。 +- 不要从 `supporting_context.msgs` 中单独抽取实体或关系。 +- 如果某条信息只出现在 `supporting_context.msgs` 中,而没有出现在 `statement_text` 中,就不要输出它。 +- 如果 `supporting_context.msgs` 中的 Assistant 消息包含总结、猜测、解释或改写,这些内容只能作为理解辅助,不能直接作为抽取来源。 +- `statement_type`、`temporal_type`、`valid_at`、`invalid_at` 是辅助理解字段,不是抽取目标。 +- 对 `statement_text` 中的用户自指表达,要统一规范成实体 `用户`。 + {% else %} +- Treat `statement_text` as the only direct extraction target. +- Use `supporting_context.msgs` only to interpret references, ellipsis, subject identity, and necessary background in `statement_text`. +- Do not extract any standalone entity or relation from `supporting_context.msgs`. +- If some information appears only in `supporting_context.msgs` but not in `statement_text`, do not include it in the output. +- If Assistant messages in `supporting_context.msgs` contain summary, guess, interpretation, or rephrasing, use them only as interpretive support and never as a direct extraction source. +- Treat `statement_type`, `temporal_type`, `valid_at`, and `invalid_at` as auxiliary context, not extraction targets. +- Normalize user self-reference in `statement_text` to the entity `用户`. 
+ {% endif %} + +===预定义实体类型=== +只能使用以下中文实体类型。如果没有完全匹配的类型,请选择最接近的一项,不要发明新类型。 + +- `人物`: 现实中的具体个人 +- `组织`: 公司、机构、团队、社群等组织性主体 +- `群体`: 未具名或泛指的一组人 +- `地点`: 具有地理或空间意义的位置 +- `设施`: 建筑、场馆、房间、实验室等功能性空间 +- `地址`: 具体地址或位置描述 +- `物品`: 一般具体物体 +- `设备`: 具有明确用途的工具或器材 +- `产品`: 可被制造、购买、使用的产品 +- `交通工具`: 用于出行或运输的工具 +- `文档`: 文章、报告、表格、说明等文档 +- `媒体`: 图片、音频、视频等媒体对象 +- `网站`: 网站、网页或互联网平台 +- `软件`: 软件、应用、系统或数字服务 +- `账号`: 账号、账户、用户档案 +- `标识符`: ID、编号、用户名、工号等标识 +- `联系方式`: 电话、邮箱、社交账号等联系方式 +- `角色`: 某实体承担的社会或功能角色 +- `职业`: 工作或职业身份 +- `技能`: 可学习或掌握的能力 +- `知识主题`: 主题、领域、方法、理论或知识概念 +- `目标`: 希望达成的结果 +- `偏好`: 稳定的喜欢、倾向或偏爱 +- `习惯`: 重复出现的行为模式 +- `语言`: 自然语言或编程语言 +- `金额`: 金额或货币数值 +- `数量`: 带或不带单位的数量值 +- `货币`: 货币单位 +- `组织部门`: 组织内部的部门或业务单元 +- `称呼`: 用于指代或称呼实体的名字 + +===预定义关系类型=== +只能使用以下中文关系类型。如果没有完全匹配的关系,请选择最接近的一项,不要发明新关系。 + +- `别名属于`: 别名指向其对应的规范实体 +- `使用称呼`: 主体使用某个名字来称呼另一实体 +- `属于类型`: 实体属于某种类别 +- `组成部分`: 实体是另一实体的组成部分 +- `包含部分`: 实体包含另一实体作为组成部分 +- `位于`: 实体位于某地点 +- `拥有位置`: 实体具有相关位置 +- `前往`: 主体前往某个地点、场所、组织、课程或活动 +- `居住于`: 人物居住在某地点 +- `任职于`: 主体在某组织中工作或任职 +- `担任角色`: 主体承担某个角色 +- `从事职业`: 主体从事某种职业 +- `关联于`: 两个实体存在明确关联 +- `成员属于`: 主体是某组织或群体的成员 +- `拥有`: 主体拥有某对象、资源或资产 +- `使用`: 主体使用某工具、产品或服务 +- `创建了`: 主体创建了某对象、内容或成果 +- `由…创建`: 实体由某主体创建 +- `撰写了`: 主体撰写某文档或作品 +- `提到`: 主体或文本提到另一实体 +- `了解`: 主体了解某知识主题 +- `学习`: 主体正在学习某知识主题或技能 +- `感兴趣于`: 主体对某主题感兴趣 +- `偏好`: 主体偏好某对象、方式或主题 +- `不喜欢`: 主体不喜欢某对象、方式或主题 +- `想要`: 主体想获得、达成或拥有某对象或结果 +- `负责`: 主体负责某项工作、职责或领域 +- `沟通于`: 两个实体之间发生沟通或交流 +- `拥有联系方式`: 实体具有某联系方式 +- `拥有账号`: 实体具有某账号 +- `标识为`: 实体由某标识符标识 +- `使用语言`: 主体使用某语言 +- `相关于`: 当存在明确联系但无更精确关系时使用的弱关系 + +===Extraction Order=== +{% if language == "zh" %} +按以下顺序执行: + +0. 先检查 `has_unsolved_reference`;如果为 `true`,直接返回空结果。 +1. 识别 `statement_text` 中值得抽取的稳定实体。 +2. 判断这些实体之间是否存在可由预定义关系类型表达的有效关系。 +3. 最后补充实体字段和关系字段。 + +不要让附加字段主导整个抽取过程。 +{% else %} +Follow this order: + +0. First check `has_unsolved_reference`; if it is `true`, immediately return the empty result. +1. Identify stable entities worth extracting from `statement_text`. +2. 
Determine whether any valid relations between those entities can be expressed using the predefined Chinese predicates. +3. Finally fill auxiliary entity and predicate fields. + +Do not let auxiliary fields drive the extraction process. +{% endif %} + +===Guidelines=== + +**Entity Extraction:** +{% if language == "zh" %} + +- 只有当某个名字、概念、对象、群体或地点在当前陈述中承担明确语义角色,或是理解有效关系所必需时,才创建实体。 +- 不要因为表面上出现了名词、修饰词或短语,就机械地创建实体。 +- 普通时间表达默认不抽取为实体,包括日期、时刻、明天、下周、今晚八点等。 +- 一次性动作短语默认不抽取为实体,例如“复习微积分”“去图书馆学习”“参观卢浮宫”。 +- 不要为了表达一句带时间或地点的行动,而额外创造“任务”“计划”“事件”实体。 +- 但如果动作明确把主体和某个稳定实体连接起来,可以保留该稳定实体,并抽取轻关系。例如“我去图书馆”“我去公司开会”“我去上课”“我去看演唱会”可以抽取 `前往`。 +- 如果陈述里有值得保留的实体信息,但没有有效关系,可以只返回 `entities`,并把 `triplets` 设为 `[]`。 +- `name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。 +- `description` 必须使用中文。 +- `type` 和 `type_description` 必须使用上方预定义的中文标签与中文定义。 + {% else %} +- Extract entities only when they play a clear semantic role in the statement or are necessary for understanding a valid relation. +- Do not mechanically create entities for every noun, modifier, or surface mention. +- Do not extract ordinary time expressions as entities, including dates, timestamps, "tomorrow", "next week", or "8 PM tonight". +- Do not extract one-off action phrases as entities, such as "review calculus", "study in the library", or "visit the Louvre". +- Do not create extra "task", "plan", or "event" entities just to represent an action with time or location modifiers. +- But if an action clearly connects the subject to a stable entity, keep that stable entity and use a light relation. For example, statements like "I go to the library", "I go to the office", "I go to class", or "I go to a concert" can use `前往`. +- If the statement contains entity-worthy content but no valid relation, it is acceptable to return `entities` with `triplets: []`. +- Keep `name` in its original surface form from the source text; exception: normalize user self-reference to `用户`. +- `description` must be in English. 
+- `type` and `type_description` must use the predefined Chinese labels and Chinese definitions above. + {% endif %} + +**Semantic Memory (`is_explicit_memory`):** +{% if language == "zh" %} + +- 只有当实体明显属于语义知识记忆中的抽象概念时,才设为 `true`,例如概念、定义、理论、方法和知识主题。 +- 对人、组织、地点、具体物体以及大多数实例级实体,一律设为 `false`。 +- 除非非常明确,否则默认设为 `false`。 + {% else %} +- Use `true` only for abstract conceptual entities that belong in semantic knowledge memory, such as concepts, definitions, theories, methods, and knowledge topics. +- Use `false` for people, organizations, locations, concrete objects, and most instance-level entities. +- Default to `false` unless the entity is clearly an abstract knowledge concept. + {% endif %} + +**Description:** +{% if language == "zh" %} + +- `description` 应短、直白、与当前上下文相关,并能帮助区分实体。 +- 优先描述实体在当前陈述和必要上下文中的身份、作用或关系。 +- 避免使用“陈述中提到的人物”“陈述中提到的组织”“陈述中提到的物品”这类低信息量模板。 +- 不要补充识别实体所不需要的外部知识。 + {% else %} +- `description` should be short, context-grounded, and discriminative. +- Prefer describing the entity's role, identity, or relation in the current statement and necessary supporting context. +- Avoid low-information templates such as "the person mentioned in the statement" or "the organization mentioned in the statement". +- Do not add extra world knowledge that is not needed for identifying the entity in context. 
+ {% endif %} + +**Type Description (`type_description`):** + +- `type_description` 必须直接复用对应 `type` 的中文定义。 +- 不要把当前实体实例描述写进 `type_description`。 + +**Triplet Extraction:** +{% if language == "zh" %} + +- 只有当陈述中表达了清晰关系时,才抽取 `(subject, predicate, object)`。 +- `predicate` 只能使用上方预定义的中文关系类型。 +- 如果没有任何预定义关系适用,返回 `triplets: []`。 +- 排除语气词、模糊情绪、孤立名词和缺乏明确关系结构的片段。 +- 如果陈述不支持有效关系,不要强行构造 triplet。 +- 如果 `has_unsolved_reference` 是 `true`,不要抽取实体或 triplets。 +- `subject_name` 和 `object_name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。 +- `predicate_description` 必须直接复用对应 `predicate` 的中文定义。 +- 不要把普通时间表达作为 triplet 的宾语。 +- 不要为了表达一次性计划、安排、日程而强行构造关系。 +- 当句子表达主体去某个地点、场所、组织、课程或活动时,只要该对象本身有记忆价值,就可以抽取 `前往`,即使句中同时带有时间信息。 +- 当句子表达主体学习某个主题或技能时,可以抽取 `学习`,即使句中还包含地点或时间修饰。 + {% else %} +- Extract `(subject, predicate, object)` only when there is a clear relation expressed in the statement. +- `predicate` must use one of the predefined Chinese relation labels above. +- If no predefined relation fits, return `triplets: []`. +- Exclude fillers, vague emotions, standalone nouns, and fragments without a clear relational structure. +- If the statement does not support a valid relation, do not force a triplet. +- If `has_unsolved_reference` is `true`, do not extract entities or triplets. +- Keep `subject_name` and `object_name` in their original surface form; exception: normalize user self-reference to `用户`. +- `predicate_description` must directly reuse the corresponding Chinese definition of `predicate`. +- Do not use ordinary time expressions as triplet objects. +- Do not force relations just to encode one-off plans, schedules, or actions. +- When the statement says that the subject goes to a place, venue, organization, class, or activity, you may extract `前往` as long as that destination itself is worth remembering, even if the statement also includes time information. 
+- When the statement says that the subject studies a topic or skill, you may extract `学习` even if the statement also includes location or time modifiers. + {% endif %} + +**Alias Relation (`别名属于`):** +{% if language == "zh" %} + +- 当多个名字明确指向同一实体时,使用 `别名属于`。 +- 方向始终是 `alias -> 别名属于 -> canonical entity`。 +- 这条规则适用于任何实体类型,包括人、组织、产品、地点、账号,以及用户自指场景。 +- 常见正例包括:真名、别名、昵称、网名、用户名、账号名、英文名,以及明确指向同一实体的稳定称呼。 +- 当一句话里出现多个名字都指向同一实体时,为每个别名创建单独实体,并分别连向规范实体。 +- 在用户自指场景中,规范实体应为已经规范化后的 `用户`。 +- 不要把角色、职业、身份、类别、夸赞、评价或其他非名字性描述抽成 `别名属于`。 + {% else %} +- Use `别名属于` when multiple names clearly refer to the same entity. +- Direction is always `alias -> 别名属于 -> canonical entity`. +- This applies to any entity type, including people, organizations, products, places, accounts, and user/self references. +- Typical positive cases include real names, alternative names, nicknames, screen names, usernames, account names, English names, and stable forms of address when they clearly refer to the same entity. When one statement gives several names for the same entity, create a separate entity for each alias and connect each alias to the canonical entity. +- In user self-reference cases, the canonical entity should be the normalized user entity `用户`. +- Do not use `别名属于` for roles, occupations, identities, categories, compliments, evaluations, or other non-name descriptions. + {% endif %} + +**Naming / Addressing Relations (`使用称呼`):** +{% if language == "zh" %} + +- 当一句话同时表达“命名事实”和“称呼行为”时,要区分这两层语义。 +- 如果句子明确说某个实体或群体用某个名字称呼另一实体,并且施称方在 `statement_text` 中明确出现,则要把施称方也抽成实体。 +- 在这种情况下,还要从施称方指向别名实体,抽取一条 `使用称呼` 关系。 +- 当两层语义都存在时,应同时抽取: + 1. `alias -> 别名属于 -> canonical entity` + 2. `caller -> 使用称呼 -> alias` +- 如果施称方在句中明确出现且对语义重要,不要省略它。 + {% else %} +- Distinguish between a naming fact and a naming act when the statement expresses both. +- If the statement says that some entity or group calls or addresses another entity by a name, and the caller is explicitly mentioned in `statement_text`, extract the caller as an entity. +- In such cases, also extract a `使用称呼` relation from the caller to the alias entity. +- When both layers are present, extract both: + 1. 
`alias -> 别名属于 -> canonical entity` + 2. `caller -> 使用称呼 -> alias` +- Do not drop the caller entity if it is explicitly stated and semantically important to the naming relation. + {% endif %} + +**subject_name / object_name Consistency:** +{% if language == "zh" %} + +- 每个 triplet 中的 `subject_name` 必须与 `subject_id` 指向实体的 `name` 完全一致。 +- 每个 triplet 中的 `object_name` 必须与 `object_id` 指向实体的 `name` 完全一致。 +- 不要在 triplet 里使用与实体名不同的表面形式。 + {% else %} +- `subject_name` in each triplet MUST exactly match the `name` of the entity referenced by `subject_id`. +- `object_name` in each triplet MUST exactly match the `name` of the entity referenced by `object_id`. +- Do not use alternative surface forms inside triplets. + {% endif %} + +===Examples=== +**示例 1** +Statement: "我住在巴黎。" + +Output: +{ + "triplets": [ + {"subject_name": "用户", "subject_id": 0, "predicate": "居住于", "predicate_description": "人物居住在某地点", "object_name": "巴黎", "object_id": 1} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "居住在巴黎的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "巴黎", "type": "地点", "type_description": "具有地理或空间意义的位置", "description": "用户居住的城市", "is_explicit_memory": false} + ] +} + +**示例 2** +Statement: "张明在腾讯工作,负责 AI 产品开发。" + +Output: +{ + "triplets": [ + {"subject_name": "张明", "subject_id": 0, "predicate": "任职于", "predicate_description": "主体在某组织中工作或任职", "object_name": "腾讯", "object_id": 1}, + {"subject_name": "张明", "subject_id": 0, "predicate": "负责", "predicate_description": "主体负责某项工作、职责或领域", "object_name": "AI 产品开发", "object_id": 2} + ], + "entities": [ + {"entity_idx": 0, "name": "张明", "type": "人物", "type_description": "现实中的具体个人", "description": "在腾讯负责 AI 产品开发的人员", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "腾讯", "type": "组织", "type_description": "公司、机构、团队、社群等组织性主体", "description": "张明任职的公司", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "AI 产品开发", "type": "知识主题", "type_description": 
"主题、领域、方法、理论或知识概念", "description": "张明负责的工作方向", "is_explicit_memory": true} + ] +} + +**示例 3** +Statement: "我明天下午三点去图书馆复习微积分。" + +Output: +{ + "triplets": [ + {"subject_name": "用户", "subject_id": 0, "predicate": "前往", "predicate_description": "主体前往某个地点、场所、组织、课程或活动", "object_name": "图书馆", "object_id": 1}, + {"subject_name": "用户", "subject_id": 0, "predicate": "学习", "predicate_description": "主体正在学习某知识主题或技能", "object_name": "微积分", "object_id": 2} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "提到自己安排的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "图书馆", "type": "设施", "type_description": "建筑、场馆、房间、实验室等功能性空间", "description": "用户提到要去的地点", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "微积分", "type": "知识主题", "type_description": "主题、领域、方法、理论或知识概念", "description": "用户提到的学习主题", "is_explicit_memory": true} + ] +} + +**示例 4** +Statement: "他上个月加入了这家公司。" +Input condition: `"has_unsolved_reference": true` + +Output: +{ + "triplets": [], + "entities": [] +} + +**示例 5** +Statement: "我的朋友都叫我山哥。" + +Output: +{ + "triplets": [ + {"subject_name": "山哥", "subject_id": 2, "predicate": "别名属于", "predicate_description": "别名指向其对应的规范实体", "object_name": "用户", "object_id": 0}, + {"subject_name": "我的朋友", "subject_id": 1, "predicate": "使用称呼", "predicate_description": "主体使用某个名字来称呼另一实体", "object_name": "山哥", "object_id": 2} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "被朋友称作山哥的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "我的朋友", "type": "群体", "type_description": "未具名或泛指的一组人", "description": "使用山哥这一称呼的人群", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "山哥", "type": "称呼", "type_description": "用于指代或称呼实体的名字", "description": "朋友用来称呼用户的昵称", "is_explicit_memory": false} + ] +} +===End of Examples=== + +===Output Format=== +{% if language == "zh" %} +JSON 要求: + +- 使用标准 ASCII 双引号 (`"`) +- 字符串内部引号必须转义为 `\"` +- 
不要使用中文引号 +- 字符串值中不要换行 +- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译;但用户自指必须规范成 `用户` +- `description` 必须使用中文 +- `type`、`predicate`、`type_description`、`predicate_description` 必须使用上方预定义的中文标签和中文说明 +- 如果 `has_unsolved_reference` 是 `true`,输出必须是 `{"entities": [], "triplets": []}` +- 如果没有有效 triplet,返回 `"triplets": []` + {% else %} + JSON Requirements: +- Use standard ASCII double quotes (`"`) +- Escape internal quotes using `\"` +- No Chinese quotation marks +- No line breaks inside string values +- `name`, `subject_name`, and `object_name` must keep the original surface form from the source text, except user self-reference which must be normalized to `用户` +- `description` must be in English +- `type`, `predicate`, `type_description`, and `predicate_description` must use the predefined Chinese labels and Chinese definitions above +- If `has_unsolved_reference` is `true`, the output must be `{"entities": [], "triplets": []}` +- If no valid triplet exists, return `"triplets": []` + {% endif %} + +{% if language == "zh" %} +输出 JSON 结构: +{% else %} +Output JSON structure: +{% endif %} + +```json +{ + "entities": [ + { + "entity_idx": 0, + "name": "string", + "type": "string", + "type_description": "string", + "description": "string", + "is_explicit_memory": false + } + ], + "triplets": [ + { + "subject_name": "string", + "subject_id": 0, + "predicate": "string", + "predicate_description": "string", + "object_name": "string", + "object_id": 0 + } + ] +} +``` \ No newline at end of file diff --git a/api/app/repositories/neo4j/cypher_queries.py b/api/app/repositories/neo4j/cypher_queries.py index a8c36e34..f5c58dbe 100644 --- a/api/app/repositories/neo4j/cypher_queries.py +++ b/api/app/repositories/neo4j/cypher_queries.py @@ -46,6 +46,15 @@ SET s += { RETURN s.id AS uuid """ +STATEMENT_EMOTION_UPDATE = """ +UNWIND $items AS item +MATCH (s:Statement {id: item.statement_id}) +SET s.emotion_type = item.emotion_type, + s.emotion_intensity = item.emotion_intensity, + 
@celery_app.task(
    bind=True,
    name="app.tasks.extract_emotion_batch",
    max_retries=2,
    default_retry_delay=30,
)
def extract_emotion_batch_task(
    self,
    statements: List[Dict[str, str]],
    llm_model_id: str,
    language: str = "zh",
    emotion_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Celery task: batch emotion extraction followed by a Neo4j backfill.

    Every statement is run through ``EmotionExtractionStep`` concurrently.
    A statement that cannot be processed (malformed payload or a failing
    step run) is counted as ``failed`` and logged; it never aborts the rest
    of the batch. Successfully extracted emotions are written back to the
    matching ``Statement`` nodes in one write transaction using
    ``STATEMENT_EMOTION_UPDATE``.

    Args:
        statements: List of dicts with keys ``statement_id``,
            ``statement_text`` and optionally ``speaker`` (defaults to
            ``"user"`` when absent).
        llm_model_id: Identifier handed to
            ``MemoryClientFactory.get_llm_client``.
        language: Language code forwarded to the step context
            ("zh" / "en").
        emotion_config: Optional overrides; only the keys
            ``emotion_extract_keywords`` and ``emotion_enable_subject`` are
            applied to the pipeline config.

    Returns:
        A dict with ``status``, ``total``, ``extracted``, ``failed`` and
        ``task_id``; ``elapsed_time`` is included whenever the batch was
        non-empty.

    Raises:
        Whatever ``self.retry(exc=...)`` raises: any exception escaping the
        batch run (for example a failed Neo4j write) is logged and handed
        to Celery's retry mechanism (``max_retries=2``, 30 s delay).
    """
    task_id = self.request.id
    total = len(statements)
    logger.info(
        f"[Emotion] 开始批量情绪提取: "
        f"statements={total}, llm_model_id={llm_model_id}, "
        f"language={language}, task_id={task_id}"
    )
    start_time = time.time()

    # Nothing to do: skip event-loop, LLM and Neo4j setup entirely.
    if not statements:
        return {"status": "SUCCESS", "total": 0, "extracted": 0, "failed": 0, "task_id": task_id}

    async def _run() -> Dict[str, Any]:
        """Extract emotions for all statements and write them to Neo4j."""
        # Project imports are kept function-local, as in the original task.
        from app.core.memory.models.variate_config import ExtractionPipelineConfig
        from app.core.memory.storage_services.extraction_engine.steps.base import StepContext
        from app.core.memory.storage_services.extraction_engine.steps.emotion_step import EmotionExtractionStep
        from app.core.memory.storage_services.extraction_engine.steps.schema import (
            EmotionStepInput,
            EmotionStepOutput,
        )
        from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
        from app.db import get_db_context
        from app.repositories.neo4j.neo4j_connector import Neo4jConnector
        from app.repositories.neo4j.cypher_queries import STATEMENT_EMOTION_UPDATE

        # Build the LLM client; the DB session is only needed for the lookup.
        with get_db_context() as db:
            factory = MemoryClientFactory(db)
            llm_client = factory.get_llm_client(llm_model_id)

        # Minimal pipeline config with emotion enabled, plus whitelisted overrides.
        pipeline_config = ExtractionPipelineConfig(emotion_enabled=True)
        emo_cfg = emotion_config or {}
        for key in ("emotion_extract_keywords", "emotion_enable_subject"):
            if key in emo_cfg:
                setattr(pipeline_config, key, emo_cfg[key])

        context = StepContext(
            llm_client=llm_client,
            language=language,
            config=pipeline_config,
        )
        step = EmotionExtractionStep(context)

        extracted = 0
        failed = 0
        update_items: List[Dict[str, Any]] = []

        async def _extract_one(stmt_dict: Dict[str, str]) -> None:
            """Extract one statement's emotion; record success or failure."""
            nonlocal extracted, failed
            # Captured before anything can fail so the failure log never
            # raises a second exception from inside the handler.
            stmt_id = None
            try:
                stmt_id = stmt_dict["statement_id"]
                # Input construction lives inside the try block: a missing
                # key or schema validation error must only fail THIS
                # statement instead of escaping asyncio.gather and aborting
                # (and retrying) the whole batch.
                inp = EmotionStepInput(
                    statement_id=stmt_id,
                    statement_text=stmt_dict["statement_text"],
                    speaker=stmt_dict.get("speaker", "user"),
                )
                result: EmotionStepOutput = await step.run(inp)
                update_items.append({
                    "statement_id": stmt_id,
                    "emotion_type": result.emotion_type,
                    "emotion_intensity": result.emotion_intensity,
                    "emotion_keywords": result.emotion_keywords,
                })
                extracted += 1
                logger.debug(
                    f"[Emotion] 单条提取完成: stmt={stmt_id}, "
                    f"type={result.emotion_type}, intensity={result.emotion_intensity}"
                )
            except Exception as e:
                # Deliberate best-effort: count, log, and keep going.
                failed += 1
                logger.warning(
                    f"[Emotion] 单条提取失败 stmt={stmt_id}: {e}"
                )

        # Run all extractions concurrently; per-item errors are handled above.
        await asyncio.gather(*[_extract_one(s) for s in statements])

        # Batch update Neo4j via a single write transaction.
        if update_items:
            connector = Neo4jConnector()
            try:
                async def _write_emotions(tx):
                    result = await tx.run(STATEMENT_EMOTION_UPDATE, items=update_items)
                    # Consume the cursor inside the transaction function.
                    return [record async for record in result]

                records = await connector.execute_write_transaction(_write_emotions)
                # The query only returns rows for Statement nodes it matched,
                # so the first number can be lower than the second.
                logger.info(
                    f"[Emotion] Neo4j 回写完成: "
                    f"更新 {len(records)}/{len(update_items)} 条 Statement 节点"
                )
            except Exception as e:
                logger.error(f"[Emotion] Neo4j 回写失败: {e}")
                raise
            finally:
                await connector.close()

        return {"extracted": extracted, "failed": failed}

    loop = None
    try:
        loop = set_asyncio_event_loop()
        result = loop.run_until_complete(_run())
        elapsed = time.time() - start_time
        logger.info(
            f"[Emotion] 任务完成: 提取={result['extracted']}, "
            f"失败={result['failed']}, 耗时={elapsed:.2f}s, task_id={task_id}"
        )
        return {
            "status": "SUCCESS",
            "total": total,
            **result,
            "elapsed_time": elapsed,
            "task_id": task_id,
        }
    except Exception as e:
        elapsed = time.time() - start_time
        logger.error(
            f"[Emotion] 任务失败: {e}, 耗时={elapsed:.2f}s",
            exc_info=True,
        )
        # Hand the failure to Celery's retry machinery.
        raise self.retry(exc=e)
    finally:
        if loop:
            _shutdown_loop_gracefully(loop)