From 7747ed7ac107959cd927e297883c6afb66a97a86 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Tue, 28 Apr 2026 13:32:29 +0800 Subject: [PATCH] refactor(memory): enhance extraction ontology and add assistant pruning graph support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Expand entity type ontology with detailed definitions, examples, and notes (merged types: 地点设施, 物品设备, 产品服务, 软件平台, 角色职业, 知识能力, 偏好习惯目标, 称呼别名, 智能体) - Add relation ontology taxonomy with 15 predicate categories and usage rules - Strengthen reference resolution rules: resolve pronouns before extraction, skip unresolvable references entirely - Add guidelines to avoid extracting abstract propositions, emotions, and low-value entities (effort/reward/success patterns) - Add 7 new extraction examples covering edge cases - Add AssistantOriginal/AssistantPruned node models and graph persistence (PRUNED_TO and BELONGS_TO_DIALOG edges, Neo4j indexes and constraints) - Add graph_build_step.py for building graph nodes/edges from DialogData - Update write_pipeline.py to pass assistant pruning nodes/edges to graph saver - Update data_pruning.py with related preprocessing changes --- api/app/core/memory/models/graph_models.py | 44 ++ .../core/memory/pipelines/write_pipeline.py | 12 + .../data_preprocessing/data_pruning.py | 24 + .../steps/extraction_pipeline_orchestrator.py | 2 + .../steps/graph_build_step.py | 91 ++++ .../prompt/prompts/extracat_Pruning.jinja2 | 303 +++++------ .../prompt/prompts/extract_statement.jinja2 | 92 +++- .../prompt/prompts/extract_triplet.jinja2 | 480 +++++++++++++++--- api/app/repositories/neo4j/create_indexes.py | 33 ++ api/app/repositories/neo4j/cypher_queries.py | 200 ++------ api/app/repositories/neo4j/graph_saver.py | 57 +++ 11 files changed, 917 insertions(+), 421 deletions(-) diff --git a/api/app/core/memory/models/graph_models.py b/api/app/core/memory/models/graph_models.py index 2248ce05..cd44588d 100644 --- 
a/api/app/core/memory/models/graph_models.py +++ b/api/app/core/memory/models/graph_models.py @@ -578,3 +578,47 @@ class PerceptualNode(Node): domain: str file_type: str summary_embedding: list[float] | None + + +class AssistantOriginalNode(Node): + """Node storing the original text of an Assistant message before pruning. + + Attributes: + pair_id: Shared ID with the corresponding AssistantPrunedNode for pairing + dialog_id: ID of the parent dialogue this message belongs to + text: The full original Assistant response text + """ + pair_id: str = Field(..., description="Shared pairing ID with the corresponding pruned node") + dialog_id: str = Field(..., description="ID of the parent dialogue") + text: str = Field(..., description="Original Assistant message text") + + +class AssistantPrunedNode(Node): + """Node storing the pruned (compressed) text of an Assistant message. + + Attributes: + pair_id: Shared ID with the corresponding AssistantOriginalNode for pairing + dialog_id: ID of the parent dialogue this message belongs to + text: The pruned memory hint text (or "NULL" if no memory value) + memory_type: Type of the memory hint (comfort|suggestion|recommendation|warning|instruction|NULL) + text_embedding: Optional embedding vector for semantic search on pruned text + """ + pair_id: str = Field(..., description="Shared pairing ID with the corresponding original node") + dialog_id: str = Field(..., description="ID of the parent dialogue") + text: str = Field(..., description="Pruned assistant memory hint text") + memory_type: str = Field(..., description="Memory type: comfort|suggestion|recommendation|warning|instruction|NULL") + text_embedding: Optional[List[float]] = Field(None, description="Embedding vector for semantic search") + + +class AssistantPrunedEdge(Edge): + """Edge connecting an AssistantOriginal node to its AssistantPruned node (PRUNED_TO). 
+ + Attributes: + pair_id: Shared pairing ID for traceability + """ + pair_id: str = Field(..., description="Shared pairing ID for traceability") + + +class AssistantDialogEdge(Edge): + """Edge connecting an AssistantOriginal node to its parent Dialogue node (BELONGS_TO_DIALOG).""" + pass diff --git a/api/app/core/memory/pipelines/write_pipeline.py b/api/app/core/memory/pipelines/write_pipeline.py index a68798db..9883f42a 100644 --- a/api/app/core/memory/pipelines/write_pipeline.py +++ b/api/app/core/memory/pipelines/write_pipeline.py @@ -77,6 +77,10 @@ class ExtractionResult(BaseModel): stmt_entity_edges: List[StatementEntityEdge] entity_entity_edges: List[EntityEntityEdge] perceptual_edges: List[PerceptualEdge] + assistant_original_nodes: List[Any] = Field(default_factory=list) + assistant_pruned_nodes: List[Any] = Field(default_factory=list) + assistant_pruned_edges: List[Any] = Field(default_factory=list) + assistant_dialog_edges: List[Any] = Field(default_factory=list) dialog_data_list: List[Any] = Field( default_factory=list, description="原始 DialogData 列表,类型为 Any 以避免循环依赖", @@ -482,6 +486,10 @@ class WritePipeline: stmt_entity_edges=dedup_result.statement_entity_edges, entity_entity_edges=dedup_result.entity_entity_edges, perceptual_edges=graph.perceptual_edges, + assistant_original_nodes=graph.assistant_original_nodes, + assistant_pruned_nodes=graph.assistant_pruned_nodes, + assistant_pruned_edges=graph.assistant_pruned_edges, + assistant_dialog_edges=graph.assistant_dialog_edges, dialog_data_list=dialog_data_list, ) @@ -523,6 +531,10 @@ class WritePipeline: entity_edges=result.entity_entity_edges, perceptual_edges=result.perceptual_edges, connector=self._neo4j_connector, + assistant_original_nodes=result.assistant_original_nodes, + assistant_pruned_nodes=result.assistant_pruned_nodes, + assistant_pruned_edges=result.assistant_pruned_edges, + assistant_dialog_edges=result.assistant_dialog_edges, ) if success: logger.info("Successfully saved all data to 
Neo4j") diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 4933c286..07481070 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -15,7 +15,9 @@ import hashlib import json import logging from collections import OrderedDict +from datetime import datetime from typing import List, Optional, Dict +from uuid import uuid4 from pydantic import BaseModel, Field @@ -39,6 +41,16 @@ def message_has_files(message: "ConversationMessage") -> bool: return message.files and len(message.files) > 0 +class AssistantPruningRecord(BaseModel): + """单个 User-Assistant 消息对的剪枝记录,用于后续写入 Neo4j。""" + + pair_id: str = Field(..., description="唯一配对 ID,Original 和 Pruned 节点共享") + original_text: str = Field(..., description="Assistant 原始回复全文") + pruned_text: str = Field(..., description="剪枝后文本(assistant_memory_hint),或 'NULL'") + memory_type: str = Field(..., description="comfort|suggestion|recommendation|warning|instruction|NULL") + created_at: str = Field(..., description="ISO 时间戳") + + class AssistantPruningResponse(BaseModel): """LLM 对单个 User-Assistant 消息对的剪枝结果。 @@ -95,6 +107,9 @@ class SemanticPruner: # Snapshot 数据收集:每个消息对的 input + gold self._snapshot_records: List[Dict] = [] + # 剪枝记录:用于后续写入 Neo4j(AssistantOriginal + AssistantPruned 节点) + self.pruning_records: List[AssistantPruningRecord] = [] + # 运行日志 self.run_logs: List[str] = [] @@ -246,6 +261,15 @@ class SemanticPruner: }, }) + # 收集剪枝记录(用于后续写入 Neo4j) + self.pruning_records.append(AssistantPruningRecord( + pair_id=uuid4().hex, + original_text=asst_msg.msg, + pruned_text=result.assistant_memory_hint, + memory_type=result.assistant_memory_type, + created_at=datetime.now().isoformat(), + )) + if result.assistant_memory_hint == "NULL": self._log( f" 
[{label}] 索引{asst_idx} → NULL,删除 " diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py b/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py index 4649a17e..72d7901f 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/extraction_pipeline_orchestrator.py @@ -855,6 +855,7 @@ class NewExtractionOrchestrator: entity_idx=e.entity_idx, name=e.name, type=e.type, + type_description=getattr(e, "type_description", ""), description=e.description, is_explicit_memory=e.is_explicit_memory, ) @@ -865,6 +866,7 @@ class NewExtractionOrchestrator: subject_name=t.subject_name, subject_id=t.subject_id, predicate=t.predicate, + predicate_description=getattr(t, "predicate_description", ""), object_name=t.object_name, object_id=t.object_id, ) diff --git a/api/app/core/memory/storage_services/extraction_engine/steps/graph_build_step.py b/api/app/core/memory/storage_services/extraction_engine/steps/graph_build_step.py index f329c98d..33791f77 100644 --- a/api/app/core/memory/storage_services/extraction_engine/steps/graph_build_step.py +++ b/api/app/core/memory/storage_services/extraction_engine/steps/graph_build_step.py @@ -28,6 +28,10 @@ from app.core.memory.models.graph_models import ( StatementChunkEdge, StatementEntityEdge, StatementNode, + AssistantOriginalNode, + AssistantPrunedNode, + AssistantPrunedEdge, + AssistantDialogEdge, ) from app.core.memory.models.message_models import DialogData, TemporalInfo @@ -47,6 +51,10 @@ class GraphBuildResult: "stmt_entity_edges", "entity_entity_edges", "perceptual_edges", + "assistant_original_nodes", + "assistant_pruned_nodes", + "assistant_pruned_edges", + "assistant_dialog_edges", ) def __init__( @@ -60,6 +68,10 @@ class GraphBuildResult: stmt_entity_edges: List[StatementEntityEdge], entity_entity_edges: 
List[EntityEntityEdge], perceptual_edges: List[PerceptualEdge], + assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None, + assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None, + assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None, + assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None, ): self.dialogue_nodes = dialogue_nodes self.chunk_nodes = chunk_nodes @@ -70,6 +82,10 @@ class GraphBuildResult: self.stmt_entity_edges = stmt_entity_edges self.entity_entity_edges = entity_entity_edges self.perceptual_edges = perceptual_edges + self.assistant_original_nodes = assistant_original_nodes or [] + self.assistant_pruned_nodes = assistant_pruned_nodes or [] + self.assistant_pruned_edges = assistant_pruned_edges or [] + self.assistant_dialog_edges = assistant_dialog_edges or [] async def build_graph_nodes_and_edges( @@ -343,6 +359,77 @@ async def build_graph_nodes_and_edges( f"实体-实体边: {len(entity_entity_edges)}" ) + # ── Assistant 剪枝节点和边 ── + assistant_original_nodes: List[AssistantOriginalNode] = [] + assistant_pruned_nodes: List[AssistantPrunedNode] = [] + assistant_pruned_edges: List[AssistantPrunedEdge] = [] + assistant_dialog_edges: List[AssistantDialogEdge] = [] + + for dialog_data in dialog_data_list: + pruning_records = dialog_data.metadata.get("assistant_pruning_records", []) + for record in pruning_records: + pair_id = record["pair_id"] + original_id = f"ao_{pair_id}" + pruned_id = f"ap_{pair_id}" + + # AssistantOriginal 始终创建(记录原始对话) + original_node = AssistantOriginalNode( + id=original_id, + name=f"AssistantOriginal_{pair_id[:8]}", + end_user_id=dialog_data.end_user_id, + run_id=dialog_data.run_id, + created_at=dialog_data.created_at, + expired_at=dialog_data.expired_at, + pair_id=pair_id, + dialog_id=dialog_data.id, + text=record["original_text"], + ) + assistant_original_nodes.append(original_node) + + # BELONGS_TO_DIALOG: Original → Dialogue + assistant_dialog_edges.append(AssistantDialogEdge( + 
source=original_id, + target=dialog_data.id, + end_user_id=dialog_data.end_user_id, + run_id=dialog_data.run_id, + created_at=dialog_data.created_at, + )) + + # pruned_text 为 NULL 时不创建 AssistantPruned 节点和 PRUNED_TO 边 + if record["pruned_text"] == "NULL": + continue + + pruned_node = AssistantPrunedNode( + id=pruned_id, + name=f"AssistantPruned_{pair_id[:8]}", + end_user_id=dialog_data.end_user_id, + run_id=dialog_data.run_id, + created_at=dialog_data.created_at, + expired_at=dialog_data.expired_at, + pair_id=pair_id, + dialog_id=dialog_data.id, + text=record["pruned_text"], + memory_type=record["memory_type"], + ) + assistant_pruned_nodes.append(pruned_node) + + # PRUNED_TO: Original → Pruned + assistant_pruned_edges.append(AssistantPrunedEdge( + source=original_id, + target=pruned_id, + end_user_id=dialog_data.end_user_id, + run_id=dialog_data.run_id, + created_at=dialog_data.created_at, + pair_id=pair_id, + )) + + if assistant_original_nodes: + logger.info( + f"Assistant 剪枝节点创建完成 - " + f"原始节点: {len(assistant_original_nodes)}, " + f"剪枝节点: {len(assistant_pruned_nodes)}" + ) + if progress_callback: nodes_edges_stats = { "dialogue_nodes_count": len(dialogue_nodes), @@ -365,4 +452,8 @@ async def build_graph_nodes_and_edges( stmt_entity_edges=stmt_entity_edges, entity_entity_edges=entity_entity_edges, perceptual_edges=perceptual_edges, + assistant_original_nodes=assistant_original_nodes, + assistant_pruned_nodes=assistant_pruned_nodes, + assistant_pruned_edges=assistant_pruned_edges, + assistant_dialog_edges=assistant_dialog_edges, ) diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 index 3061e663..f31e535a 100644 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 @@ -1,199 +1,130 @@ -{# - 对话级抽取与相关性判定模板(用于剪枝加速) - 输入:pruning_scene, ontology_class_infos, dialog_text, language - - 
ontology_class_infos: List[{class_name: str, class_description: str}] - 输出:严格 JSON(不要包含任何多余文本),字段: - - is_related: bool,是否与所选场景相关 - - times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等) - - ids: [string],编号/ID/订单号/申请号/账号等 - - amounts: [string],金额/费用/价格相关(带单位或货币符号) - - contacts: [string],联系方式(电话/手机号/邮箱/微信/QQ等) - - addresses: [string],地址/地点相关文本 - - keywords: [string],其它有助于保留的重要关键词(与场景强相关的术语) - - preserve_keywords: [string],必须保留的情绪/兴趣/爱好/个人偏好相关词或短语片段 +你是一个面向记忆存储的 Assistant 辅助信息提取器。 - 要求: - - 必须只输出上述 JSON,且键名一致;不得输出解释、前后缀;不得包含注释。 - - times/ids/amounts/contacts/addresses/keywords/preserve_keywords 仅抽取原文片段或规范化后的简单字符串。 - - 仅输出上述键;避免多余解释或字段。 -#} +任务: -{# ── 确定场景说明 ── #} -{% if ontology_class_infos and ontology_class_infos | length > 0 %} - {% if language == 'en' %} - {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %} - {% else %} - {% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %} - {% endif %} -{% else %} - {% if language == 'en' %} - {% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} - {% else %} - {% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} - {% endif %} -{% endif %} +- 输入是一个 JSON,对话放在 `msgs` 数组里,且数组中只有两条消息:第一条是 `User`,第二条是 `Assistant`。 +- 你只处理第二条消息里的 `Assistant.msg`。 +- 第一条消息里的 `User.msg` 只用于理解上下文,不允许出现在输出里。 +- 你的输出必须包含两个字段: + 1. `assistant_memory_hint` + 2. `assistant_memory_type` -{% if language == "zh" %} -你是一个对话内容分析助手。请对下方对话全文进行一次性分析,完成两项任务: -1. 判断对话是否与指定场景相关; -2. 
从对话中抽取所有需要保留的重要信息片段。 +目标: -场景说明:{{ instruction }} +- 从 `Assistant.msg` 中提取一条适合后续检索的极短辅助摘要。 +- 删除冗长解释、寒暄、礼貌话术、重复复述和空泛铺垫。 +- 允许做摘要式改写,但只能保留原消息中已经出现的建议、推荐、提醒、安慰、步骤或其他对后续记忆有帮助的核心内容。 +- 如果没有值得保留的信息,`assistant_memory_hint` 输出 `"NULL"`,`assistant_memory_type` 也输出 `"NULL"`。 -{% if ontology_class_infos and ontology_class_infos | length > 0 %} -【本场景实体类型定义】 -以下实体类型定义了本场景中哪些内容是重要的。 -凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段: +硬约束: -{% for info in ontology_class_infos %} -- {{ info.class_name }}:{{ info.class_description }} -{% endfor %} +- 不得改写、复述或输出 `User.msg`。 +- 不得捏造新事实、新建议、新步骤、新材料。 +- 不得改变 `Assistant` 原始语义和立场。 +- 可以压缩、合并、重写 `Assistant.msg`,但必须忠于原内容。 +- `assistant_memory_type` 只能从以下枚举中选择: + `comfort | suggestion | recommendation | warning | instruction | NULL` +- 只输出严格 JSON,不要输出解释。 -重要提示:只要对话中出现与上述任意实体类型相关的内容,即判定为相关(is_related=true)。 -{% endif %} +压缩原则: ---- -【必须保留的内容(不可删除)】 -以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段: -- 时间信息:日期、时间点、时间段、有效期 → times 字段 -- 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段 -- 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段) -- 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段 -- 地址信息:地点、地址、位置 → addresses 字段 -- 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段) -- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段 -- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段 -- **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了")→ preserve_keywords 字段 -- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段 +- 优先保留具体建议、推荐、提醒、操作步骤、风险提示、安慰动作。 +- 优先删除长背景解释、寒暄、礼貌收尾、对用户原话的重复复述。 +- 如果原文是长说明、长步骤、长菜谱,输出更短的概要版本,但不要丢掉核心意图。 +- 优先保留最短但仍有信息密度的版本。 +- `assistant_memory_hint` 尽量写成完整句,不要只写零散词组或标签。 +- 优先使用显式主语来写结果,例如: + `安慰了用户……` + `建议用户……` + `推荐用户……` + `提醒用户……` -【场景无关内容标记】 -请从对话中识别出与当前场景({{ pruning_scene }})**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。 -判断标准: -- 与场景实体类型完全无关 -- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联) -- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等) 
-注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。 - -**重要:scene_unrelated_snippets 必须认真填写,不能为空数组。** -如果对话中存在与场景无关的内容,必须将其原文片段提取出来。 - -示例(场景=在线教育): -- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets -- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets -- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets -- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets -- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets -- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets - -示例(场景=情感陪伴): -- "我最近心情很差,跟室友闹矛盾了" → 与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets -- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets -- "期末考试12月25日,3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets -- "我上次高数作业87分,这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets -- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets - -【可以删除的内容】 -以下类型的内容属于低价值信息,可以在剪枝时删除: -- 纯寒暄问候:如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语 -- 纯表情/符号:如"[微笑]"、"😊"、"哈哈"等 -- 重复确认:如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复 -- 无意义填充:如"啊"、"呢"、"嘛"等语气词单独成句 - -**注意:即使消息很短,只要包含情绪、兴趣、爱好、个人观点等有价值信息,就必须保留,不得删除。** -例如: -- "我好开心呀" → 包含情绪(开心),必须保留,preserve_keywords 中加入"开心" -- "好喜欢打羽毛球呀" → 包含兴趣爱好(喜欢打羽毛球),必须保留,preserve_keywords 中加入"喜欢打羽毛球" -- "我好难过" → 包含情绪(难过),必须保留,preserve_keywords 中加入"难过" -- "太好啦!看到你开心,我也跟着心情亮起来" → 包含情绪,必须保留,preserve_keywords 中加入"开心" - ---- -对话全文: -""" -{{ dialog_text }} -""" - -只输出严格 JSON(键固定、顺序不限): +Few-shot 示例 1 +输入: { - "is_related": , - "times": [...], - "ids": [...], - "amounts": [...], - "contacts": [...], - "addresses": [...], - "keywords": [...], - "preserve_keywords": [...], - "scene_unrelated_snippets": [...] + "msgs": [ + { + "role": "User", + "msg": "我室友小雯这学期一直在准备毕业论文,这两周都在改答辩 PPT。她下周三答辩,我有点担心她会紧张。" + }, + { + "role": "Assistant", + "msg": "听起来你很关心小雯,也希望她答辩顺利。她现在紧张其实很正常,很多人在答辩前都会这样。" + } + ] } -{% else %} -You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks: -1. Determine whether the dialogue is relevant to the specified scene; -2. 
Extract all important information fragments that must be preserved. - -Scenario Description: {{ instruction }} - -{% if ontology_class_infos and ontology_class_infos | length > 0 %} -[Scene Entity Type Definitions] -The following entity types define what content is important in this scene. -Content related to ANY of these types must be preserved and extracted into the keywords field: - -{% for info in ontology_class_infos %} -- {{ info.class_name }}: {{ info.class_description }} -{% endfor %} - -Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true). -{% endif %} - ---- -[MUST PRESERVE (cannot be deleted)] -The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields: -- Time information: dates, time points, durations, expiry dates → times field -- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field -- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here) -- Contact information: phone numbers, emails, WeChat, QQ → contacts field -- Address information: locations, addresses, places → addresses field -- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here) -- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. 
→ preserve_keywords field -- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field -- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field -- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords - -[Scene-Unrelated Content Marking] -Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field. -Criteria: -- Completely unrelated to the scene's entity types -- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated) -- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene) -Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully. - -[CAN BE DELETED] -The following types of content are low-value and can be removed during pruning: -- Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content -- Pure emojis/symbols: e.g., "[smile]", "😊", "haha" -- Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information -- Meaningless fillers: standalone interjections like "ah", "well", "hmm" - -**Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.** -Examples: -- "I'm so happy!" 
→ contains emotion (happy), must preserve; add "happy" to preserve_keywords -- "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords -- "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords - ---- -Full Dialogue: -""" -{{ dialog_text }} -""" - -Output strict JSON only (fixed keys, order doesn't matter): +输出: { - "is_related": , - "times": [...], - "ids": [...], - "amounts": [...], - "contacts": [...], - "addresses": [...], - "keywords": [...], - "preserve_keywords": [...], - "scene_unrelated_snippets": [...] + "assistant_memory_hint": "安慰了用户对室友答辩状态的担忧。", + "assistant_memory_type": "comfort" } -{% endif %} + +Few-shot 示例 2 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我最近总失眠,已经两周了,想先自己调一调。" + }, + { + "role": "Assistant", + "msg": "如果你想先自己调整,可以先减少咖啡因摄入,尤其下午和晚上尽量不要再喝咖啡或浓茶,同时把睡前刷手机的时间压缩一些,尽量固定上床时间,先连续观察几天。" + } + ] +} +输出: +{ + "assistant_memory_hint": "建议用户减少咖啡因摄入、减少睡前刷手机时间并固定上床时间。", + "assistant_memory_type": "suggestion" +} + +Few-shot 示例 3 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "我晚上想做个简单点的减脂餐,最好二十分钟左右能搞定。" + }, + { + "role": "Assistant", + "msg": "你可以做一个鸡胸肉沙拉碗,主要用鸡胸肉、生菜、黄瓜和圣女果。鸡胸肉简单煎熟切块后和蔬菜拌在一起,调味尽量用橄榄油加一点醋,不要放太多沙拉酱。" + } + ] +} +输出: +{ + "assistant_memory_hint": "推荐用户做鸡胸肉沙拉碗,并提醒用户调味时少放沙拉酱。", + "assistant_memory_type": "recommendation" +} + +Few-shot 示例 4 +输入: +{ + "msgs": [ + { + "role": "User", + "msg": "剪枝引擎和萃取引擎我都想先做,但是估计都会比较花时间。" + }, + { + "role": "Assistant", + "msg": "这两个模块都涉及比较多的设计和实现细节。如果你想先推进,我建议先拆需求,再分别评估开发量。" + } + ] +} +输出: +{ + "assistant_memory_hint": "建议用户先拆需求,再分别评估两个模块的开发量。", + "assistant_memory_type": "suggestion" +} + +现在处理下面这个输入。 +输入: +{{ dialog_text }} + +只输出严格 JSON: +{ + "assistant_memory_hint": "", + "assistant_memory_type": "comfort | suggestion | recommendation | warning | instruction | NULL" +} \ No newline at end of file diff --git a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 
b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 index 9be6f19b..9669144a 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_statement.jinja2 @@ -2,7 +2,7 @@ {{ input_json }} {%- endmacro %} -===Tasks=== +=== Tasks === {% if language == "zh" %} 你的任务是从提供的目标文本中识别并提取陈述句,并为每条陈述句标注以下信息: @@ -11,11 +11,12 @@ - statement_text - statement_type - temporal_type +- has_emotional_state - has_unsolved_reference - valid_at - invalid_at -每条输出都应是一个结构化的记忆候选陈述句。 +每条输出都应是一个结构化的候选记忆陈述句。 {% else %} Your task is to identify and extract declarative statements from the provided target text, and annotate each extracted statement with: @@ -23,6 +24,7 @@ Your task is to identify and extract declarative statements from the provided ta - statement_text - statement_type - temporal_type +- has_emotional_state - has_unsolved_reference - valid_at - invalid_at @@ -30,7 +32,7 @@ Your task is to identify and extract declarative statements from the provided ta Each output item should be a structured candidate memory statement. {% endif %} -===Inputs=== +=== Inputs === {% if language == "zh" %} - chunk_id: chunk 唯一 ID @@ -48,7 +50,7 @@ Each output item should be a structured candidate memory statement. - supporting_context.msgs: ordered contextual messages, which may include User and Assistant messages {% endif %} -===Scope=== +=== Scope === {% if language == "zh" %} - 只从 `target_content` 中提取陈述句。 @@ -66,12 +68,12 @@ Each output item should be a structured candidate memory statement. - Every output statement must be directly grounded in wording from `target_content`. 
{% endif %} -===Extraction Rules=== +=== Extraction Rules === {% if language == "zh" %} 拆分规则: - 以“一个完整意思”为单位提取陈述句,通常对应一个完整句子或一个自然语义片段。 -- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清楚的重要信息时,才拆成多条。 +- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清晰的重要信息时,才拆成多条。 - 宁可多提取,也不要漏掉 `target_content` 中能独立成立、且语义稳定的 statement。 - 但不要为了提高覆盖率而引入原文没有的信息,或输出语义不成立的 statement。 @@ -82,6 +84,9 @@ Each output item should be a structured candidate memory statement. 共指消解: +- 先完成最终的 `statement_text` 改写,再判断 `has_unsolved_reference`。 +- `has_unsolved_reference` 必须基于最终输出的 `statement_text` 判断,而不是基于原始 `target_content` 里是否出现过代词来判断。 +- 如果最终 `statement_text` 已经把引用改写成具体实体名,例如“助理恭喜用户”“小李点了一杯美式咖啡”,则 `has_unsolved_reference` 必须是 `false`。 - 如果可以解析到具体实体名,优先输出具体实体名,并将 `has_unsolved_reference` 设为 `false`。 - 如果不能解析到具体实体名,但可以解析到最小必要描述,则输出该最小必要描述,并将 `has_unsolved_reference` 设为 `true`。 - 如果既不能解析到具体实体名,也不能稳定解析到最小必要描述,则保留最小必要原始表达,并将 `has_unsolved_reference` 设为 `true`。 @@ -117,6 +122,15 @@ statement_type: - 如果没有明确时间,不要编造时间。 - 对于点状事件(例如某天发生的一次考试、一次见面、一次提交),`valid_at` 和 `invalid_at` 都应填写为该事件的起止边界;不要只填 `valid_at`。 +情感状态判断: + +- `has_emotional_state` 只用于判断当前 statement 是否反映了用户的情感状态。 +- 如果根据当前 statement 和 supporting_context,可以判断用户当前存在某种情感状态,则输出 `true`。 +- 该字段不是情绪分类字段,不要求输出具体情绪类型。 +- 明确情绪表达例如“开心”“难过”“紧张”“有压力”通常应标为 `true`。 +- 即使没有明确情绪词,只要语义足以表明用户当前具有情感状态,也可以标为 `true`,例如“我很好”。 +- 如果只是客观事实、动作描述或安排,且无法从当前上下文稳定判断用户情感状态,则输出 `false`。 + temporal_type: - `STATIC`:相对稳定、持续性的状态、身份、属性、长期偏好、长期关系、长期职业或长期居住状态;若带起始时间,可填 `valid_at`,`invalid_at` 必须为 `"NULL"`。 @@ -129,7 +143,7 @@ temporal_type: - 允许为解决代词、省略和时间歧义做最小必要改写。 - 不要引入原文未明确表达的新事实、额外推断或风格化概括。 {% else %} - Granularity: + Splitting rules: - Extract statements at the level of one complete thought, usually one full sentence or one natural semantic unit. - Preserve sentence-level structure by default; split only when a sentence contains two or more independent and important pieces of information that become clearly easier to understand when separated. 
- Prefer higher recall: do not miss independently valid and semantically stable statements in `target_content`. @@ -149,6 +163,9 @@ Coreference resolution: Clear vs unresolved reference: +- First produce the final rewritten `statement_text`, then decide `has_unsolved_reference`. +- `has_unsolved_reference` must be judged from the final `statement_text`, not from whether the original `target_content` once contained a pronoun. +- If the final `statement_text` already resolves the reference to a concrete named entity, such as “The assistant congratulates the user” or “Xiao Li ordered an Americano,” then `has_unsolved_reference` must be `false`. - A reference is fully resolved only if the current `supporting_context` can map it to a concrete named entity. - `Zhang San`, `Old Zhang` when clearly resolved to Zhang San, `Professor Li`, and `Teacher Wang` are clear references. - `the user's friend`, `the user's coworker`, `a teacher`, and `an interviewer` are allowed outputs but still count as unresolved. @@ -177,6 +194,15 @@ Temporal rules: - If no explicit time is available, do not invent one. - For point-in-time events such as a single exam, a meeting, or a submission on one day, populate both `valid_at` and `invalid_at`; do not fill only `valid_at`. +Emotional-state detection: + +- `has_emotional_state` is used only to judge whether the current statement reflects the user's emotional state. +- If the current statement plus supporting context is sufficient to infer that the user currently has some emotional state, output `true`. +- This field is not an emotion category field. Do not infer or output a specific emotion label here. +- Explicit emotion wording such as “happy”, “sad”, “nervous”, or “under pressure” should usually be marked `true`. 
+- Statements without explicit emotion words may still be `true` if the user's emotional state is reasonably inferable, such as “I am fine.” +- If the statement is only an objective fact or action description and the user's emotional state cannot be stably inferred from the current context, output `false`. + temporal_type: - `STATIC`: relatively stable, ongoing states, identities, attributes, long-term preferences, long-term relationships, occupations, or residence states. @@ -190,7 +216,7 @@ Rewrite boundary: - Do not introduce unsupported facts, extra inference, or stylistic summarization. {% endif %} -===Examples=== +=== Examples === {% if language == "zh" %} 示例 1: 示例输入: { @@ -219,6 +245,7 @@ Rewrite boundary: "statement_text": "李教授这学期要求很严。", "statement_type": "OPINION", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" @@ -228,17 +255,19 @@ Rewrite boundary: "statement_text": "李教授讲课清晰透彻。", "statement_type": "OPINION", "temporal_type": "ATEMPORAL", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "NULL", "invalid_at": "NULL" }, { "statement_id": "stmt_m1n2o3p4", - "statement_text": "李教授的气场很吓人。", + "statement_text": "用户每次被李教授点名都有点发怵。", "statement_type": "OPINION", - "temporal_type": "ATEMPORAL", + "temporal_type": "DYNAMIC", + "has_emotional_state": true, "has_unsolved_reference": false, - "valid_at": "NULL", + "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" } ] @@ -248,13 +277,13 @@ Rewrite boundary: 示例输入: { "chunk_id": "chunk_b2c3d4e5", "end_user_id": "eu_12345678", - "target_content": "我最近在学 Python,每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。", + "target_content": "我最近在学Python,每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。", "target_message_date": "2026-04-01T00:00:00", "supporting_context": { "msgs": [ { "role": "User", - "msg": "我最近在学 Python。" + "msg": "我最近在学Python。" }, { "role": "Assistant", @@ -268,27 +297,30 @@ Rewrite boundary: "statements": [ { 
"statement_id": "stmt_m3n4o5p6", - "statement_text": "用户最近在学 Python。", + "statement_text": "用户最近在学Python。", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_q7r8s9t0", - "statement_text": "用户最近每天晚上都会练一个小时 Python。", + "statement_text": "用户最近每晚都会练一个小时Python。", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" }, { "statement_id": "stmt_u1v2w3x4", - "statement_text": "用户这周打算先复习 Python 的基础语法和函数部分。", + "statement_text": "用户这周打算先复习Python的基础语法和函数部分。", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -323,6 +355,7 @@ Rewrite boundary: "statement_text": "用户觉得那两个有点难。", "statement_type": "OPINION", "temporal_type": "DYNAMIC", + "has_emotional_state": true, "has_unsolved_reference": true, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -332,6 +365,7 @@ Rewrite boundary: "statement_text": "用户昨晚看了半天那两个还是没太搞明白。", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": true, "valid_at": "2026-03-31T00:00:00", "invalid_at": "2026-03-31T23:59:59" @@ -341,6 +375,7 @@ Rewrite boundary: "statement_text": "如果周末还弄不出来,用户可能会去问助教。", "statement_type": "OTHER", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": true, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -375,6 +410,7 @@ Example Output: { "statement_text": "Professor Li is very strict this semester.", "statement_type": "OPINION", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" @@ -384,17 +420,19 @@ Example Output: { 
"statement_text": "Professor Li explains things clearly.", "statement_type": "OPINION", "temporal_type": "ATEMPORAL", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "NULL", "invalid_at": "NULL" }, { "statement_id": "stmt_m1n2o3p4", - "statement_text": "Professor Li's presence is intimidating.", + "statement_text": "The user gets nervous every time Professor Li calls on the user.", "statement_type": "OPINION", - "temporal_type": "ATEMPORAL", + "temporal_type": "DYNAMIC", + "has_emotional_state": true, "has_unsolved_reference": false, - "valid_at": "NULL", + "valid_at": "2023-09-04T18:00:00", "invalid_at": "NULL" } ] @@ -427,6 +465,7 @@ Example Output: { "statement_text": "The user has been learning Python recently.", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -436,6 +475,7 @@ Example Output: { "statement_text": "The user has recently been practicing Python for an hour every night.", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -445,6 +485,7 @@ Example Output: { "statement_text": "The user plans to review Python basic syntax and functions first this week.", "statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": false, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -479,6 +520,7 @@ Example Output: { "statement_text": "The user thinks those two things are difficult.", "statement_type": "OPINION", "temporal_type": "DYNAMIC", + "has_emotional_state": true, "has_unsolved_reference": true, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -488,6 +530,7 @@ Example Output: { "statement_text": "The user spent a long time last night looking at those two things but still did not really understand them.", 
"statement_type": "FACT", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": true, "valid_at": "2026-03-31T00:00:00", "invalid_at": "2026-03-31T23:59:59" @@ -497,6 +540,7 @@ Example Output: { "statement_text": "If the user still cannot finish them by the weekend, the user may ask the TA.", "statement_type": "OTHER", "temporal_type": "DYNAMIC", + "has_emotional_state": false, "has_unsolved_reference": true, "valid_at": "2026-04-01T00:00:00", "invalid_at": "NULL" @@ -504,7 +548,7 @@ Example Output: { ] } {% endif %} -===End of Examples=== +=== End of Examples === {% if language == "zh" %} 最终输出前检查: @@ -512,7 +556,9 @@ Example Output: { - 是否只保留 `target_content` 中可直接支持的陈述句 - 如果主语是用户,是否统一写“用户” - 非用户主体是否尽量写成具体名称;若无法做到,是否已正确标记 `has_unsolved_reference = true` +- 如果最终 `statement_text` 已经落到具体实体名,`has_unsolved_reference` 是否已经改为 `false` - statement_type 是否合法,且没有把一般事实机械标成 `OPINION` +- `has_emotional_state` 是否仅用于判断是否存在情感状态,而没有被当作情绪分类字段 - temporal_type 是否与 valid_at / invalid_at 一致 - 输出是否严格符合 JSON schema {% else %} @@ -520,7 +566,9 @@ Example Output: { - Keep only statements directly supported by `target_content` - If the subject is the user, render it as “the user” - Render non-user subjects as concrete names when possible; otherwise mark `has_unsolved_reference = true` +- If the final `statement_text` already resolves the reference to a concrete named entity, ensure `has_unsolved_reference = false` - Ensure statement_type is valid and do not mechanically label ordinary facts as `OPINION` +- Ensure `has_emotional_state` is used only for emotional-state presence detection, not emotion classification - Ensure temporal_type is consistent with valid_at and invalid_at - Ensure the output strictly matches the JSON schema {% endif %} @@ -555,8 +603,7 @@ Example Output: { - Preserve the original language and do not translate. 
{% endif %} -现在处理下面这个输入: -{{ render_input() }} +现在处理下面这个输入:{{ render_input() }} Return only a JSON object matching the schema below: { @@ -566,6 +613,7 @@ Return only a JSON object matching the schema below: "statement_text": "string", "statement_type": "FACT | OPINION | OTHER", "temporal_type": "STATIC | DYNAMIC | ATEMPORAL", + "has_emotional_state": "boolean", "has_unsolved_reference": "boolean", "valid_at": "string | NULL", "invalid_at": "string | NULL" diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 57c43342..bc1cf7ac 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -5,13 +5,21 @@ Extract entities and knowledge triplets from the given statement. 重要: - `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译。 -- 但对用户自指表达,如“我”“我的”“我自己”,统一规范为 `用户`。 +- 但在抽取前,必须先做指代解析。 +- 用户自指表达,如“我”“我的”“我自己”,一律规范为 `用户`。 +- 非用户自指代词或指示表达,如“他”“她”“它”“这个”“那个”“这家”“那家”“这里”“那里”,如果能从 `supporting_context` 中稳定解析出具体指代,则必须替换为具体指代实体名。 +- 如果上述代词或指示表达不能稳定解析,则整条跳过。 +- 命名关系中新出现的称呼、别名、昵称、产品名保持原样,不做替换。 - `description` 使用中文。 - `type`、`predicate`、`type_description`、`predicate_description` 一律使用中文。 {% else %} Important: -- Keep `name`, `subject_name`, and `object_name` in their original surface form from the source text. Do not translate them. -- Exception: normalize user self-reference such as "I", "me", and "myself" to `用户`. +- Keep `name`, `subject_name`, and `object_name` in their original surface form from the source text by default. +- But you MUST resolve references before extraction. +- Normalize user self-reference such as "I", "me", and "myself" to `用户`. +- For non-user pronouns or demonstratives such as "he", "she", "it", "this", "that", "this company", "that place", if a stable referent can be resolved from `supporting_context`, replace them with the resolved entity name. 
+- If such references cannot be resolved stably, skip the entire statement. +- Newly introduced names in naming or alias expressions must stay in their original form. - Generate `description` in English. - Always generate `type`, `predicate`, `type_description`, and `predicate_description` in Chinese. {% endif %} @@ -69,11 +77,13 @@ Primary statement to analyze: 开始抽取前,先检查 `has_unsolved_reference`。 - 如果 `has_unsolved_reference` 是 `true`,不要抽取任何内容。 -- 此时必须返回: +- 如果 `statement_text` 中仍存在无法稳定解析的代词、指示词或省略主体,也应视为 unresolved reference。 +- 这两种情况下都必须返回: {% else %} Before any extraction, check `has_unsolved_reference`. - If `has_unsolved_reference` is `true`, do not extract anything. -- In that case, return exactly: +- If unresolved pronouns, demonstratives, or omitted subjects still remain in `statement_text`, treat the statement as unresolved as well. +- In either case, return exactly: {% endif %} ```json @@ -86,8 +96,10 @@ Primary statement to analyze: {% if language == "zh" %} - 不要在引用未解析时尝试部分抽取。 +- 不要保留“他”“这个”“那个”这类原代词继续输出实体或关系。 {% else %} - Do not attempt partial extraction when the reference is unresolved. +- Do not keep unresolved forms such as "he", "this", or "that" as extracted entities or relation arguments. {% endif %} ===Input Boundary=== @@ -100,6 +112,8 @@ Primary statement to analyze: - 如果 `supporting_context.msgs` 中的 Assistant 消息包含总结、猜测、解释或改写,这些内容只能作为理解辅助,不能直接作为抽取来源。 - `statement_type`、`temporal_type`、`valid_at`、`invalid_at` 是辅助理解字段,不是抽取目标。 - 对 `statement_text` 中的用户自指表达,要统一规范成实体 `用户`。 +- 对其他可稳定解析的代词或指示表达,要替换为具体指代实体名后再抽取。 +- 对命名关系中新出现的称呼、别名、昵称、产品名,不要因为上下文可推断其所指而直接改写,它们应保持原样作为实体名。 {% else %} - Treat `statement_text` as the only direct extraction target. - Use `supporting_context.msgs` only to interpret references, ellipsis, subject identity, and necessary background in `statement_text`. 
@@ -108,41 +122,253 @@ Primary statement to analyze: - If Assistant messages in `supporting_context.msgs` contain summary, guess, interpretation, or rephrasing, use them only as interpretive support and never as a direct extraction source. - Treat `statement_type`, `temporal_type`, `valid_at`, and `invalid_at` as auxiliary context, not extraction targets. - Normalize user self-reference in `statement_text` to the entity `用户`. +- Replace other resolvable pronouns or demonstratives with their resolved entity names before extraction. +- For newly introduced names in naming or alias expressions, do not rewrite them even if the context reveals who they refer to; keep them as entity names. {% endif %} ===预定义实体类型=== 只能使用以下中文实体类型。如果没有完全匹配的类型,请选择最接近的一项,不要发明新类型。 -- `人物`: 现实中的具体个人 -- `组织`: 公司、机构、团队、社群等组织性主体 -- `群体`: 未具名或泛指的一组人 -- `地点`: 具有地理或空间意义的位置 -- `设施`: 建筑、场馆、房间、实验室等功能性空间 -- `地址`: 具体地址或位置描述 -- `物品`: 一般具体物体 -- `设备`: 具有明确用途的工具或器材 -- `产品`: 可被制造、购买、使用的产品 -- `交通工具`: 用于出行或运输的工具 -- `文档`: 文章、报告、表格、说明等文档 -- `媒体`: 图片、音频、视频等媒体对象 -- `网站`: 网站、网页或互联网平台 -- `软件`: 软件、应用、系统或数字服务 -- `账号`: 账号、账户、用户档案 -- `标识符`: ID、编号、用户名、工号等标识 -- `联系方式`: 电话、邮箱、社交账号等联系方式 -- `角色`: 某实体承担的社会或功能角色 -- `职业`: 工作或职业身份 -- `技能`: 可学习或掌握的能力 -- `知识主题`: 主题、领域、方法、理论或知识概念 -- `目标`: 希望达成的结果 -- `偏好`: 稳定的喜欢、倾向或偏爱 -- `习惯`: 重复出现的行为模式 -- `语言`: 自然语言或编程语言 -- `金额`: 金额或货币数值 -- `数量`: 带或不带单位的数量值 -- `货币`: 货币单位 -- `组织部门`: 组织内部的部门或业务单元 -- `称呼`: 用于指代或称呼实体的名字 +- `人物` + - definition: 可稳定指向、可被当作具体个体区分和归并的个人实体。 + - positive_examples: `用户`、`张三`、`王教授`、`小林` + - negative_examples: `老师`、`导师`、`学生`、`他们` + - notes: 强调“这个人是谁”,不强调他承担的社会身份;用户自指统一归为 `用户`。 + +- `组织` + - definition: 公司、机构、学校、实验室、团队、社群等组织性主体。 + - positive_examples: `腾讯`、`清华大学`、`机器人公司`、`实验室` + - negative_examples: `人事部`、`教研组`、`办公室` + - notes: 如果表达的是组织内部单元,当前一级仍优先并入 `组织`,除非后续单独扩展子类。 + +- `群体` + - definition: 边界相对稳定、可被当作整体引用的一组人。 + - positive_examples: `我的朋友`、`同事们`、`实验室成员` + - negative_examples: `他们`、`一些人`、`一个朋友` + - notes: 只用于边界相对稳定的人群;边界不稳或 unresolved 的表达不要归入 `群体`。 + +- `智能体` + - definition: 
具有行动、交互或执行能力的非人主体,如机器人、AI 或其他智慧体。 + - positive_examples: `机器人查票员`、`家务机器人`、`智能助手` + - negative_examples: `手机`、`电脑`、`机器人公司` + - notes: 如果对象只是普通设备,不归入 `智能体`;只有在叙述中被当作主体行动或交互时才使用。 + +- `角色职业` + - definition: 人物承担的社会角色、功能身份或职业身份。 + - positive_examples: `导师`、`老师`、`学生`、`医生`、`程序员` + - negative_examples: `张三`、`王教授`、`我的朋友` + - notes: 强调“这个人是什么身份”,不强调“这个人是谁”;如果文本落到具体个人,优先用 `人物`。 + +- `地点设施` + - definition: 具有地理意义或功能性空间意义的位置与场所。 + - positive_examples: `北京`、`巴黎`、`图书馆`、`办公室`、`教室` + - negative_examples: `这里`、`那里`、`朝这边`、`明天去的地方` + - notes: 地理地点和功能场所当前一级合并;未稳定解析的位置指代表达不要抽取。 + +- `物品设备` + - definition: 可被持有、使用、携带的具体物体、设备、工具或交通工具。 + - positive_examples: `手机`、`电脑`、`相机`、`自行车` + - negative_examples: `微信`、`GitHub`、`会员服务` + - notes: 交通工具当前并入此类;数字服务不归入本类。 + +- `产品服务` + - definition: 可被购买、使用、消费或订阅的产品或服务。 + - positive_examples: `iPhone`、`健身课`、`会员服务` + - negative_examples: `微信`、`GitHub`、`手机` + - notes: 具体商品和服务当前一级合并;纯软件平台优先归入 `软件平台`。 + +- `软件平台` + - definition: 软件、应用、网站、在线平台或数字服务系统。 + - positive_examples: `微信`、`GitHub`、`ChatGPT`、`飞书` + - negative_examples: `iPhone`、`会员服务`、`手机号` + - notes: 软件、网站、平台当前一级合并;如果语境强调的是账号本身,改用 `账号`。 + +- `账号` + - definition: 账户、账号、用户档案类实体。 + - positive_examples: `GitHub账号`、`微信号` + - negative_examples: `用户名`、`工号`、`邮箱` + - notes: 与 `标识符`、`联系方式` 分开;账号是主体可持有的账户对象。 + +- `标识符` + - definition: 用于识别实体的编号、ID、用户名、学号、工号等标识。 + - positive_examples: `学号`、`工号`、`用户名` + - negative_examples: `GitHub账号`、`手机号` + - notes: 当前允许保留,但通常只有在存在明确识别关系时才值得抽取。 + +- `联系方式` + - definition: 可用于联系实体的电话、邮箱、社交联系地址。 + - positive_examples: `手机号`、`邮箱`、`微信联系方式` + - negative_examples: `用户名`、`GitHub账号` + - notes: 当前允许保留,但通常只有在存在明确联系关系时才值得抽取。 + +- `文档媒体` + - definition: 文章、报告、表格、图片、音频、视频等内容载体。 + - positive_examples: `简历`、`论文`、`照片`、`录音` + - negative_examples: `微积分`、`微信`、`学号` + - notes: 文档与媒体当前一级合并;如果只是内容主题,不归入本类。 + +- `知识能力` + - definition: 可学习、掌握、使用或讨论的知识主题、技能、学科或语言。 + - positive_examples: `微积分`、`机器学习`、`写作`、`Python`、`中文` + - negative_examples: `紧张`、`成功`、`意义` + - notes: 不包含情绪、心理状态、抽象结果或价值判断;这些应写入 `description`。 
+ +- `偏好习惯目标` + - definition: 用户稳定的偏好、重复习惯,以及具体、明确、用户特异且值得长期保留的目标。 + - positive_examples: `喜欢安静环境`、`晨跑`、`通过雅思` + - negative_examples: `紧张`、`开心`、`成功`、`回报` + - notes: 这是高风险类型;只允许稳定偏好、重复习惯、具体目标,不允许抽象愿望或情绪状态。 + +- `称呼别名` + - definition: 用于指代或称呼实体的名字。 + - positive_examples: `山哥`、`老张`、`X1` + - negative_examples: `导师`、`程序员`、`好人` + - notes: 只用于名字性表达,不用于角色、职业、评价词。 + +实体类型总规则: + +- unresolved 或边界不稳的表达,不因“看起来像名词”就创建实体。 +- 情绪、心理状态、金额、数量、普通时间、一次性动作短语,默认不作为独立实体类型抽取。 +- 抽象命题片段、泛化结果、价值判断,默认不创建实体;如有保留价值,应写入相关高价值实体的 `description`。 + +实体类型选择原则: + +- 优先保留对用户画像、偏好、长期身份、稳定关系或持续兴趣有记忆价值的实体类型。 +- 对于“努力”“回报”“意义”“成功”这类泛化概念、抽象命题片段或价值判断,默认不要仅因句中出现就创建实体。 +- `群体` 只用于边界相对稳定、可被当作整体引用的人群;像“他们”“一些人”“一个朋友”这类边界不稳或 unresolved 的表达不要归入 `群体`。 +- `偏好习惯目标` 只能用于稳定偏好、重复习惯或具体明确的用户目标,不能把抽象结果、泛因果终点、空泛愿望或情绪状态强行归入其中。 +- 当前阶段不抽取情绪状态实体;像“紧张”“开心”“难过”“焦虑”“放松”这类情绪或心理状态,不要归入 `知识能力`、`偏好习惯目标` 或其他现有类型。 + +===关系本体大类=== +以下大类是当前 `predicate` 本体树的第一层,用于帮助理解和约束后面的具体关系白名单。输出具体 `predicate` 时仍然必须使用后文列出的细关系,而不是直接输出这些大类名称。 + +- `命名关系` + - definition: 表达实体名称、别名、称呼之间的对应或使用关系。 + - covered_predicates: `别名属于`、`使用称呼` + - positive_examples: `山哥 -> 别名属于 -> 用户`、`我的朋友 -> 使用称呼 -> 山哥` + - negative_examples: `导师 -> 别名属于 -> 用户`、`好人 -> 使用称呼 -> 用户` + - notes: 只处理名字性表达,不处理角色、职业、评价词。 + - status: `enabled` + +- `类型归属关系` + - definition: 表达实体属于某种类别,或主体承担某种角色/职业身份的关系。 + - covered_predicates: `属于类型`、`担任角色`、`从事职业` + - positive_examples: `王教授 -> 担任角色 -> 导师`、`张三 -> 从事职业 -> 程序员` + - negative_examples: `张三 -> 担任角色 -> 山哥`、`用户 -> 从事职业 -> 紧张` + - notes: 用于“是什么”,不用于“叫什么”。 + - status: `enabled` + +- `成员隶属关系` + - definition: 表达主体属于某个组织、群体或集合的成员归属关系。 + - covered_predicates: `成员属于` + - positive_examples: `张三 -> 成员属于 -> 实验室成员`、`用户 -> 成员属于 -> 社群` + - negative_examples: `他们 -> 成员属于 -> 学校`、`一个朋友 -> 成员属于 -> 班级` + - notes: 前提是主体和归属对象都足够稳定;边界不稳的人群不要硬抽。 + - status: `enabled` + +- `任职服务关系` + - definition: 表达人物或主体在组织中的工作、任职或服务关系。 + - covered_predicates: `任职于` + - positive_examples: `张明 -> 任职于 -> 腾讯`、`王教授 -> 任职于 -> 清华大学` + - negative_examples: `张明 -> 任职于 -> 导师`、`用户 -> 任职于 -> 
明天的面试` + - notes: 优先用于人物到组织的稳定供职关系。 + - status: `enabled` + +- `空间位置关系` + - definition: 表达实体与地点、场所、空间位置之间的稳定位置关系。 + - covered_predicates: `位于`、`拥有位置`、`居住于` + - positive_examples: `用户 -> 居住于 -> 巴黎`、`办公室 -> 位于 -> 北京` + - negative_examples: `用户 -> 位于 -> 明天下午三点`、`这里 -> 位于 -> 学校` + - notes: 普通时间表达和未解析位置指代不进入此类。 + - status: `enabled` + +- `前往到访关系` + - definition: 表达主体前往、到访某地点、场所、组织、课程或活动对象的关系。 + - covered_predicates: `前往` + - positive_examples: `用户 -> 前往 -> 图书馆`、`用户 -> 前往 -> 公司` + - negative_examples: `用户 -> 前往 -> 明天下午三点`、`用户 -> 前往 -> 复习微积分任务` + - notes: 当前应优先用于稳定倾向或有记忆价值的到访对象,不鼓励因一次性日程而过抽。 + - status: `enabled` + +- `组成包含关系` + - definition: 表达部分与整体、包含与被包含之间的结构关系。 + - covered_predicates: `组成部分`、`包含部分` + - positive_examples: `教研组 -> 组成部分 -> 学院`、`学院 -> 包含部分 -> 教研组` + - negative_examples: `用户 -> 组成部分 -> 图书馆`、`微积分 -> 包含部分 -> 用户` + - notes: 只用于结构性组成关系,不用于临时搭配或抽象联系。 + - status: `enabled` + +- `拥有持有关系` + - definition: 表达主体拥有、持有、配有某对象、账号、联系方式或标识的关系。 + - covered_predicates: `拥有`、`拥有账号`、`拥有联系方式`、`标识为` + - positive_examples: `用户 -> 拥有账号 -> GitHub账号`、`用户 -> 拥有联系方式 -> 邮箱`、`用户 -> 标识为 -> 学号` + - negative_examples: `用户 -> 拥有 -> 紧张`、`努力 -> 拥有 -> 回报` + - notes: 不用于抽象命题、情绪状态或口号式表达。 + - status: `enabled` + +- `使用采用关系` + - definition: 表达主体使用、采用某工具、产品、平台、语言或资源的关系。 + - covered_predicates: `使用`、`使用语言` + - positive_examples: `用户 -> 使用 -> 微信`、`用户 -> 使用语言 -> 中文` + - negative_examples: `用户 -> 使用 -> 成功`、`用户 -> 使用语言 -> 紧张` + - notes: 以后若扩展“采用方法”,也可挂在本大类下。 + - status: `enabled` + +- `创建生产关系` + - definition: 表达主体创建、撰写、生产某对象或结果的关系。 + - covered_predicates: `创建了`、`由…创建`、`撰写了` + - positive_examples: `用户 -> 撰写了 -> 简历`、`简历 -> 由…创建 -> 用户` + - negative_examples: `用户 -> 创建了 -> 明天下午三点`、`努力 -> 由…创建 -> 用户` + - notes: 只用于明确的生产、创作、撰写关系。 + - status: `enabled` + +- `知识学习关系` + - definition: 表达主体与知识、技能、学科、语言等知识能力对象之间的认知、学习或兴趣关系。 + - covered_predicates: `了解`、`学习`、`感兴趣于` + - positive_examples: `用户 -> 学习 -> 微积分`、`用户 -> 了解 -> 机器学习`、`用户 -> 感兴趣于 -> 心理学` + - negative_examples: `用户 -> 学习 -> 紧张`、`用户 -> 感兴趣于 -> 成功` + - notes: 
关系对象应是 `知识能力` 类,而不是情绪、价值判断或抽象结果。 + - status: `enabled` + +- `偏好目标关系` + - definition: 表达主体对对象的稳定偏好、厌恶,或对具体明确目标的指向关系。 + - covered_predicates: `偏好`、`不喜欢`、`想要` + - positive_examples: `用户 -> 偏好 -> 安静环境`、`用户 -> 不喜欢 -> 辛辣食物`、`用户 -> 想要 -> 通过雅思` + - negative_examples: `用户 -> 想要 -> 成功`、`用户 -> 偏好 -> 紧张`、`用户 -> 不喜欢 -> 努力就会有回报` + - notes: 这是高风险大类;`想要` 只用于具体、明确、用户特异的目标,不用于抽象愿望。 + - status: `enabled` + +- `职责责任关系` + - definition: 表达主体负责某项工作、职责、事务或领域的关系。 + - covered_predicates: `负责` + - positive_examples: `张三 -> 负责 -> 招聘工作`、`王教授 -> 负责 -> 实验室项目` + - negative_examples: `张三 -> 负责 -> 紧张`、`用户 -> 负责 -> 成功` + - notes: 关系对象应是具体职责或事务,不应是情绪或抽象结果。 + - status: `enabled` + +- `沟通交互关系` + - definition: 表达两个主体之间发生沟通、交流或交互的关系。 + - covered_predicates: `沟通于` + - positive_examples: `用户 -> 沟通于 -> 张三`、`导师 -> 沟通于 -> 学生` + - negative_examples: `用户 -> 沟通于 -> 紧张`、`图书馆 -> 沟通于 -> 微积分` + - notes: 两端通常都应是可作为交互主体的实体。 + - status: `enabled` + +- `提及关系` + - definition: 表达主体或文本明确提到某实体的关系。 + - covered_predicates: `提到` + - positive_examples: `用户 -> 提到 -> 腾讯`、`文档 -> 提到 -> 张三` + - negative_examples: `用户 -> 提到 -> 努力`、`用户 -> 提到 -> 回报`、`用户 -> 提到 -> 紧张` + - notes: 受限大类;不用于保留泛化概念、抽象命题片段、情绪状态或仅在句面上出现但没有记忆价值的对象。 + - status: `restricted` + +- `一般关联关系` + - definition: 表达两个实体之间存在明确、稳定、值得保留,但当前无更精确谓词可用的关联关系。 + - covered_predicates: `关联于`、`相关于` + - positive_examples: `项目 -> 关联于 -> 实验室`、`账号 -> 相关于 -> 平台` + - negative_examples: `努力 -> 相关于 -> 回报`、`用户 -> 关联于 -> 紧张`、`成功 -> 相关于 -> 意义` + - notes: 受限大类;不能作为失败兜底关系,不能用来连接抽象概念、口号式表达或无法成立的关系。 + - status: `restricted` ===预定义关系类型=== 只能使用以下中文关系类型。如果没有完全匹配的关系,请选择最接近的一项,不要发明新关系。 @@ -172,60 +398,90 @@ Primary statement to analyze: - `感兴趣于`: 主体对某主题感兴趣 - `偏好`: 主体偏好某对象、方式或主题 - `不喜欢`: 主体不喜欢某对象、方式或主题 -- `想要`: 主体想获得、达成或拥有某对象或结果 +- `想要`: 主体想获得、达成或拥有具体、明确、用户特异且值得保留的对象或目标,不用于抽象结果、泛化愿望或口号式表达 - `负责`: 主体负责某项工作、职责或领域 - `沟通于`: 两个实体之间发生沟通或交流 - `拥有联系方式`: 实体具有某联系方式 - `拥有账号`: 实体具有某账号 - `标识为`: 实体由某标识符标识 - `使用语言`: 主体使用某语言 -- `相关于`: 当存在明确联系但无更精确关系时使用的弱关系 +- `相关于`: 
当存在明确、稳定且具有记忆价值的联系,但无更精确关系时使用的弱关系;不得用于泛化概念、抽象命题片段、口号式表达或仅为补全结构的联系 ===Extraction Order=== {% if language == "zh" %} 按以下顺序执行: 0. 先检查 `has_unsolved_reference`;如果为 `true`,直接返回空结果。 -1. 识别 `statement_text` 中值得抽取的稳定实体。 -2. 判断这些实体之间是否存在可由预定义关系类型表达的有效关系。 -3. 最后补充实体字段和关系字段。 +1. 先做指代解析:用户自指统一替换为 `用户`;其他可稳定解析的代词或指示表达替换为具体指代实体名。 +2. 如果仍存在无法稳定解析的代词、指示词或省略主体,直接返回空结果。 +3. 识别 `statement_text` 中值得抽取的稳定实体。 +4. 判断这些实体之间是否存在可由预定义关系类型表达的有效关系。 +5. 最后补充实体字段和关系字段。 不要让附加字段主导整个抽取过程。 {% else %} Follow this order: 0. First check `has_unsolved_reference`; if it is `true`, immediately return the empty result. -1. Identify stable entities worth extracting from `statement_text`. -2. Determine whether any valid relations between those entities can be expressed using the predefined Chinese predicates. -3. Finally fill auxiliary entity and predicate fields. +1. Resolve references first: normalize user self-reference to `用户`; replace other stably resolvable pronouns or demonstratives with their resolved entity names. +2. If unresolved pronouns, demonstratives, or omitted subjects still remain, immediately return the empty result. +3. Identify stable entities worth extracting from `statement_text`. +4. Determine whether any valid relations between those entities can be expressed using the predefined Chinese predicates. +5. Finally fill auxiliary entity and predicate fields. Do not let auxiliary fields drive the extraction process. {% endif %} ===Guidelines=== +**Reference Resolution:** +{% if language == "zh" %} + +- 指代解析优先于实体抽取和关系抽取。 +- 所有用户自指表达都必须规范成 `用户`,包括“我”“我的”“我自己”等。 +- 对“他”“她”“它”“这个”“那个”“这家”“那家”“这里”“那里”等非用户自指表达,若上下文可稳定解析,则必须用解析后的具体实体名替换。 +- 若非用户自指表达无法稳定解析,则整条跳过,不输出部分结果。 +- 新出现的称呼、别名、昵称、产品名不是待消解代词,应保持原样。 + {% else %} +- Reference resolution happens before entity or relation extraction. +- All user self-reference must be normalized to `用户`, including forms such as "I", "me", "my", and "myself". 
+- For non-user references such as "he", "she", "it", "this", "that", "this company", "that place", "here", or "there", if the context supports a stable resolution, replace them with the resolved entity name. +- If a non-user reference cannot be resolved stably, skip the entire statement and do not output partial results. +- Newly introduced names, aliases, nicknames, and product names are not pronouns to be resolved; keep them in their original form. + {% endif %} + **Entity Extraction:** {% if language == "zh" %} - 只有当某个名字、概念、对象、群体或地点在当前陈述中承担明确语义角色,或是理解有效关系所必需时,才创建实体。 - 不要因为表面上出现了名词、修饰词或短语,就机械地创建实体。 +- 不要把完整命题、因果链、价值判断或口号式表达拆成多个低价值实体;例如“努力就会有回报”默认不应抽取出“努力”或“回报”作为实体。 - 普通时间表达默认不抽取为实体,包括日期、时刻、明天、下周、今晚八点等。 - 一次性动作短语默认不抽取为实体,例如“复习微积分”“去图书馆学习”“参观卢浮宫”。 - 不要为了表达一句带时间或地点的行动,而额外创造“任务”“计划”“事件”实体。 - 但如果动作明确把主体和某个稳定实体连接起来,可以保留该稳定实体,并抽取轻关系。例如“我去图书馆”“我去公司开会”“我去上课”“我去看演唱会”可以抽取 `前往`。 +- 当句子只是在讨论一般道理、抽象规律、空泛结果或非个体化概念,而这些概念本身不构成可复用记忆时,不要创建实体。 +- 如果句子表达的是用户的观点、信念、判断、愿望或目标倾向,但其中抽象对象不值得作为独立实体保留,则只保留相关高价值实体,不要再创建这些低价值对象实体,并把未抽取的抽象内容压缩写入相关实体的 `description`;例如“用户认为努力就会有回报”应只保留 `用户`,并在 `description` 中体现“用户认为努力就会有回报”。 +- 对于未抽取的抽象实体、抽象命题片段或泛化结果,只要它们对理解该高价值实体有帮助,就应优先写入该实体的 `description`,而不是改用宽泛关系或补造弱实体。 +- 当前阶段同样不要把情绪或心理状态抽成实体;如果句子里出现“紧张”“开心”“难过”“焦虑”“放松”等,应写入相关高价值实体的 `description`,而不是把它们标成 `知识能力`、`偏好习惯目标` 或其他近似类型。 - 如果陈述里有值得保留的实体信息,但没有有效关系,可以只返回 `entities`,并把 `triplets` 设为 `[]`。 -- `name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。 +- `name` 默认保持原文中的表面形式,但用户自指必须写成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名。 - `description` 必须使用中文。 - `type` 和 `type_description` 必须使用上方预定义的中文标签与中文定义。 {% else %} - Extract entities only when they play a clear semantic role in the statement or are necessary for understanding a valid relation. - Do not mechanically create entities for every noun, modifier, or surface mention. +- Do not split generic propositions, causal slogans, or value judgments into low-value abstract entities. 
For example, "effort brings reward" should not create standalone entities for "effort" or "reward" by default. - Do not extract ordinary time expressions as entities, including dates, timestamps, "tomorrow", "next week", or "8 PM tonight". - Do not extract one-off action phrases as entities, such as "review calculus", "study in the library", or "visit the Louvre". - Do not create extra "task", "plan", or "event" entities just to represent an action with time or location modifiers. - But if an action clearly connects the subject to a stable entity, keep that stable entity and use a light relation. For example, statements like "I go to the library", "I go to the office", "I go to class", or "I go to a concert" can use `前往`. +- If the sentence is only about a generic principle, abstract outcome, or non-personalized concept that is not worth remembering on its own, do not create an entity for it. +- If a statement expresses the user's belief, judgment, opinion, wish, or goal tendency but the referenced abstract concepts are not worth keeping as standalone entities, keep only the relevant high-value entities, do not create those low-value concept entities, and compress the unextracted abstract content into the relevant entity `description`. For example, "the user believes effort brings reward" should keep only `用户` and reflect that belief in `description`. +- For abstract entities, proposition fragments, or generic outcomes that are not extracted, prefer writing them into the relevant retained entity's `description` when they help preserve the memory, instead of switching to a broad relation or inventing a weak entity. +- In the current stage, do not extract emotional or psychological states as entities. States such as nervousness, happiness, sadness, anxiety, or relief should be written into the relevant retained entity's `description` rather than mapped to `知识能力`, `偏好习惯目标`, or any other approximate type. 
- If the statement contains entity-worthy content but no valid relation, it is acceptable to return `entities` with `triplets: []`. -- Keep `name` in its original surface form from the source text; exception: normalize user self-reference to `用户`. +- Keep `name` in its original surface form by default, but write user self-reference as `用户` and replace other stably resolvable references with their resolved entity names. - `description` must be in English. - `type` and `type_description` must use the predefined Chinese labels and Chinese definitions above. {% endif %} @@ -233,11 +489,11 @@ Do not let auxiliary fields drive the extraction process. **Semantic Memory (`is_explicit_memory`):** {% if language == "zh" %} -- 只有当实体明显属于语义知识记忆中的抽象概念时,才设为 `true`,例如概念、定义、理论、方法和知识主题。 +- 只有当实体明显属于语义知识记忆中的抽象知识对象时,才设为 `true`,例如概念、定义、理论、方法以及 `知识能力` 中的知识类对象。 - 对人、组织、地点、具体物体以及大多数实例级实体,一律设为 `false`。 - 除非非常明确,否则默认设为 `false`。 {% else %} -- Use `true` only for abstract conceptual entities that belong in semantic knowledge memory, such as concepts, definitions, theories, methods, and knowledge topics. +- Use `true` only for abstract knowledge-oriented entities that belong in semantic knowledge memory, such as concepts, definitions, theories, methods, and knowledge-oriented members of `知识能力`. - Use `false` for people, organizations, locations, concrete objects, and most instance-level entities. - Default to `false` unless the entity is clearly an abstract knowledge concept. {% endif %} @@ -269,8 +525,14 @@ Do not let auxiliary fields drive the extraction process. 
- 如果没有任何预定义关系适用,返回 `triplets: []`。 - 排除语气词、模糊情绪、孤立名词和缺乏明确关系结构的片段。 - 如果陈述不支持有效关系,不要强行构造 triplet。 +- 不要为了保留一句抽象判断或泛因果命题,而强行构造“用户-拥有-努力”“努力-导致-回报”这类低价值 triplet。 +- `提到` 不用于保留泛化概念、抽象命题片段、口号式表达或仅在句面上出现但无记忆价值的对象。 +- `相关于` 不用于补救无法成立的关系,也不用于连接“努力”“回报”“成功”“意义”这类抽象概念。 +- `想要` 只用于具体、明确、用户特异且值得保留的对象或目标;如果想要的内容过于抽象或空泛,不要抽取 `想要`,应改写进相关实体的 `description`。 +- 不要为了保留情绪或心理状态而创建实体或弱关系;像“紧张”“开心”“难过”“焦虑”默认应写入相关实体的 `description`。 +- 对于这类观点句,如果相关概念本身不值得保留,也不要只为了补全结构而额外创建对应实体;允许输出仅包含 `用户` 的 `entities` 和空的 `triplets`。 - 如果 `has_unsolved_reference` 是 `true`,不要抽取实体或 triplets。 -- `subject_name` 和 `object_name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。 +- `subject_name` 和 `object_name` 默认保持原文中的表面形式,但用户自指必须写成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名。 - `predicate_description` 必须直接复用对应 `predicate` 的中文定义。 - 不要把普通时间表达作为 triplet 的宾语。 - 不要为了表达一次性计划、安排、日程而强行构造关系。 @@ -282,8 +544,14 @@ Do not let auxiliary fields drive the extraction process. - If no predefined relation fits, return `triplets: []`. - Exclude fillers, vague emotions, standalone nouns, and fragments without a clear relational structure. - If the statement does not support a valid relation, do not force a triplet. +- Do not force low-value triplets such as "user-has-effort" or "effort-causes-reward" just to preserve a generic causal belief or slogan-like proposition. +- Do not use `提到` to preserve generic concepts, proposition fragments, slogan-like expressions, or surface mentions that have no memory value. +- Do not use `相关于` as a rescue relation when no real relation exists, and do not connect abstract concepts such as "effort", "reward", "success", or "meaning" with it. +- Use `想要` only for concrete, specific, user-grounded objects or goals worth retaining; if the desired content is too abstract or generic, do not extract `想要` and instead rewrite it into the relevant entity `description`. 
+- Do not create entities or weak relations just to preserve emotional or psychological states; states such as nervousness, happiness, sadness, or anxiety should normally be written into the relevant retained entity `description`. +- For such opinion statements, if the referenced concepts are not worth keeping, do not create extra entities just to complete a structure; it is valid to return only the `用户` entity with empty `triplets`. - If `has_unsolved_reference` is `true`, do not extract entities or triplets. -- Keep `subject_name` and `object_name` in their original surface form; exception: normalize user self-reference to `用户`. +- Keep `subject_name` and `object_name` in their original surface form by default, but write user self-reference as `用户` and replace other stably resolvable references with their resolved entity names. - `predicate_description` must directly reuse the corresponding Chinese definition of `predicate`. - Do not use ordinary time expressions as triplet objects. - Do not force relations just to encode one-off plans, schedules, or actions. @@ -320,6 +588,7 @@ Do not let auxiliary fields drive the extraction process. 1. `alias -> 别名属于 -> canonical entity` 2. `caller -> 使用称呼 -> alias` - 如果施称方在句中明确出现且对语义重要,不要省略它。 +- 在命名关系中,新出现的称呼、别名、昵称、产品名必须保持原样,不要被替换成其所指实体名。 {% else %} - Distinguish between a naming fact and a naming act when the statement expresses both. - If the statement says that some entity or group calls or addresses another entity by a name, and the caller is explicitly mentioned in `statement_text`, extract the caller as an entity. @@ -328,6 +597,7 @@ Do not let auxiliary fields drive the extraction process. 1. `alias -> 别名属于 -> canonical entity` 2. `caller -> 使用称呼 -> alias` - Do not drop the caller entity if it is explicitly stated and semantically important to the naming relation. 
+- In naming relations, newly introduced names, aliases, nicknames, or product names must stay in their original form rather than being replaced by their referent. {% endif %} **subject_name / object_name Consistency:** @@ -352,29 +622,28 @@ Output: {"subject_name": "用户", "subject_id": 0, "predicate": "居住于", "predicate_description": "人物居住在某地点", "object_name": "巴黎", "object_id": 1} ], "entities": [ - {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "居住在巴黎的说话者", "is_explicit_memory": false}, - {"entity_idx": 1, "name": "巴黎", "type": "地点", "type_description": "具有地理或空间意义的位置", "description": "用户居住的城市", "is_explicit_memory": false} + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "居住在巴黎的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "巴黎", "type": "地点设施", "type_description": "具有地理意义或功能性空间意义的位置与场所。", "description": "用户居住的城市", "is_explicit_memory": false} ] } **示例 2** -Statement: "张明在腾讯工作,负责 AI 产品开发。" +Statement: "他在腾讯工作。" +Input condition: supporting context has already made it clear that “他” refers to “张明”.
Output: { "triplets": [ - {"subject_name": "张明", "subject_id": 0, "predicate": "任职于", "predicate_description": "主体在某组织中工作或任职", "object_name": "腾讯", "object_id": 1}, - {"subject_name": "张明", "subject_id": 0, "predicate": "负责", "predicate_description": "主体负责某项工作、职责或领域", "object_name": "AI 产品开发", "object_id": 2} + {"subject_name": "张明", "subject_id": 0, "predicate": "任职于", "predicate_description": "主体在某组织中工作或任职", "object_name": "腾讯", "object_id": 1} ], "entities": [ - {"entity_idx": 0, "name": "张明", "type": "人物", "type_description": "现实中的具体个人", "description": "在腾讯负责 AI 产品开发的人员", "is_explicit_memory": false}, - {"entity_idx": 1, "name": "腾讯", "type": "组织", "type_description": "公司、机构、团队、社群等组织性主体", "description": "张明任职的公司", "is_explicit_memory": false}, - {"entity_idx": 2, "name": "AI 产品开发", "type": "知识主题", "type_description": "主题、领域、方法、理论或知识概念", "description": "张明负责的工作方向", "is_explicit_memory": true} + {"entity_idx": 0, "name": "张明", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "在腾讯工作的人员", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "腾讯", "type": "组织", "type_description": "公司、机构、学校、实验室、团队、社群等组织性主体。", "description": "张明任职的公司", "is_explicit_memory": false} ] } **示例 3** -Statement: "我明天下午三点去图书馆复习微积分。" +Statement: "我常去图书馆学微积分。" Output: { @@ -383,9 +652,9 @@ Output: {"subject_name": "用户", "subject_id": 0, "predicate": "学习", "predicate_description": "主体正在学习某知识主题或技能", "object_name": "微积分", "object_id": 2} ], "entities": [ - {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "提到自己安排的说话者", "is_explicit_memory": false}, - {"entity_idx": 1, "name": "图书馆", "type": "设施", "type_description": "建筑、场馆、房间、实验室等功能性空间", "description": "用户提到要去的地点", "is_explicit_memory": false}, - {"entity_idx": 2, "name": "微积分", "type": "知识主题", "type_description": "主题、领域、方法、理论或知识概念", "description": "用户提到的学习主题", "is_explicit_memory": true} + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": 
"可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "经常去图书馆学习微积分的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "图书馆", "type": "地点设施", "type_description": "具有地理意义或功能性空间意义的位置与场所。", "description": "用户经常前往学习的地点", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "微积分", "type": "知识能力", "type_description": "可学习、掌握、使用或讨论的知识主题、技能、学科或语言。", "description": "用户经常学习的主题", "is_explicit_memory": true} ] } @@ -409,9 +678,86 @@ Output: {"subject_name": "我的朋友", "subject_id": 1, "predicate": "使用称呼", "predicate_description": "主体使用某个名字来称呼另一实体", "object_name": "山哥", "object_id": 2} ], "entities": [ - {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "被朋友称作山哥的说话者", "is_explicit_memory": false}, - {"entity_idx": 1, "name": "我的朋友", "type": "群体", "type_description": "未具名或泛指的一组人", "description": "使用山哥这一称呼的人群", "is_explicit_memory": false}, - {"entity_idx": 2, "name": "山哥", "type": "称呼", "type_description": "用于指代或称呼实体的名字", "description": "朋友用来称呼用户的昵称", "is_explicit_memory": false} + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "被朋友称作山哥的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "我的朋友", "type": "群体", "type_description": "边界相对稳定、可被当作整体引用的一组人。", "description": "使用山哥这一称呼的人群", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "山哥", "type": "称呼别名", "type_description": "用于指代或称呼实体的名字。", "description": "朋友用来称呼用户的昵称", "is_explicit_memory": false} + ] +} + +**示例 6** +Statement: "我认为努力就会有回报。" + +Output: +{ + "triplets": [], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "认为努力就会有回报的说话者", "is_explicit_memory": false} + ] +} + +**示例 7** +Statement: "我想要成功。" + +Output: +{ + "triplets": [], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "想要成功的说话者", "is_explicit_memory": false} + ] +} + +**示例 8** 
+Statement: "我最近有点紧张,不过这很正常。" + +Output: +{ + "triplets": [], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "最近有些紧张并认为这很正常的说话者", "is_explicit_memory": false} + ] +} + +**示例 9** +Statement: "王教授是导师。" + +Output: +{ + "triplets": [ + {"subject_name": "王教授", "subject_id": 0, "predicate": "担任角色", "predicate_description": "主体承担某个角色", "object_name": "导师", "object_id": 1} + ], + "entities": [ + {"entity_idx": 0, "name": "王教授", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "承担导师角色的具体个人", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "导师", "type": "角色职业", "type_description": "人物承担的社会角色、功能身份或职业身份。", "description": "王教授承担的角色身份", "is_explicit_memory": false} + ] +} + +**示例 10** +Statement: "我的GitHub账号用户名是chen4。" + +Output: +{ + "triplets": [ + {"subject_name": "用户", "subject_id": 0, "predicate": "拥有账号", "predicate_description": "实体具有某账号", "object_name": "GitHub账号", "object_id": 1}, + {"subject_name": "GitHub账号", "subject_id": 1, "predicate": "标识为", "predicate_description": "实体由某标识符标识", "object_name": "chen4", "object_id": 2} + ], + "entities": [ + {"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "拥有该 GitHub 账号的说话者", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "GitHub账号", "type": "账号", "type_description": "账户、账号、用户档案类实体。", "description": "用户拥有的 GitHub 账号", "is_explicit_memory": false}, + {"entity_idx": 2, "name": "chen4", "type": "标识符", "type_description": "用于识别实体的编号、ID、用户名、学号、工号等标识。", "description": "该 GitHub 账号对应的用户名标识", "is_explicit_memory": false} + ] +} + +**示例 11** +Statement: "机器人查票员和我沟通。" + +Output: +{ + "triplets": [ + {"subject_name": "机器人查票员", "subject_id": 0, "predicate": "沟通于", "predicate_description": "两个实体之间发生沟通或交流", "object_name": "用户", "object_id": 1} + ], + "entities": [ + {"entity_idx": 0, "name": "机器人查票员", "type": "智能体", "type_description": 
"具有行动、交互或执行能力的非人主体,如机器人、AI 或其他智慧体。", "description": "与用户发生沟通的机器人主体", "is_explicit_memory": false}, + {"entity_idx": 1, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "与机器人查票员沟通的说话者", "is_explicit_memory": false} ] } ===End of Examples=== @@ -424,10 +770,11 @@ JSON 要求: - 字符串内部引号必须转义为 `\"` - 不要使用中文引号 - 字符串值中不要换行 -- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译;但用户自指必须规范成 `用户` +- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,但用户自指必须规范成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名 - `description` 必须使用中文 - `type`、`predicate`、`type_description`、`predicate_description` 必须使用上方预定义的中文标签和中文说明 - 如果 `has_unsolved_reference` 是 `true`,输出必须是 `{"entities": [], "triplets": []}` +- 如果存在无法稳定解析的代词或指示表达,输出也必须是 `{"entities": [], "triplets": []}` - 如果没有有效 triplet,返回 `"triplets": []` {% else %} JSON Requirements: @@ -435,10 +782,11 @@ JSON 要求: - Escape internal quotes using `\"` - No Chinese quotation marks - No line breaks inside string values -- `name`, `subject_name`, and `object_name` must keep the original surface form from the source text, except user self-reference which must be normalized to `用户` +- `name`, `subject_name`, and `object_name` keep their original surface forms by default, but user self-reference must be normalized to `用户` and other stably resolvable references must be replaced by their resolved entity names - `description` must be in English - `type`, `predicate`, `type_description`, and `predicate_description` must use the predefined Chinese labels and Chinese definitions above - If `has_unsolved_reference` is `true`, the output must be `{"entities": [], "triplets": []}` +- If unresolved references still remain, the output must also be `{"entities": [], "triplets": []}` - If no valid triplet exists, return `"triplets": []` {% endif %} diff --git a/api/app/repositories/neo4j/create_indexes.py b/api/app/repositories/neo4j/create_indexes.py index fe10322f..9bd624ce 100644 --- a/api/app/repositories/neo4j/create_indexes.py +++ 
b/api/app/repositories/neo4j/create_indexes.py @@ -46,6 +46,12 @@ async def create_fulltext_indexes(): OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } } """) + # 创建 AssistantPruned 剪枝文本全文索引 + await connector.execute_query(""" + CREATE FULLTEXT INDEX assistantPrunedFulltext IF NOT EXISTS FOR (p:AssistantPruned) ON EACH [p.text] + OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } } + """) + finally: await connector.close() @@ -135,6 +141,17 @@ async def create_vector_indexes(): `vector.similarity_function`: 'cosine' }} """) + + # AssistantPruned text embedding index (optional, for semantic search on pruned hints) + await connector.execute_query(""" + CREATE VECTOR INDEX assistant_pruned_embedding_index IF NOT EXISTS + FOR (p:AssistantPruned) + ON p.text_embedding + OPTIONS {indexConfig: { + `vector.dimensions`: 1024, + `vector.similarity_function`: 'cosine' + }} + """) finally: await connector.close() @@ -179,6 +196,22 @@ async def create_unique_constraints(): """ ) + # AssistantOriginal.id unique + await connector.execute_query( + """ + CREATE CONSTRAINT assistant_original_id_unique IF NOT EXISTS + FOR (o:AssistantOriginal) REQUIRE o.id IS UNIQUE + """ + ) + + # AssistantPruned.id unique + await connector.execute_query( + """ + CREATE CONSTRAINT assistant_pruned_id_unique IF NOT EXISTS + FOR (p:AssistantPruned) REQUIRE p.id IS UNIQUE + """ + ) + finally: await connector.close() diff --git a/api/app/repositories/neo4j/cypher_queries.py b/api/app/repositories/neo4j/cypher_queries.py index 247da0a9..b0d18482 100644 --- a/api/app/repositories/neo4j/cypher_queries.py +++ b/api/app/repositories/neo4j/cypher_queries.py @@ -1363,154 +1363,60 @@ ORDER BY score DESC LIMIT $limit """ -SEARCH_STATEMENTS_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score -WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id) -OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s) -OPTIONAL MATCH 
(s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity) -RETURN s.id AS id, - s.statement AS statement, - s.end_user_id AS end_user_id, - s.chunk_id AS chunk_id, - s.created_at AS created_at, - s.expired_at AS expired_at, - s.valid_at AS valid_at, - properties(s)['invalid_at'] AS invalid_at, - c.id AS chunk_id_from_rel, - collect(DISTINCT e.id) AS entity_ids, - COALESCE(s.activation_value, s.importance_score, 0.5) AS activation_value, - COALESCE(s.importance_score, 0.5) AS importance_score, - s.last_access_time AS last_access_time, - COALESCE(s.access_count, 0) AS access_count, - score -ORDER BY score DESC -LIMIT $limit -""" -SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """ -CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score -WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id) -WITH e, score -With collect({entity: e, score: score}) AS fulltextResults +# ── Assistant Pruning Nodes & Edges ── -OPTIONAL MATCH (ae:ExtractedEntity) -WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id) - AND ae.aliases IS NOT NULL - AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query)) -WITH fulltextResults, collect(ae) AS aliasEntities - -UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score: - CASE - WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0 - WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9 - ELSE 0.8 - END -}]) AS row -WITH row.entity AS e, row.score AS score -WITH DISTINCT e, MAX(score) AS score -OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e) -OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s) -RETURN e.id AS id, - e.name AS name, - e.end_user_id AS end_user_id, - e.entity_type AS entity_type, - e.created_at AS created_at, - e.expired_at AS expired_at, - e.entity_idx AS entity_idx, - e.statement_id AS statement_id, - e.description AS description, - e.aliases AS aliases, - e.name_embedding AS name_embedding, - e.connect_strength AS connect_strength, 
- collect(DISTINCT s.id) AS statement_ids, - collect(DISTINCT c.id) AS chunk_ids, - COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value, - COALESCE(e.importance_score, 0.5) AS importance_score, - e.last_access_time AS last_access_time, - COALESCE(e.access_count, 0) AS access_count, - score -ORDER BY score DESC -LIMIT $limit -""" - -SEARCH_CHUNKS_BY_CONTENT = """ -CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score -WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id) -OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement) -OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity) -RETURN c.id AS id, - c.end_user_id AS end_user_id, - c.content AS content, - c.dialog_id AS dialog_id, - c.sequence_number AS sequence_number, - collect(DISTINCT s.id) AS statement_ids, - collect(DISTINCT e.id) AS entity_ids, - COALESCE(c.activation_value, 0.5) AS activation_value, - c.last_access_time AS last_access_time, - COALESCE(c.access_count, 0) AS access_count, - score -ORDER BY score DESC -LIMIT $limit -""" - -# MemorySummary keyword search using fulltext index -SEARCH_MEMORY_SUMMARIES_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("summariesFulltext", $query) YIELD node AS m, score -WHERE ($end_user_id IS NULL OR m.end_user_id = $end_user_id) -OPTIONAL MATCH (m)-[:DERIVED_FROM_STATEMENT]->(s:Statement) -RETURN m.id AS id, - m.name AS name, - m.end_user_id AS end_user_id, - m.dialog_id AS dialog_id, - m.chunk_ids AS chunk_ids, - m.content AS content, - m.created_at AS created_at, - COALESCE(m.activation_value, m.importance_score, 0.5) AS activation_value, - COALESCE(m.importance_score, 0.5) AS importance_score, - m.last_access_time AS last_access_time, - COALESCE(m.access_count, 0) AS access_count, - score -ORDER BY score DESC -LIMIT $limit -""" - -# Community keyword search: matches name or summary via fulltext index -SEARCH_COMMUNITIES_BY_KEYWORD = """ -CALL db.index.fulltext.queryNodes("communitiesFulltext", $query) 
YIELD node AS c, score -WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id) -RETURN c.community_id AS id, - c.name AS name, - c.summary AS content, - c.core_entities AS core_entities, - c.member_count AS member_count, - c.end_user_id AS end_user_id, - c.updated_at AS updated_at, - score -ORDER BY score DESC -LIMIT $limit -""" - -FULLTEXT_QUERY_CYPHER_MAPPING = { - Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_KEYWORD, - Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_NAME_OR_ALIAS, - Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_CONTENT, - Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_KEYWORD, - Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_KEYWORD, - Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUALS_BY_KEYWORD +ASSISTANT_ORIGINAL_NODE_SAVE = """ +UNWIND $originals AS orig +MERGE (o:AssistantOriginal {id: orig.id}) +SET o += { + end_user_id: orig.end_user_id, + run_id: orig.run_id, + dialog_id: orig.dialog_id, + pair_id: orig.pair_id, + text: orig.text, + created_at: orig.created_at, + expired_at: orig.expired_at } -USER_ID_QUERY_CYPHER_MAPPING = { - Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_USER_ID, - Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_USER_ID, - Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_USER_ID, - Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_USER_ID, - Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_USER_ID, - Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_USER_ID -} -NODE_ID_QUERY_CYPHER_MAPPING = { - Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_IDS, - Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_IDS, - Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_IDS, - Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_IDS, - Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_IDS, - Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_IDS +RETURN o.id AS uuid +""" + +ASSISTANT_PRUNED_NODE_SAVE = """ +UNWIND $pruneds AS p +MERGE (pr:AssistantPruned {id: p.id}) +SET pr += { + end_user_id: p.end_user_id, + run_id: p.run_id, + dialog_id: p.dialog_id, + 
pair_id: p.pair_id, + text: p.text, + memory_type: p.memory_type, + text_embedding: p.text_embedding, + created_at: p.created_at, + expired_at: p.expired_at } +RETURN pr.id AS uuid +""" + +ASSISTANT_PRUNED_EDGE_SAVE = """ +UNWIND $edges AS edge +MATCH (o:AssistantOriginal {id: edge.source}) +MATCH (p:AssistantPruned {id: edge.target}) +MERGE (o)-[r:PRUNED_TO]->(p) +SET r.pair_id = edge.pair_id, + r.end_user_id = edge.end_user_id, + r.run_id = edge.run_id, + r.created_at = edge.created_at +RETURN elementId(r) AS uuid +""" + +ASSISTANT_DIALOG_EDGE_SAVE = """ +UNWIND $edges AS edge +MATCH (o:AssistantOriginal {id: edge.source}) +MATCH (d:Dialogue {id: edge.target}) +MERGE (o)-[r:BELONGS_TO_DIALOG]->(d) +SET r.end_user_id = edge.end_user_id, + r.run_id = edge.run_id, + r.created_at = edge.created_at +RETURN elementId(r) AS uuid +""" diff --git a/api/app/repositories/neo4j/graph_saver.py b/api/app/repositories/neo4j/graph_saver.py index 6f0e03a5..6109f189 100644 --- a/api/app/repositories/neo4j/graph_saver.py +++ b/api/app/repositories/neo4j/graph_saver.py @@ -24,6 +24,10 @@ from app.core.memory.models.graph_models import ( EntityEntityEdge, PerceptualNode, PerceptualEdge, + AssistantOriginalNode, + AssistantPrunedNode, + AssistantPrunedEdge, + AssistantDialogEdge, ) import logging @@ -166,6 +170,10 @@ async def save_dialog_and_statements_to_neo4j( statement_entity_edges: List[StatementEntityEdge], perceptual_edges: List[PerceptualEdge], connector: Neo4jConnector, + assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None, + assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None, + assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None, + assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None, ) -> bool: """Save dialogue nodes, chunk nodes, statement nodes, entities, and all relationships to Neo4j using graph models. 
@@ -368,6 +376,55 @@ async def save_dialog_and_statements_to_neo4j( results['perceptual_chunk_edges'] = perceptual_edges_uuids logger.info(f"Successfully saved {len(perceptual_edges_uuids)} perceptual-chunk edges to Neo4j") + # 8. Save assistant original nodes + if assistant_original_nodes: + from app.repositories.neo4j.cypher_queries import ASSISTANT_ORIGINAL_NODE_SAVE + original_data = [node.model_dump() for node in assistant_original_nodes] + result = await tx.run(ASSISTANT_ORIGINAL_NODE_SAVE, originals=original_data) + original_uuids = [record["uuid"] async for record in result] + results['assistant_originals'] = original_uuids + logger.info(f"Successfully saved {len(original_uuids)} assistant original nodes to Neo4j") + + # 9. Save assistant pruned nodes + if assistant_pruned_nodes: + from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_NODE_SAVE + pruned_data = [node.model_dump() for node in assistant_pruned_nodes] + result = await tx.run(ASSISTANT_PRUNED_NODE_SAVE, pruneds=pruned_data) + pruned_uuids = [record["uuid"] async for record in result] + results['assistant_pruneds'] = pruned_uuids + logger.info(f"Successfully saved {len(pruned_uuids)} assistant pruned nodes to Neo4j") + + # 10. Save PRUNED_TO edges (Original → Pruned) + if assistant_pruned_edges: + from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_EDGE_SAVE + edge_data = [{ + "source": edge.source, + "target": edge.target, + "pair_id": edge.pair_id, + "end_user_id": edge.end_user_id, + "run_id": edge.run_id, + "created_at": edge.created_at.isoformat() if edge.created_at else None, + } for edge in assistant_pruned_edges] + result = await tx.run(ASSISTANT_PRUNED_EDGE_SAVE, edges=edge_data) + pruned_edge_uuids = [record["uuid"] async for record in result] + results['assistant_pruned_edges'] = pruned_edge_uuids + logger.info(f"Successfully saved {len(pruned_edge_uuids)} PRUNED_TO edges to Neo4j") + + # 11. 
Save BELONGS_TO_DIALOG edges (Original → Dialogue) + if assistant_dialog_edges: + from app.repositories.neo4j.cypher_queries import ASSISTANT_DIALOG_EDGE_SAVE + edge_data = [{ + "source": edge.source, + "target": edge.target, + "end_user_id": edge.end_user_id, + "run_id": edge.run_id, + "created_at": edge.created_at.isoformat() if edge.created_at else None, + } for edge in assistant_dialog_edges] + result = await tx.run(ASSISTANT_DIALOG_EDGE_SAVE, edges=edge_data) + dialog_edge_uuids = [record["uuid"] async for record in result] + results['assistant_dialog_edges'] = dialog_edge_uuids + logger.info(f"Successfully saved {len(dialog_edge_uuids)} BELONGS_TO_DIALOG edges to Neo4j") + return results try: