refactor(memory): enhance extraction ontology and add assistant pruning graph support

- Expand entity type ontology with detailed definitions, examples, and notes
  (merged types: 地点设施, 物品设备, 产品服务, 软件平台, 角色职业, 知识能力, 偏好习惯目标, 称呼别名, 智能体)
- Add relation ontology taxonomy with 15 predicate categories and usage rules
- Strengthen reference resolution rules: resolve pronouns before extraction,
  skip unresolvable references entirely
- Add guidelines to avoid extracting abstract propositions, emotions, and
  low-value entities (effort/reward/success patterns)
- Add 7 new extraction examples covering edge cases
- Add AssistantOriginal/AssistantPruned node models and graph persistence
  (PRUNED_TO and BELONGS_TO_DIALOG edges, Neo4j indexes and constraints)
- Add graph_build_step.py for building graph nodes/edges from DialogData
- Update write_pipeline.py to pass assistant pruning nodes/edges to graph saver
- Update data_pruning.py with related preprocessing changes
This commit is contained in:
lanceyq
2026-04-28 13:32:29 +08:00
parent 2355536b44
commit 7747ed7ac1
11 changed files with 917 additions and 421 deletions

View File

@@ -578,3 +578,47 @@ class PerceptualNode(Node):
domain: str
file_type: str
summary_embedding: list[float] | None
class AssistantOriginalNode(Node):
    """Graph node that keeps an Assistant reply exactly as it was before pruning.

    Attributes:
        pair_id: Identifier shared with the matching AssistantPrunedNode.
        dialog_id: Identifier of the dialogue that contains this reply.
        text: Complete, unmodified Assistant response text.
    """

    pair_id: str = Field(
        ...,
        description="Shared pairing ID with the corresponding pruned node",
    )
    dialog_id: str = Field(
        ...,
        description="ID of the parent dialogue",
    )
    text: str = Field(
        ...,
        description="Original Assistant message text",
    )
class AssistantPrunedNode(Node):
    """Node storing the pruned (compressed) text of an Assistant message.

    Attributes:
        pair_id: Shared ID with the corresponding AssistantOriginalNode for pairing.
        dialog_id: ID of the parent dialogue this message belongs to.
        text: The pruned memory hint text (or "NULL" if no memory value).
        memory_type: Type of the memory hint
            (comfort|suggestion|recommendation|warning|instruction|NULL).
        text_embedding: Optional embedding vector for semantic search on the
            pruned text; defaults to None when no embedding is supplied.
    """

    pair_id: str = Field(..., description="Shared pairing ID with the corresponding original node")
    dialog_id: str = Field(..., description="ID of the parent dialogue")
    text: str = Field(..., description="Pruned assistant memory hint text")
    memory_type: str = Field(
        ...,
        description="Memory type: comfort|suggestion|recommendation|warning|instruction|NULL",
    )
    # Spelled with built-in generics and a PEP 604 union so it matches the
    # `list[float] | None` embedding field already declared on PerceptualNode
    # in this module and does not depend on typing.Optional / typing.List.
    text_embedding: list[float] | None = Field(None, description="Embedding vector for semantic search")
class AssistantPrunedEdge(Edge):
    """PRUNED_TO edge linking an AssistantOriginal node to its AssistantPruned node.

    Attributes:
        pair_id: Pairing identifier carried on the edge for traceability.
    """

    pair_id: str = Field(
        ...,
        description="Shared pairing ID for traceability",
    )
class AssistantDialogEdge(Edge):
    """BELONGS_TO_DIALOG edge from an AssistantOriginal node to its parent Dialogue node.

    Declares no fields of its own beyond those inherited from Edge.
    """

View File

@@ -77,6 +77,10 @@ class ExtractionResult(BaseModel):
stmt_entity_edges: List[StatementEntityEdge]
entity_entity_edges: List[EntityEntityEdge]
perceptual_edges: List[PerceptualEdge]
assistant_original_nodes: List[Any] = Field(default_factory=list)
assistant_pruned_nodes: List[Any] = Field(default_factory=list)
assistant_pruned_edges: List[Any] = Field(default_factory=list)
assistant_dialog_edges: List[Any] = Field(default_factory=list)
dialog_data_list: List[Any] = Field(
default_factory=list,
description="原始 DialogData 列表,类型为 Any 以避免循环依赖",
@@ -482,6 +486,10 @@ class WritePipeline:
stmt_entity_edges=dedup_result.statement_entity_edges,
entity_entity_edges=dedup_result.entity_entity_edges,
perceptual_edges=graph.perceptual_edges,
assistant_original_nodes=graph.assistant_original_nodes,
assistant_pruned_nodes=graph.assistant_pruned_nodes,
assistant_pruned_edges=graph.assistant_pruned_edges,
assistant_dialog_edges=graph.assistant_dialog_edges,
dialog_data_list=dialog_data_list,
)
@@ -523,6 +531,10 @@ class WritePipeline:
entity_edges=result.entity_entity_edges,
perceptual_edges=result.perceptual_edges,
connector=self._neo4j_connector,
assistant_original_nodes=result.assistant_original_nodes,
assistant_pruned_nodes=result.assistant_pruned_nodes,
assistant_pruned_edges=result.assistant_pruned_edges,
assistant_dialog_edges=result.assistant_dialog_edges,
)
if success:
logger.info("Successfully saved all data to Neo4j")

View File

@@ -15,7 +15,9 @@ import hashlib
import json
import logging
from collections import OrderedDict
from datetime import datetime
from typing import List, Optional, Dict
from uuid import uuid4
from pydantic import BaseModel, Field
@@ -39,6 +41,16 @@ def message_has_files(message: "ConversationMessage") -> bool:
return message.files and len(message.files) > 0
class AssistantPruningRecord(BaseModel):
    """Pruning record for one User-Assistant message pair, kept for a later Neo4j write.

    Attributes:
        pair_id: Unique pairing ID (see the field description).
        original_text: Full original Assistant reply.
        pruned_text: Text left after pruning, or the literal 'NULL'.
        memory_type: One of comfort|suggestion|recommendation|warning|instruction|NULL.
        created_at: ISO-formatted timestamp string.
    """

    pair_id: str = Field(
        ...,
        description="唯一配对 IDOriginal 和 Pruned 节点共享",
    )
    original_text: str = Field(
        ...,
        description="Assistant 原始回复全文",
    )
    pruned_text: str = Field(
        ...,
        description="剪枝后文本assistant_memory_hint'NULL'",
    )
    memory_type: str = Field(
        ...,
        description="comfort|suggestion|recommendation|warning|instruction|NULL",
    )
    created_at: str = Field(
        ...,
        description="ISO 时间戳",
    )
class AssistantPruningResponse(BaseModel):
"""LLM 对单个 User-Assistant 消息对的剪枝结果。
@@ -95,6 +107,9 @@ class SemanticPruner:
# Snapshot 数据收集:每个消息对的 input + gold
self._snapshot_records: List[Dict] = []
# 剪枝记录:用于后续写入 Neo4j(AssistantOriginal + AssistantPruned 节点)
self.pruning_records: List[AssistantPruningRecord] = []
# 运行日志
self.run_logs: List[str] = []
@@ -246,6 +261,15 @@ class SemanticPruner:
},
})
# 收集剪枝记录(用于后续写入 Neo4j)
self.pruning_records.append(AssistantPruningRecord(
pair_id=uuid4().hex,
original_text=asst_msg.msg,
pruned_text=result.assistant_memory_hint,
memory_type=result.assistant_memory_type,
created_at=datetime.now().isoformat(),
))
if result.assistant_memory_hint == "NULL":
self._log(
f" [{label}] 索引{asst_idx} → NULL删除 "

View File

@@ -855,6 +855,7 @@ class NewExtractionOrchestrator:
entity_idx=e.entity_idx,
name=e.name,
type=e.type,
type_description=getattr(e, "type_description", ""),
description=e.description,
is_explicit_memory=e.is_explicit_memory,
)
@@ -865,6 +866,7 @@ class NewExtractionOrchestrator:
subject_name=t.subject_name,
subject_id=t.subject_id,
predicate=t.predicate,
predicate_description=getattr(t, "predicate_description", ""),
object_name=t.object_name,
object_id=t.object_id,
)

View File

@@ -28,6 +28,10 @@ from app.core.memory.models.graph_models import (
StatementChunkEdge,
StatementEntityEdge,
StatementNode,
AssistantOriginalNode,
AssistantPrunedNode,
AssistantPrunedEdge,
AssistantDialogEdge,
)
from app.core.memory.models.message_models import DialogData, TemporalInfo
@@ -47,6 +51,10 @@ class GraphBuildResult:
"stmt_entity_edges",
"entity_entity_edges",
"perceptual_edges",
"assistant_original_nodes",
"assistant_pruned_nodes",
"assistant_pruned_edges",
"assistant_dialog_edges",
)
def __init__(
@@ -60,6 +68,10 @@ class GraphBuildResult:
stmt_entity_edges: List[StatementEntityEdge],
entity_entity_edges: List[EntityEntityEdge],
perceptual_edges: List[PerceptualEdge],
assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None,
assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None,
assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None,
assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None,
):
self.dialogue_nodes = dialogue_nodes
self.chunk_nodes = chunk_nodes
@@ -70,6 +82,10 @@ class GraphBuildResult:
self.stmt_entity_edges = stmt_entity_edges
self.entity_entity_edges = entity_entity_edges
self.perceptual_edges = perceptual_edges
self.assistant_original_nodes = assistant_original_nodes or []
self.assistant_pruned_nodes = assistant_pruned_nodes or []
self.assistant_pruned_edges = assistant_pruned_edges or []
self.assistant_dialog_edges = assistant_dialog_edges or []
async def build_graph_nodes_and_edges(
@@ -343,6 +359,77 @@ async def build_graph_nodes_and_edges(
f"实体-实体边: {len(entity_entity_edges)}"
)
# ── Assistant pruning nodes and edges ──
# For every pruning record stored in a dialog's metadata, emit:
#   * one AssistantOriginalNode (always),
#   * one BELONGS_TO_DIALOG edge from that node to the dialog (always),
#   * one AssistantPrunedNode plus a PRUNED_TO edge, unless the pruned text
#     is the literal "NULL".
assistant_original_nodes: List[AssistantOriginalNode] = []
assistant_pruned_nodes: List[AssistantPrunedNode] = []
assistant_pruned_edges: List[AssistantPrunedEdge] = []
assistant_dialog_edges: List[AssistantDialogEdge] = []

for dialog_data in dialog_data_list:
    for record in dialog_data.metadata.get("assistant_pruning_records", []):
        pair_id = record["pair_id"]
        original_id = f"ao_{pair_id}"
        pruned_id = f"ap_{pair_id}"
        short_pair = pair_id[:8]

        # Keyword arguments shared by every edge created for this record.
        edge_fields = {
            "end_user_id": dialog_data.end_user_id,
            "run_id": dialog_data.run_id,
            "created_at": dialog_data.created_at,
        }
        # Nodes additionally carry expiry and pairing/dialog identifiers.
        node_fields = {
            **edge_fields,
            "expired_at": dialog_data.expired_at,
            "pair_id": pair_id,
            "dialog_id": dialog_data.id,
        }

        # The original node is always created so the raw reply is recorded.
        assistant_original_nodes.append(
            AssistantOriginalNode(
                id=original_id,
                name=f"AssistantOriginal_{short_pair}",
                text=record["original_text"],
                **node_fields,
            )
        )

        # BELONGS_TO_DIALOG: Original → Dialogue
        assistant_dialog_edges.append(
            AssistantDialogEdge(
                source=original_id,
                target=dialog_data.id,
                **edge_fields,
            )
        )

        # A "NULL" pruned text yields neither a pruned node nor a PRUNED_TO edge.
        if record["pruned_text"] != "NULL":
            assistant_pruned_nodes.append(
                AssistantPrunedNode(
                    id=pruned_id,
                    name=f"AssistantPruned_{short_pair}",
                    text=record["pruned_text"],
                    memory_type=record["memory_type"],
                    **node_fields,
                )
            )

            # PRUNED_TO: Original → Pruned
            assistant_pruned_edges.append(
                AssistantPrunedEdge(
                    source=original_id,
                    target=pruned_id,
                    pair_id=pair_id,
                    **edge_fields,
                )
            )

# Only log a summary when at least one original node was produced.
if assistant_original_nodes:
    logger.info(
        f"Assistant 剪枝节点创建完成 - "
        f"原始节点: {len(assistant_original_nodes)}, "
        f"剪枝节点: {len(assistant_pruned_nodes)}"
    )
if progress_callback:
nodes_edges_stats = {
"dialogue_nodes_count": len(dialogue_nodes),
@@ -365,4 +452,8 @@ async def build_graph_nodes_and_edges(
stmt_entity_edges=stmt_entity_edges,
entity_entity_edges=entity_entity_edges,
perceptual_edges=perceptual_edges,
assistant_original_nodes=assistant_original_nodes,
assistant_pruned_nodes=assistant_pruned_nodes,
assistant_pruned_edges=assistant_pruned_edges,
assistant_dialog_edges=assistant_dialog_edges,
)

View File

@@ -1,199 +1,130 @@
{#
对话级抽取与相关性判定模板(用于剪枝加速)
输入pruning_scene, ontology_class_infos, dialog_text, language
- ontology_class_infos: List[{class_name: str, class_description: str}]
输出:严格 JSON不要包含任何多余文本字段
- is_related: bool是否与所选场景相关
- times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等)
- ids: [string],编号/ID/订单号/申请号/账号等
- amounts: [string],金额/费用/价格相关(带单位或货币符号)
- contacts: [string],联系方式(电话/手机号/邮箱/微信/QQ等
- addresses: [string],地址/地点相关文本
- keywords: [string],其它有助于保留的重要关键词(与场景强相关的术语)
- preserve_keywords: [string],必须保留的情绪/兴趣/爱好/个人偏好相关词或短语片段
你是一个面向记忆存储的 Assistant 辅助信息提取器。
要求
- 必须只输出上述 JSON且键名一致不得输出解释、前后缀不得包含注释。
- times/ids/amounts/contacts/addresses/keywords/preserve_keywords 仅抽取原文片段或规范化后的简单字符串。
- 仅输出上述键;避免多余解释或字段。
#}
任务
{# ── 确定场景说明 ── #}
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
{% if language == 'en' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %}
{% else %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %}
{% endif %}
{% else %}
{% if language == 'en' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
{% endif %}
{% endif %}
- 输入是一个 JSON对话放在 `msgs` 数组里,且数组中只有两条消息:第一条是 `User`,第二条是 `Assistant`。
- 你只处理第二条消息里的 `Assistant.msg`。
- 第一条消息里的 `User.msg` 只用于理解上下文,不允许出现在输出里。
- 你的输出必须包含两个字段:
1. `assistant_memory_hint`
2. `assistant_memory_type`
{% if language == "zh" %}
你是一个对话内容分析助手。请对下方对话全文进行一次性分析,完成两项任务:
1. 判断对话是否与指定场景相关;
2. 从对话中抽取所有需要保留的重要信息片段。
目标:
场景说明:{{ instruction }}
- 从 `Assistant.msg` 中提取一条适合后续检索的极短辅助摘要。
- 删除冗长解释、寒暄、礼貌话术、重复复述和空泛铺垫。
- 允许做摘要式改写,但只能保留原消息中已经出现的建议、推荐、提醒、安慰、步骤或其他对后续记忆有帮助的核心内容。
- 如果没有值得保留的信息,`assistant_memory_hint` 输出 `"NULL"``assistant_memory_type` 也输出 `"NULL"`。
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
【本场景实体类型定义】
以下实体类型定义了本场景中哪些内容是重要的。
凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段:
硬约束:
{% for info in ontology_class_infos %}
- {{ info.class_name }}{{ info.class_description }}
{% endfor %}
- 不得改写、复述或输出 `User.msg`。
- 不得捏造新事实、新建议、新步骤、新材料。
- 不得改变 `Assistant` 原始语义和立场。
- 可以压缩、合并、重写 `Assistant.msg`,但必须忠于原内容。
- `assistant_memory_type` 只能从以下枚举中选择:
`comfort | suggestion | recommendation | warning | instruction | NULL`
- 只输出严格 JSON不要输出解释。
重要提示只要对话中出现与上述任意实体类型相关的内容即判定为相关is_related=true
{% endif %}
压缩原则:
---
【必须保留的内容(不可删除)】
以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段:
- 时间信息:日期、时间点、时间段、有效期 → times 字段
- 编号信息学号、工号、订单号、申请号、账号、ID → ids 字段
- 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段)
- 联系方式电话、手机号、邮箱、微信、QQ → contacts 字段
- 地址信息:地点、地址、位置 → addresses 字段
- 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段)
- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
- **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了")→ preserve_keywords 字段
- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段
- 优先保留具体建议、推荐、提醒、操作步骤、风险提示、安慰动作。
- 优先删除长背景解释、寒暄、礼貌收尾、对用户原话的重复复述。
- 如果原文是长说明、长步骤、长菜谱,输出更短的概要版本,但不要丢掉核心意图。
- 优先保留最短但仍有信息密度的版本。
- `assistant_memory_hint` 尽量写成完整句,不要只写零散词组或标签。
- 优先使用显式主语来写结果,例如:
`安慰了用户……`
`建议用户……`
`推荐用户……`
`提醒用户……`
【场景无关内容标记】
请从对话中识别出与当前场景({{ pruning_scene }}**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。
判断标准:
- 与场景实体类型完全无关
- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联)
- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等)
注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。
**重要scene_unrelated_snippets 必须认真填写,不能为空数组。**
如果对话中存在与场景无关的内容,必须将其原文片段提取出来。
示例(场景=在线教育):
- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets
- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets
- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets
示例(场景=情感陪伴):
- "我最近心情很差,跟室友闹矛盾了" → 与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets
- "期末考试12月25日3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets
- "我上次高数作业87分这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets
- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets
【可以删除的内容】
以下类型的内容属于低价值信息,可以在剪枝时删除:
- 纯寒暄问候:如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语
- 纯表情/符号:如"[微笑]"、"😊"、"哈哈"等
- 重复确认:如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复
- 无意义填充:如"啊"、"呢"、"嘛"等语气词单独成句
**注意:即使消息很短,只要包含情绪、兴趣、爱好、个人观点等有价值信息,就必须保留,不得删除。**
例如:
- "我好开心呀" → 包含情绪开心必须保留preserve_keywords 中加入"开心"
- "好喜欢打羽毛球呀" → 包含兴趣爱好喜欢打羽毛球必须保留preserve_keywords 中加入"喜欢打羽毛球"
- "我好难过" → 包含情绪难过必须保留preserve_keywords 中加入"难过"
- "太好啦!看到你开心,我也跟着心情亮起来" → 包含情绪必须保留preserve_keywords 中加入"开心"
---
对话全文:
"""
{{ dialog_text }}
"""
只输出严格 JSON键固定、顺序不限
Few-shot 示例 1
输入:
{
"is_related": <true 或 false>,
"times": [<string>...],
"ids": [<string>...],
"amounts": [<string>...],
"contacts": [<string>...],
"addresses": [<string>...],
"keywords": [<string>...],
"preserve_keywords": [<string>...],
"scene_unrelated_snippets": [<string>...]
"msgs": [
{
"role": "User",
"msg": "我室友小雯这学期一直在准备毕业论文,这两周都在改答辩 PPT。她下周三答辩我有点担心她会紧张。"
},
{
"role": "Assistant",
"msg": "听起来你很关心小雯,也希望她答辩顺利。她现在紧张其实很正常,很多人在答辩前都会这样。"
}
]
}
{% else %}
You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
1. Determine whether the dialogue is relevant to the specified scene;
2. Extract all important information fragments that must be preserved.
Scenario Description: {{ instruction }}
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
[Scene Entity Type Definitions]
The following entity types define what content is important in this scene.
Content related to ANY of these types must be preserved and extracted into the keywords field:
{% for info in ontology_class_infos %}
- {{ info.class_name }}: {{ info.class_description }}
{% endfor %}
Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true).
{% endif %}
---
[MUST PRESERVE (cannot be deleted)]
The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
- Time information: dates, time points, durations, expiry dates → times field
- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here)
- Contact information: phone numbers, emails, WeChat, QQ → contacts field
- Address information: locations, addresses, places → addresses field
- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here)
- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field
- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords
[Scene-Unrelated Content Marking]
Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field.
Criteria:
- Completely unrelated to the scene's entity types
- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated)
- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene)
Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully.
[CAN BE DELETED]
The following types of content are low-value and can be removed during pruning:
- Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content
- Pure emojis/symbols: e.g., "[smile]", "😊", "haha"
- Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information
- Meaningless fillers: standalone interjections like "ah", "well", "hmm"
**Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.**
Examples:
- "I'm so happy!" → contains emotion (happy), must preserve; add "happy" to preserve_keywords
- "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords
- "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords
---
Full Dialogue:
"""
{{ dialog_text }}
"""
Output strict JSON only (fixed keys, order doesn't matter):
输出:
{
"is_related": <true or false>,
"times": [<string>...],
"ids": [<string>...],
"amounts": [<string>...],
"contacts": [<string>...],
"addresses": [<string>...],
"keywords": [<string>...],
"preserve_keywords": [<string>...],
"scene_unrelated_snippets": [<string>...]
"assistant_memory_hint": "安慰了用户对室友答辩状态的担忧。",
"assistant_memory_type": "comfort"
}
{% endif %}
Few-shot 示例 2
输入:
{
"msgs": [
{
"role": "User",
"msg": "我最近总失眠,已经两周了,想先自己调一调。"
},
{
"role": "Assistant",
"msg": "如果你想先自己调整,可以先减少咖啡因摄入,尤其下午和晚上尽量不要再喝咖啡或浓茶,同时把睡前刷手机的时间压缩一些,尽量固定上床时间,先连续观察几天。"
}
]
}
输出:
{
"assistant_memory_hint": "建议用户减少咖啡因摄入、减少睡前刷手机时间并固定上床时间。",
"assistant_memory_type": "suggestion"
}
Few-shot 示例 3
输入:
{
"msgs": [
{
"role": "User",
"msg": "我晚上想做个简单点的减脂餐,最好二十分钟左右能搞定。"
},
{
"role": "Assistant",
"msg": "你可以做一个鸡胸肉沙拉碗,主要用鸡胸肉、生菜、黄瓜和圣女果。鸡胸肉简单煎熟切块后和蔬菜拌在一起,调味尽量用橄榄油加一点醋,不要放太多沙拉酱。"
}
]
}
输出:
{
"assistant_memory_hint": "推荐用户做鸡胸肉沙拉碗,并提醒用户调味时少放沙拉酱。",
"assistant_memory_type": "recommendation"
}
Few-shot 示例 4
输入:
{
"msgs": [
{
"role": "User",
"msg": "剪枝引擎和萃取引擎我都想先做,但是估计都会比较花时间。"
},
{
"role": "Assistant",
"msg": "这两个模块都涉及比较多的设计和实现细节。如果你想先推进,我建议先拆需求,再分别评估开发量。"
}
]
}
输出:
{
"assistant_memory_hint": "建议用户先拆需求,再分别评估两个模块的开发量。",
"assistant_memory_type": "suggestion"
}
现在处理下面这个输入。
输入:
{{ dialog_text }}
只输出严格 JSON
{
"assistant_memory_hint": "<string or NULL>",
"assistant_memory_type": "comfort | suggestion | recommendation | warning | instruction | NULL"
}

View File

@@ -2,7 +2,7 @@
{{ input_json }}
{%- endmacro %}
===Tasks===
=== Tasks ===
{% if language == "zh" %}
你的任务是从提供的目标文本中识别并提取陈述句,并为每条陈述句标注以下信息:
@@ -11,11 +11,12 @@
- statement_text
- statement_type
- temporal_type
- has_emotional_state
- has_unsolved_reference
- valid_at
- invalid_at
每条输出都应是一个结构化的记忆候选陈述句。
每条输出都应是一个结构化的候选记忆陈述句。
{% else %}
Your task is to identify and extract declarative statements from the provided target text, and annotate each extracted statement with:
@@ -23,6 +24,7 @@ Your task is to identify and extract declarative statements from the provided ta
- statement_text
- statement_type
- temporal_type
- has_emotional_state
- has_unsolved_reference
- valid_at
- invalid_at
@@ -30,7 +32,7 @@ Your task is to identify and extract declarative statements from the provided ta
Each output item should be a structured candidate memory statement.
{% endif %}
===Inputs===
=== Inputs ===
{% if language == "zh" %}
- chunk_id: chunk 唯一 ID
@@ -48,7 +50,7 @@ Each output item should be a structured candidate memory statement.
- supporting_context.msgs: ordered contextual messages, which may include User and Assistant messages
{% endif %}
===Scope===
=== Scope ===
{% if language == "zh" %}
- 只从 `target_content` 中提取陈述句。
@@ -66,12 +68,12 @@ Each output item should be a structured candidate memory statement.
- Every output statement must be directly grounded in wording from `target_content`.
{% endif %}
===Extraction Rules===
=== Extraction Rules ===
{% if language == "zh" %}
拆分规则:
- 以“一个完整意思”为单位提取陈述句,通常对应一个完整句子或一个自然语义片段。
- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清的重要信息时,才拆成多条。
- 默认保留句子级结构;只有当一个句子内部包含两个及以上彼此独立、拆开后明显更清晰的重要信息时,才拆成多条。
- 宁可多提取,也不要漏掉 `target_content` 中能独立成立、且语义稳定的 statement。
- 但不要为了提高覆盖率而引入原文没有的信息,或输出语义不成立的 statement。
@@ -82,6 +84,9 @@ Each output item should be a structured candidate memory statement.
共指消解:
- 先完成最终的 `statement_text` 改写,再判断 `has_unsolved_reference`。
- `has_unsolved_reference` 必须基于最终输出的 `statement_text` 判断,而不是基于原始 `target_content` 里是否出现过代词来判断。
- 如果最终 `statement_text` 已经把引用改写成具体实体名,例如“助理恭喜用户”“小李点了一杯美式咖啡”,则 `has_unsolved_reference` 必须是 `false`。
- 如果可以解析到具体实体名,优先输出具体实体名,并将 `has_unsolved_reference` 设为 `false`。
- 如果不能解析到具体实体名,但可以解析到最小必要描述,则输出该最小必要描述,并将 `has_unsolved_reference` 设为 `true`。
- 如果既不能解析到具体实体名,也不能稳定解析到最小必要描述,则保留最小必要原始表达,并将 `has_unsolved_reference` 设为 `true`。
@@ -117,6 +122,15 @@ statement_type
- 如果没有明确时间,不要编造时间。
- 对于点状事件(例如某天发生的一次考试、一次见面、一次提交),`valid_at` 和 `invalid_at` 都应填写为该事件的起止边界;不要只填 `valid_at`。
情感状态判断:
- `has_emotional_state` 只用于判断当前 statement 是否反映了用户的情感状态。
- 如果根据当前 statement 和 supporting_context可以判断用户当前存在某种情感状态则输出 `true`。
- 该字段不是情绪分类字段,不要求输出具体情绪类型。
- 明确情绪表达例如“开心”“难过”“紧张”“有压力”通常应标为 `true`。
- 即使没有明确情绪词,只要语义足以表明用户当前具有情感状态,也可以标为 `true`,例如“我很好”。
- 如果只是客观事实、动作描述或安排,且无法从当前上下文稳定判断用户情感状态,则输出 `false`。
temporal_type
- `STATIC`:相对稳定、持续性的状态、身份、属性、长期偏好、长期关系、长期职业或长期居住状态;若带起始时间,可填 `valid_at``invalid_at` 必须为 `"NULL"`。
@@ -129,7 +143,7 @@ temporal_type
- 允许为解决代词、省略和时间歧义做最小必要改写。
- 不要引入原文未明确表达的新事实、额外推断或风格化概括。
{% else %}
Granularity:
Splitting rules:
- Extract statements at the level of one complete thought, usually one full sentence or one natural semantic unit.
- Preserve sentence-level structure by default; split only when a sentence contains two or more independent and important pieces of information that become clearly easier to understand when separated.
- Prefer higher recall: do not miss independently valid and semantically stable statements in `target_content`.
@@ -149,6 +163,9 @@ Coreference resolution:
Clear vs unresolved reference:
- First produce the final rewritten `statement_text`, then decide `has_unsolved_reference`.
- `has_unsolved_reference` must be judged from the final `statement_text`, not from whether the original `target_content` once contained a pronoun.
- If the final `statement_text` already resolves the reference to a concrete named entity, such as “The assistant congratulates the user” or “Xiao Li ordered an Americano,” then `has_unsolved_reference` must be `false`.
- A reference is fully resolved only if the current `supporting_context` can map it to a concrete named entity.
- `Zhang San`, `Old Zhang` when clearly resolved to Zhang San, `Professor Li`, and `Teacher Wang` are clear references.
- `the user's friend`, `the user's coworker`, `a teacher`, and `an interviewer` are allowed outputs but still count as unresolved.
@@ -177,6 +194,15 @@ Temporal rules:
- If no explicit time is available, do not invent one.
- For point-in-time events such as a single exam, a meeting, or a submission on one day, populate both `valid_at` and `invalid_at`; do not fill only `valid_at`.
Emotional-state detection:
- `has_emotional_state` is used only to judge whether the current statement reflects the user's emotional state.
- If the current statement plus supporting context is sufficient to infer that the user currently has some emotional state, output `true`.
- This field is not an emotion category field. Do not infer or output a specific emotion label here.
- Explicit emotion wording such as “happy”, “sad”, “nervous”, or “under pressure” should usually be marked `true`.
- Statements without explicit emotion words may still be `true` if the user's emotional state is reasonably inferable, such as “I am fine.”
- If the statement is only an objective fact or action description and the user's emotional state cannot be stably inferred from the current context, output `false`.
temporal_type:
- `STATIC`: relatively stable, ongoing states, identities, attributes, long-term preferences, long-term relationships, occupations, or residence states.
@@ -190,7 +216,7 @@ Rewrite boundary:
- Do not introduce unsupported facts, extra inference, or stylistic summarization.
{% endif %}
===Examples===
=== Examples ===
{% if language == "zh" %}
示例 1:
示例输入: {
@@ -219,6 +245,7 @@ Rewrite boundary:
"statement_text": "李教授这学期要求很严。",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
@@ -228,17 +255,19 @@ Rewrite boundary:
"statement_text": "李教授讲课清晰透彻。",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "NULL",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_m1n2o3p4",
"statement_text": "李教授的气场很吓人。",
"statement_text": "用户每次被李教授点名都有点发怵。",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": false,
"valid_at": "NULL",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
}
]
@@ -248,13 +277,13 @@ Rewrite boundary:
示例输入: {
"chunk_id": "chunk_b2c3d4e5",
"end_user_id": "eu_12345678",
"target_content": "我最近在学 Python每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。",
"target_content": "我最近在学Python每天晚上都会练一个小时。这周还打算先把基础语法和函数部分过一遍。",
"target_message_date": "2026-04-01T00:00:00",
"supporting_context": {
"msgs": [
{
"role": "User",
"msg": "我最近在学 Python。"
"msg": "我最近在学Python。"
},
{
"role": "Assistant",
@@ -268,27 +297,30 @@ Rewrite boundary:
"statements": [
{
"statement_id": "stmt_m3n4o5p6",
"statement_text": "用户最近在学 Python。",
"statement_text": "用户最近在学Python。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_q7r8s9t0",
"statement_text": "用户最近每天晚上都会练一个小时 Python。",
"statement_text": "用户最近每都会练一个小时Python。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_u1v2w3x4",
"statement_text": "用户这周打算先复习 Python 的基础语法和函数部分。",
"statement_text": "用户这周打算先复习Python的基础语法和函数部分。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -323,6 +355,7 @@ Rewrite boundary:
"statement_text": "用户觉得那两个有点难。",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": true,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -332,6 +365,7 @@ Rewrite boundary:
"statement_text": "用户昨晚看了半天那两个还是没太搞明白。",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"valid_at": "2026-03-31T00:00:00",
"invalid_at": "2026-03-31T23:59:59"
@@ -341,6 +375,7 @@ Rewrite boundary:
"statement_text": "如果周末还弄不出来,用户可能会去问助教。",
"statement_type": "OTHER",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -375,6 +410,7 @@ Example Output: {
"statement_text": "Professor Li is very strict this semester.",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
@@ -384,17 +420,19 @@ Example Output: {
"statement_text": "Professor Li explains things clearly.",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "NULL",
"invalid_at": "NULL"
},
{
"statement_id": "stmt_m1n2o3p4",
"statement_text": "Professor Li's presence is intimidating.",
"statement_text": "The user gets nervous every time Professor Li calls on the user.",
"statement_type": "OPINION",
"temporal_type": "ATEMPORAL",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": false,
"valid_at": "NULL",
"valid_at": "2023-09-04T18:00:00",
"invalid_at": "NULL"
}
]
@@ -427,6 +465,7 @@ Example Output: {
"statement_text": "The user has been learning Python recently.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -436,6 +475,7 @@ Example Output: {
"statement_text": "The user has recently been practicing Python for an hour every night.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -445,6 +485,7 @@ Example Output: {
"statement_text": "The user plans to review Python basic syntax and functions first this week.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": false,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -479,6 +520,7 @@ Example Output: {
"statement_text": "The user thinks those two things are difficult.",
"statement_type": "OPINION",
"temporal_type": "DYNAMIC",
"has_emotional_state": true,
"has_unsolved_reference": true,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -488,6 +530,7 @@ Example Output: {
"statement_text": "The user spent a long time last night looking at those two things but still did not really understand them.",
"statement_type": "FACT",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"valid_at": "2026-03-31T00:00:00",
"invalid_at": "2026-03-31T23:59:59"
@@ -497,6 +540,7 @@ Example Output: {
"statement_text": "If the user still cannot finish them by the weekend, the user may ask the TA.",
"statement_type": "OTHER",
"temporal_type": "DYNAMIC",
"has_emotional_state": false,
"has_unsolved_reference": true,
"valid_at": "2026-04-01T00:00:00",
"invalid_at": "NULL"
@@ -504,7 +548,7 @@ Example Output: {
]
}
{% endif %}
===End of Examples===
=== End of Examples ===
{% if language == "zh" %}
最终输出前检查:
@@ -512,7 +556,9 @@ Example Output: {
- 是否只保留 `target_content` 中可直接支持的陈述句
- 如果主语是用户,是否统一写“用户”
- 非用户主体是否尽量写成具体名称;若无法做到,是否已正确标记 `has_unsolved_reference = true`
- 如果最终 `statement_text` 已经落到具体实体名,`has_unsolved_reference` 是否已经改为 `false`
- statement_type 是否合法,且没有把一般事实机械标成 `OPINION`
- `has_emotional_state` 是否仅用于判断是否存在情感状态,而没有被当作情绪分类字段
- temporal_type 是否与 valid_at / invalid_at 一致
- 输出是否严格符合 JSON schema
{% else %}
@@ -520,7 +566,9 @@ Example Output: {
- Keep only statements directly supported by `target_content`
- If the subject is the user, render it as “the user”
- Render non-user subjects as concrete names when possible; otherwise mark `has_unsolved_reference = true`
- If the final `statement_text` already resolves the reference to a concrete named entity, ensure `has_unsolved_reference = false`
- Ensure statement_type is valid and do not mechanically label ordinary facts as `OPINION`
- Ensure `has_emotional_state` is used only for emotional-state presence detection, not emotion classification
- Ensure temporal_type is consistent with valid_at and invalid_at
- Ensure the output strictly matches the JSON schema
{% endif %}
@@ -555,8 +603,7 @@ Example Output: {
- Preserve the original language and do not translate.
{% endif %}
现在处理下面这个输入:
{{ render_input() }}
现在处理下面这个输入:{{ render_input() }}
Return only a JSON object matching the schema below:
{
@@ -566,6 +613,7 @@ Return only a JSON object matching the schema below:
"statement_text": "string",
"statement_type": "FACT | OPINION | OTHER",
"temporal_type": "STATIC | DYNAMIC | ATEMPORAL",
"has_emotional_state": "boolean",
"has_unsolved_reference": "boolean",
"valid_at": "string | NULL",
"invalid_at": "string | NULL"

View File

@@ -5,13 +5,21 @@ Extract entities and knowledge triplets from the given statement.
重要:
- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译。
- 但对用户自指表达,如“我”“我的”“我自己”,统一规范为 `用户`
- 但在抽取前,必须先做指代解析
- 用户自指表达,如“我”“我的”“我自己”,一律规范为 `用户`。
- 非用户自指代词或指示表达,如“他”“她”“它”“这个”“那个”“这家”“那家”“这里”“那里”,如果能从 `supporting_context` 中稳定解析出具体指代,则必须替换为具体指代实体名。
- 如果上述代词或指示表达不能稳定解析,则整条跳过。
- 命名关系中新出现的称呼、别名、昵称、产品名保持原样,不做替换。
- `description` 使用中文。
- `type`、`predicate`、`type_description`、`predicate_description` 一律使用中文。
{% else %}
Important:
- Keep `name`, `subject_name`, and `object_name` in their original surface form from the source text. Do not translate them.
- Exception: normalize user self-reference such as "I", "me", and "myself" to `用户`.
- Keep `name`, `subject_name`, and `object_name` in their original surface form from the source text by default.
- But you MUST resolve references before extraction.
- Normalize user self-reference such as "I", "me", and "myself" to `用户`.
- For non-user pronouns or demonstratives such as "he", "she", "it", "this", "that", "this company", "that place", if a stable referent can be resolved from `supporting_context`, replace them with the resolved entity name.
- If such references cannot be resolved stably, skip the entire statement.
- Newly introduced names in naming or alias expressions must stay in their original form.
- Generate `description` in English.
- Always generate `type`, `predicate`, `type_description`, and `predicate_description` in Chinese.
{% endif %}
@@ -69,11 +77,13 @@ Primary statement to analyze:
开始抽取前,先检查 `has_unsolved_reference`。
- 如果 `has_unsolved_reference` 是 `true`,不要抽取任何内容。
- 此时必须返回:
- 如果 `statement_text` 中仍存在无法稳定解析的代词、指示词或省略主体,也应视为 unresolved reference。
- 这两种情况下都必须返回:
{% else %}
Before any extraction, check `has_unsolved_reference`.
- If `has_unsolved_reference` is `true`, do not extract anything.
- In that case, return exactly:
- If unresolved pronouns, demonstratives, or omitted subjects still remain in `statement_text`, treat the statement as unresolved as well.
- In either case, return exactly:
{% endif %}
```json
@@ -86,8 +96,10 @@ Primary statement to analyze:
{% if language == "zh" %}
- 不要在引用未解析时尝试部分抽取。
- 不要保留“他”“这个”“那个”这类原代词继续输出实体或关系。
{% else %}
- Do not attempt partial extraction when the reference is unresolved.
- Do not keep unresolved forms such as "he", "this", or "that" as extracted entities or relation arguments.
{% endif %}
===Input Boundary===
@@ -100,6 +112,8 @@ Primary statement to analyze:
- 如果 `supporting_context.msgs` 中的 Assistant 消息包含总结、猜测、解释或改写,这些内容只能作为理解辅助,不能直接作为抽取来源。
- `statement_type`、`temporal_type`、`valid_at`、`invalid_at` 是辅助理解字段,不是抽取目标。
- 对 `statement_text` 中的用户自指表达,要统一规范成实体 `用户`。
- 对其他可稳定解析的代词或指示表达,要替换为具体指代实体名后再抽取。
- 对命名关系中新出现的称呼、别名、昵称、产品名,不要因为上下文可推断其所指而直接改写,它们应保持原样作为实体名。
{% else %}
- Treat `statement_text` as the only direct extraction target.
- Use `supporting_context.msgs` only to interpret references, ellipsis, subject identity, and necessary background in `statement_text`.
@@ -108,41 +122,253 @@ Primary statement to analyze:
- If Assistant messages in `supporting_context.msgs` contain summary, guess, interpretation, or rephrasing, use them only as interpretive support and never as a direct extraction source.
- Treat `statement_type`, `temporal_type`, `valid_at`, and `invalid_at` as auxiliary context, not extraction targets.
- Normalize user self-reference in `statement_text` to the entity `用户`.
- Replace other resolvable pronouns or demonstratives with their resolved entity names before extraction.
- For newly introduced names in naming or alias expressions, do not rewrite them even if the context reveals who they refer to; keep them as entity names.
{% endif %}
===预定义实体类型===
只能使用以下中文实体类型。如果没有完全匹配的类型,请选择最接近的一项,不要发明新类型。
- `人物`: 现实中的具体个人
- `组织`: 公司、机构、团队、社群等组织性主体
- `群体`: 未具名或泛指的一组人
- `地点`: 具有地理或空间意义的位置
- `设施`: 建筑、场馆、房间、实验室等功能性空间
- `地址`: 具体地址或位置描述
- `物品`: 一般具体物体
- `设备`: 具有明确用途的工具或器材
- `产品`: 可被制造、购买、使用的产品
- `交通工具`: 用于出行或运输的工具
- `文档`: 文章、报告、表格、说明等文档
- `媒体`: 图片、音频、视频等媒体对象
- `网站`: 网站、网页或互联网平台
- `软件`: 软件、应用、系统或数字服务
- `账号`: 账号、账户、用户档案
- `标识符`: ID、编号、用户名、工号等标识
- `联系方式`: 电话、邮箱、社交账号等联系方式
- `角色`: 某实体承担的社会或功能角色
- `职业`: 工作或职业身份
- `技能`: 可学习或掌握的能力
- `知识主题`: 主题、领域、方法、理论或知识概念
- `目标`: 希望达成的结果
- `偏好`: 稳定的喜欢、倾向或偏爱
- `习惯`: 重复出现的行为模式
- `语言`: 自然语言或编程语言
- `金额`: 金额或货币数值
- `数量`: 带或不带单位的数量值
- `货币`: 货币单位
- `组织部门`: 组织内部的部门或业务单元
- `称呼`: 用于指代或称呼实体的名字
- `人物`
- definition: 可稳定指向、可被当作具体个体区分和归并的个人实体。
- positive_examples: `用户`、`张三`、`王教授`、`小林`
- negative_examples: `老师`、`导师`、`学生`、`他们`
- notes: 强调“这个人是谁”,不强调他承担的社会身份;用户自指统一归为 `用户`。
- `组织`
- definition: 公司、机构、学校、实验室、团队、社群等组织性主体。
- positive_examples: `腾讯`、`清华大学`、`机器人公司`、`实验室`
- negative_examples: `人事部`、`教研组`、`办公室`
- notes: 如果表达的是组织内部单元,当前一级仍优先并入 `组织`,除非后续单独扩展子类。
- `群体`
- definition: 边界相对稳定、可被当作整体引用的一组人。
- positive_examples: `我的朋友`、`同事们`、`实验室成员`
- negative_examples: `他们`、`一些人`、`一个朋友`
- notes: 只用于边界相对稳定的人群;边界不稳或 unresolved 的表达不要归入 `群体`。
- `智能体`
- definition: 具有行动、交互或执行能力的非人主体,如机器人、AI 或其他智慧体。
- positive_examples: `机器人查票员`、`家务机器人`、`智能助手`
- negative_examples: `手机`、`电脑`、`机器人公司`
- notes: 如果对象只是普通设备,不归入 `智能体`;只有在叙述中被当作主体行动或交互时才使用。
- `角色职业`
- definition: 人物承担的社会角色、功能身份或职业身份。
- positive_examples: `导师`、`老师`、`学生`、`医生`、`程序员`
- negative_examples: `张三`、`王教授`、`我的朋友`
- notes: 强调“这个人是什么身份”,不强调“这个人是谁”;如果文本落到具体个人,优先用 `人物`。
- `地点设施`
- definition: 具有地理意义或功能性空间意义的位置与场所。
- positive_examples: `北京`、`巴黎`、`图书馆`、`办公室`、`教室`
- negative_examples: `这里`、`那里`、`朝这边`、`明天去的地方`
- notes: 地理地点和功能场所当前一级合并;未稳定解析的位置指代表达不要抽取。
- `物品设备`
- definition: 可被持有、使用、携带的具体物体、设备、工具或交通工具。
- positive_examples: `手机`、`电脑`、`相机`、`自行车`
- negative_examples: `微信`、`GitHub`、`会员服务`
- notes: 交通工具当前并入此类;数字服务不归入本类。
- `产品服务`
- definition: 可被购买、使用、消费或订阅的产品或服务。
- positive_examples: `iPhone`、`健身课`、`会员服务`
- negative_examples: `微信`、`GitHub`、`手机`
- notes: 具体商品和服务当前一级合并;纯软件平台优先归入 `软件平台`。
- `软件平台`
- definition: 软件、应用、网站、在线平台或数字服务系统。
- positive_examples: `微信`、`GitHub`、`ChatGPT`、`飞书`
- negative_examples: `iPhone`、`会员服务`、`手机号`
- notes: 软件、网站、平台当前一级合并;如果语境强调的是账号本身,改用 `账号`。
- `账号`
- definition: 账户、账号、用户档案类实体。
- positive_examples: `GitHub账号`、`微信号`
- negative_examples: `用户名`、`工号`、`邮箱`
- notes: 与 `标识符`、`联系方式` 分开;账号是主体可持有的账户对象。
- `标识符`
- definition: 用于识别实体的编号、ID、用户名、学号、工号等标识。
- positive_examples: `学号`、`工号`、`用户名`
- negative_examples: `GitHub账号`、`手机号`
- notes: 当前允许保留,但通常只有在存在明确识别关系时才值得抽取。
- `联系方式`
- definition: 可用于联系实体的电话、邮箱、社交联系地址。
- positive_examples: `手机号`、`邮箱`、`微信联系方式`
- negative_examples: `用户名`、`GitHub账号`
- notes: 当前允许保留,但通常只有在存在明确联系关系时才值得抽取。
- `文档媒体`
- definition: 文章、报告、表格、图片、音频、视频等内容载体。
- positive_examples: `简历`、`论文`、`照片`、`录音`
- negative_examples: `微积分`、`微信`、`学号`
- notes: 文档与媒体当前一级合并;如果只是内容主题,不归入本类。
- `知识能力`
- definition: 可学习、掌握、使用或讨论的知识主题、技能、学科或语言。
- positive_examples: `微积分`、`机器学习`、`写作`、`Python`、`中文`
- negative_examples: `紧张`、`成功`、`意义`
- notes: 不包含情绪、心理状态、抽象结果或价值判断;这些应写入 `description`。
- `偏好习惯目标`
- definition: 用户稳定的偏好、重复习惯,以及具体、明确、用户特异且值得长期保留的目标。
- positive_examples: `喜欢安静环境`、`晨跑`、`通过雅思`
- negative_examples: `紧张`、`开心`、`成功`、`回报`
- notes: 这是高风险类型;只允许稳定偏好、重复习惯、具体目标,不允许抽象愿望或情绪状态。
- `称呼别名`
- definition: 用于指代或称呼实体的名字。
- positive_examples: `山哥`、`老张`、`X1`
- negative_examples: `导师`、`程序员`、`好人`
- notes: 只用于名字性表达,不用于角色、职业、评价词。
实体类型总规则:
- unresolved 或边界不稳的表达,不因“看起来像名词”就创建实体。
- 情绪、心理状态、金额、数量、普通时间、一次性动作短语,默认不作为独立实体类型抽取。
- 抽象命题片段、泛化结果、价值判断,默认不创建实体;如有保留价值,应写入相关高价值实体的 `description`。
实体类型选择原则:
- 优先保留对用户画像、偏好、长期身份、稳定关系或持续兴趣有记忆价值的实体类型。
- 对于“努力”“回报”“意义”“成功”这类泛化概念、抽象命题片段或价值判断,默认不要仅因句中出现就创建实体。
- `群体` 只用于边界相对稳定、可被当作整体引用的人群;像“他们”“一些人”“一个朋友”这类边界不稳或 unresolved 的表达不要归入 `群体`。
- `偏好习惯目标` 只能用于稳定偏好、重复习惯或具体明确的用户目标,不能把抽象结果、泛因果终点、空泛愿望或情绪状态强行归入其中。
- 当前阶段不抽取情绪状态实体;像“紧张”“开心”“难过”“焦虑”“放松”这类情绪或心理状态,不要归入 `知识能力`、`偏好习惯目标` 或其他现有类型。
===关系本体大类===
以下大类是当前 `predicate` 本体树的第一层,用于帮助理解和约束后面的具体关系白名单。输出具体 `predicate` 时仍然必须使用后文列出的细关系,而不是直接输出这些大类名称。
- `命名关系`
- definition: 表达实体名称、别名、称呼之间的对应或使用关系。
- covered_predicates: `别名属于`、`使用称呼`
- positive_examples: `山哥 -> 别名属于 -> 用户`、`我的朋友 -> 使用称呼 -> 山哥`
- negative_examples: `导师 -> 别名属于 -> 用户`、`好人 -> 使用称呼 -> 用户`
- notes: 只处理名字性表达,不处理角色、职业、评价词。
- status: `enabled`
- `类型归属关系`
- definition: 表达实体属于某种类别,或主体承担某种角色/职业身份的关系。
- covered_predicates: `属于类型`、`担任角色`、`从事职业`
- positive_examples: `王教授 -> 担任角色 -> 导师`、`张三 -> 从事职业 -> 程序员`
- negative_examples: `张三 -> 担任角色 -> 山哥`、`用户 -> 从事职业 -> 紧张`
- notes: 用于“是什么”,不用于“叫什么”。
- status: `enabled`
- `成员隶属关系`
- definition: 表达主体属于某个组织、群体或集合的成员归属关系。
- covered_predicates: `成员属于`
- positive_examples: `张三 -> 成员属于 -> 实验室成员`、`用户 -> 成员属于 -> 社群`
- negative_examples: `他们 -> 成员属于 -> 学校`、`一个朋友 -> 成员属于 -> 班级`
- notes: 前提是主体和归属对象都足够稳定;边界不稳的人群不要硬抽。
- status: `enabled`
- `任职服务关系`
- definition: 表达人物或主体在组织中的工作、任职或服务关系。
- covered_predicates: `任职于`
- positive_examples: `张明 -> 任职于 -> 腾讯`、`王教授 -> 任职于 -> 清华大学`
- negative_examples: `张明 -> 任职于 -> 导师`、`用户 -> 任职于 -> 明天的面试`
- notes: 优先用于人物到组织的稳定供职关系。
- status: `enabled`
- `空间位置关系`
- definition: 表达实体与地点、场所、空间位置之间的稳定位置关系。
- covered_predicates: `位于`、`拥有位置`、`居住于`
- positive_examples: `用户 -> 居住于 -> 巴黎`、`办公室 -> 位于 -> 北京`
- negative_examples: `用户 -> 位于 -> 明天下午三点`、`这里 -> 位于 -> 学校`
- notes: 普通时间表达和未解析位置指代不进入此类。
- status: `enabled`
- `前往到访关系`
- definition: 表达主体前往、到访某地点、场所、组织、课程或活动对象的关系。
- covered_predicates: `前往`
- positive_examples: `用户 -> 前往 -> 图书馆`、`用户 -> 前往 -> 公司`
- negative_examples: `用户 -> 前往 -> 明天下午三点`、`用户 -> 前往 -> 复习微积分任务`
- notes: 当前应优先用于稳定倾向或有记忆价值的到访对象,不鼓励因一次性日程而过抽。
- status: `enabled`
- `组成包含关系`
- definition: 表达部分与整体、包含与被包含之间的结构关系。
- covered_predicates: `组成部分`、`包含部分`
- positive_examples: `教研组 -> 组成部分 -> 学院`、`学院 -> 包含部分 -> 教研组`
- negative_examples: `用户 -> 组成部分 -> 图书馆`、`微积分 -> 包含部分 -> 用户`
- notes: 只用于结构性组成关系,不用于临时搭配或抽象联系。
- status: `enabled`
- `拥有持有关系`
- definition: 表达主体拥有、持有、配有某对象、账号、联系方式或标识的关系。
- covered_predicates: `拥有`、`拥有账号`、`拥有联系方式`、`标识为`
- positive_examples: `用户 -> 拥有账号 -> GitHub账号`、`用户 -> 拥有联系方式 -> 邮箱`、`用户 -> 标识为 -> 学号`
- negative_examples: `用户 -> 拥有 -> 紧张`、`努力 -> 拥有 -> 回报`
- notes: 不用于抽象命题、情绪状态或口号式表达。
- status: `enabled`
- `使用采用关系`
- definition: 表达主体使用、采用某工具、产品、平台、语言或资源的关系。
- covered_predicates: `使用`、`使用语言`
- positive_examples: `用户 -> 使用 -> 微信`、`用户 -> 使用语言 -> 中文`
- negative_examples: `用户 -> 使用 -> 成功`、`用户 -> 使用语言 -> 紧张`
- notes: 以后若扩展“采用方法”,也可挂在本大类下。
- status: `enabled`
- `创建生产关系`
- definition: 表达主体创建、撰写、生产某对象或结果的关系。
- covered_predicates: `创建了`、`由…创建`、`撰写了`
- positive_examples: `用户 -> 撰写了 -> 简历`、`简历 -> 由…创建 -> 用户`
- negative_examples: `用户 -> 创建了 -> 明天下午三点`、`努力 -> 由…创建 -> 用户`
- notes: 只用于明确的生产、创作、撰写关系。
- status: `enabled`
- `知识学习关系`
- definition: 表达主体与知识、技能、学科、语言等知识能力对象之间的认知、学习或兴趣关系。
- covered_predicates: `了解`、`学习`、`感兴趣于`
- positive_examples: `用户 -> 学习 -> 微积分`、`用户 -> 了解 -> 机器学习`、`用户 -> 感兴趣于 -> 心理学`
- negative_examples: `用户 -> 学习 -> 紧张`、`用户 -> 感兴趣于 -> 成功`
- notes: 关系对象应是 `知识能力` 类,而不是情绪、价值判断或抽象结果。
- status: `enabled`
- `偏好目标关系`
- definition: 表达主体对对象的稳定偏好、厌恶,或对具体明确目标的指向关系。
- covered_predicates: `偏好`、`不喜欢`、`想要`
- positive_examples: `用户 -> 偏好 -> 安静环境`、`用户 -> 不喜欢 -> 辛辣食物`、`用户 -> 想要 -> 通过雅思`
- negative_examples: `用户 -> 想要 -> 成功`、`用户 -> 偏好 -> 紧张`、`用户 -> 不喜欢 -> 努力就会有回报`
- notes: 这是高风险大类;`想要` 只用于具体、明确、用户特异的目标,不用于抽象愿望。
- status: `enabled`
- `职责责任关系`
- definition: 表达主体负责某项工作、职责、事务或领域的关系。
- covered_predicates: `负责`
- positive_examples: `张三 -> 负责 -> 招聘工作`、`王教授 -> 负责 -> 实验室项目`
- negative_examples: `张三 -> 负责 -> 紧张`、`用户 -> 负责 -> 成功`
- notes: 关系对象应是具体职责或事务,不应是情绪或抽象结果。
- status: `enabled`
- `沟通交互关系`
- definition: 表达两个主体之间发生沟通、交流或交互的关系。
- covered_predicates: `沟通于`
- positive_examples: `用户 -> 沟通于 -> 张三`、`导师 -> 沟通于 -> 学生`
- negative_examples: `用户 -> 沟通于 -> 紧张`、`图书馆 -> 沟通于 -> 微积分`
- notes: 两端通常都应是可作为交互主体的实体。
- status: `enabled`
- `提及关系`
- definition: 表达主体或文本明确提到某实体的关系。
- covered_predicates: `提到`
- positive_examples: `用户 -> 提到 -> 腾讯`、`文档 -> 提到 -> 张三`
- negative_examples: `用户 -> 提到 -> 努力`、`用户 -> 提到 -> 回报`、`用户 -> 提到 -> 紧张`
- notes: 受限大类;不用于保留泛化概念、抽象命题片段、情绪状态或仅在句面上出现但没有记忆价值的对象。
- status: `restricted`
- `一般关联关系`
- definition: 表达两个实体之间存在明确、稳定、值得保留,但当前无更精确谓词可用的关联关系。
- covered_predicates: `关联于`、`相关于`
- positive_examples: `项目 -> 关联于 -> 实验室`、`账号 -> 相关于 -> 平台`
- negative_examples: `努力 -> 相关于 -> 回报`、`用户 -> 关联于 -> 紧张`、`成功 -> 相关于 -> 意义`
- notes: 受限大类;不能作为失败兜底关系,不能用来连接抽象概念、口号式表达或无法成立的关系。
- status: `restricted`
===预定义关系类型===
只能使用以下中文关系类型。如果没有完全匹配的关系,请选择最接近的一项,不要发明新关系。
@@ -172,60 +398,90 @@ Primary statement to analyze:
- `感兴趣于`: 主体对某主题感兴趣
- `偏好`: 主体偏好某对象、方式或主题
- `不喜欢`: 主体不喜欢某对象、方式或主题
- `想要`: 主体想获得、达成或拥有某对象或结果
- `想要`: 主体想获得、达成或拥有具体、明确、用户特异且值得保留的对象或目标,不用于抽象结果、泛化愿望或口号式表达
- `负责`: 主体负责某项工作、职责或领域
- `沟通于`: 两个实体之间发生沟通或交流
- `拥有联系方式`: 实体具有某联系方式
- `拥有账号`: 实体具有某账号
- `标识为`: 实体由某标识符标识
- `使用语言`: 主体使用某语言
- `相关于`: 当存在明确联系但无更精确关系时使用的弱关系
- `相关于`: 当存在明确、稳定且具有记忆价值的联系但无更精确关系时使用的弱关系;不得用于泛化概念、抽象命题片段、口号式表达或仅为补全结构的联系
===Extraction Order===
{% if language == "zh" %}
按以下顺序执行:
0. 先检查 `has_unsolved_reference`;如果为 `true`,直接返回空结果。
1. 识别 `statement_text` 中值得抽取的稳定实体。
2. 判断这些实体之间是否存在可由预定义关系类型表达的有效关系
3. 最后补充实体字段和关系字段
1. 先做指代解析:用户自指统一替换为 `用户`;其他可稳定解析的代词或指示表达替换为具体指代实体。
2. 如果仍存在无法稳定解析的代词、指示词或省略主体,直接返回空结果。
3. 识别 `statement_text` 中值得抽取的稳定实体。
4. 判断这些实体之间是否存在可由预定义关系类型表达的有效关系。
5. 最后补充实体字段和关系字段。
不要让附加字段主导整个抽取过程。
{% else %}
Follow this order:
0. First check `has_unsolved_reference`; if it is `true`, immediately return the empty result.
1. Identify stable entities worth extracting from `statement_text`.
2. Determine whether any valid relations between those entities can be expressed using the predefined Chinese predicates.
3. Finally fill auxiliary entity and predicate fields.
1. Resolve references first: normalize user self-reference to `用户`; replace other stably resolvable pronouns or demonstratives with their resolved entity names.
2. If unresolved pronouns, demonstratives, or omitted subjects still remain, immediately return the empty result.
3. Identify stable entities worth extracting from `statement_text`.
4. Determine whether any valid relations between those entities can be expressed using the predefined Chinese predicates.
5. Finally fill auxiliary entity and predicate fields.
Do not let auxiliary fields drive the extraction process.
{% endif %}
===Guidelines===
**Reference Resolution:**
{% if language == "zh" %}
- 指代解析优先于实体抽取和关系抽取。
- 所有用户自指表达都必须规范成 `用户`,包括“我”“我的”“我自己”等。
- 对“他”“她”“它”“这个”“那个”“这家”“那家”“这里”“那里”等非用户自指表达,若上下文可稳定解析,则必须用解析后的具体实体名替换。
- 若非用户自指表达无法稳定解析,则整条跳过,不输出部分结果。
- 新出现的称呼、别名、昵称、产品名不是待消解代词,应保持原样。
{% else %}
- Reference resolution happens before entity or relation extraction.
- All user self-reference must be normalized to `用户`, including forms such as "I", "me", "my", and "myself".
- For non-user references such as "he", "she", "it", "this", "that", "this company", "that place", "here", or "there", if the context supports a stable resolution, replace them with the resolved entity name.
- If a non-user reference cannot be resolved stably, skip the entire statement and do not output partial results.
- Newly introduced names, aliases, nicknames, and product names are not pronouns to be resolved; keep them in their original form.
{% endif %}
**Entity Extraction:**
{% if language == "zh" %}
- 只有当某个名字、概念、对象、群体或地点在当前陈述中承担明确语义角色,或是理解有效关系所必需时,才创建实体。
- 不要因为表面上出现了名词、修饰词或短语,就机械地创建实体。
- 不要把完整命题、因果链、价值判断或口号式表达拆成多个低价值实体;例如“努力就会有回报”默认不应抽取出“努力”或“回报”作为实体。
- 普通时间表达默认不抽取为实体,包括日期、时刻、明天、下周、今晚八点等。
- 一次性动作短语默认不抽取为实体,例如“复习微积分”“去图书馆学习”“参观卢浮宫”。
- 不要为了表达一句带时间或地点的行动,而额外创造“任务”“计划”“事件”实体。
- 但如果动作明确把主体和某个稳定实体连接起来,可以保留该稳定实体,并抽取轻关系。例如“我去图书馆”“我去公司开会”“我去上课”“我去看演唱会”可以抽取 `前往`。
- 当句子只是在讨论一般道理、抽象规律、空泛结果或非个体化概念,而这些概念本身不构成可复用记忆时,不要创建实体。
- 如果句子表达的是用户的观点、信念、判断、愿望或目标倾向,但其中抽象对象不值得作为独立实体保留,则只保留相关高价值实体,不要再创建这些低价值对象实体,并把未抽取的抽象内容压缩写入相关实体的 `description`;例如“用户认为努力就会有回报”应只保留 `用户`,并在 `description` 中体现“用户认为努力就会有回报”。
- 对于未抽取的抽象实体、抽象命题片段或泛化结果,只要它们对理解该高价值实体有帮助,就应优先写入该实体的 `description`,而不是改用宽泛关系或补造弱实体。
- 当前阶段同样不要把情绪或心理状态抽成实体;如果句子里出现“紧张”“开心”“难过”“焦虑”“放松”等,应写入相关高价值实体的 `description`,而不是把它们标成 `知识能力`、`偏好习惯目标` 或其他近似类型。
- 如果陈述里有值得保留的实体信息,但没有有效关系,可以只返回 `entities`,并把 `triplets` 设为 `[]`。
- `name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。
- `name` 默认保持原文中的表面形式,但用户自指必须写成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名
- `description` 必须使用中文。
- `type` 和 `type_description` 必须使用上方预定义的中文标签与中文定义。
{% else %}
- Extract entities only when they play a clear semantic role in the statement or are necessary for understanding a valid relation.
- Do not mechanically create entities for every noun, modifier, or surface mention.
- Do not split generic propositions, causal slogans, or value judgments into low-value abstract entities. For example, "effort brings reward" should not create standalone entities for "effort" or "reward" by default.
- Do not extract ordinary time expressions as entities, including dates, timestamps, "tomorrow", "next week", or "8 PM tonight".
- Do not extract one-off action phrases as entities, such as "review calculus", "study in the library", or "visit the Louvre".
- Do not create extra "task", "plan", or "event" entities just to represent an action with time or location modifiers.
- But if an action clearly connects the subject to a stable entity, keep that stable entity and use a light relation. For example, statements like "I go to the library", "I go to the office", "I go to class", or "I go to a concert" can use `前往`.
- If the sentence is only about a generic principle, abstract outcome, or non-personalized concept that is not worth remembering on its own, do not create an entity for it.
- If a statement expresses the user's belief, judgment, opinion, wish, or goal tendency but the referenced abstract concepts are not worth keeping as standalone entities, keep only the relevant high-value entities, do not create those low-value concept entities, and compress the unextracted abstract content into the relevant entity `description`. For example, "the user believes effort brings reward" should keep only `用户` and reflect that belief in `description`.
- For abstract entities, proposition fragments, or generic outcomes that are not extracted, prefer writing them into the relevant retained entity's `description` when they help preserve the memory, instead of switching to a broad relation or inventing a weak entity.
- In the current stage, do not extract emotional or psychological states as entities. States such as nervousness, happiness, sadness, anxiety, or relief should be written into the relevant retained entity's `description` rather than mapped to `知识能力`, `偏好习惯目标`, or any other approximate type.
- If the statement contains entity-worthy content but no valid relation, it is acceptable to return `entities` with `triplets: []`.
- Keep `name` in its original surface form from the source text; exception: normalize user self-reference to `用户`.
- Keep `name` in its original surface form by default, but write user self-reference as `用户` and replace other stably resolvable references with their resolved entity names.
- `description` must be in English.
- `type` and `type_description` must use the predefined Chinese labels and Chinese definitions above.
{% endif %}
@@ -233,11 +489,11 @@ Do not let auxiliary fields drive the extraction process.
**Semantic Memory (`is_explicit_memory`):**
{% if language == "zh" %}
- 只有当实体明显属于语义知识记忆中的抽象概念时,才设为 `true`,例如概念、定义、理论、方法和知识主题
- 只有当实体明显属于语义知识记忆中的抽象知识对象时,才设为 `true`,例如概念、定义、理论、方法以及 `知识能力` 中的知识类对象
- 对人、组织、地点、具体物体以及大多数实例级实体,一律设为 `false`。
- 除非非常明确,否则默认设为 `false`。
{% else %}
- Use `true` only for abstract conceptual entities that belong in semantic knowledge memory, such as concepts, definitions, theories, methods, and knowledge topics.
- Use `true` only for abstract knowledge-oriented entities that belong in semantic knowledge memory, such as concepts, definitions, theories, methods, and knowledge-oriented members of `知识能力`.
- Use `false` for people, organizations, locations, concrete objects, and most instance-level entities.
- Default to `false` unless the entity is clearly an abstract knowledge concept.
{% endif %}
@@ -269,8 +525,14 @@ Do not let auxiliary fields drive the extraction process.
- 如果没有任何预定义关系适用,返回 `triplets: []`。
- 排除语气词、模糊情绪、孤立名词和缺乏明确关系结构的片段。
- 如果陈述不支持有效关系,不要强行构造 triplet。
- 不要为了保留一句抽象判断或泛因果命题,而强行构造“用户-拥有-努力”“努力-导致-回报”这类低价值 triplet。
- `提到` 不用于保留泛化概念、抽象命题片段、口号式表达或仅在句面上出现但无记忆价值的对象。
- `相关于` 不用于补救无法成立的关系,也不用于连接“努力”“回报”“成功”“意义”这类抽象概念。
- `想要` 只用于具体、明确、用户特异且值得保留的对象或目标;如果想要的内容过于抽象或空泛,不要抽取 `想要`,应改写进相关实体的 `description`。
- 不要为了保留情绪或心理状态而创建实体或弱关系;像“紧张”“开心”“难过”“焦虑”默认应写入相关实体的 `description`。
- 对于这类观点句,如果相关概念本身不值得保留,也不要只为了补全结构而额外创建对应实体;允许输出仅包含 `用户` 的 `entities` 和空的 `triplets`。
- 如果 `has_unsolved_reference` 是 `true`,不要抽取实体或 triplets。
- `subject_name` 和 `object_name` 默认保持原文中的表面形式,不要翻译;但用户自指要统一写成 `用户`。
- `subject_name` 和 `object_name` 默认保持原文中的表面形式,但用户自指必须写成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名
- `predicate_description` 必须直接复用对应 `predicate` 的中文定义。
- 不要把普通时间表达作为 triplet 的宾语。
- 不要为了表达一次性计划、安排、日程而强行构造关系。
@@ -282,8 +544,14 @@ Do not let auxiliary fields drive the extraction process.
- If no predefined relation fits, return `triplets: []`.
- Exclude fillers, vague emotions, standalone nouns, and fragments without a clear relational structure.
- If the statement does not support a valid relation, do not force a triplet.
- Do not force low-value triplets such as "user-has-effort" or "effort-causes-reward" just to preserve a generic causal belief or slogan-like proposition.
- Do not use `提到` to preserve generic concepts, proposition fragments, slogan-like expressions, or surface mentions that have no memory value.
- Do not use `相关于` as a rescue relation when no real relation exists, and do not connect abstract concepts such as "effort", "reward", "success", or "meaning" with it.
- Use `想要` only for concrete, specific, user-grounded objects or goals worth retaining; if the desired content is too abstract or generic, do not extract `想要` and instead rewrite it into the relevant entity `description`.
- Do not create entities or weak relations just to preserve emotional or psychological states; states such as nervousness, happiness, sadness, or anxiety should normally be written into the relevant retained entity `description`.
- For such opinion statements, if the referenced concepts are not worth keeping, do not create extra entities just to complete a structure; it is valid to return only the `用户` entity with empty `triplets`.
- If `has_unsolved_reference` is `true`, do not extract entities or triplets.
- Keep `subject_name` and `object_name` in their original surface form; exception: normalize user self-reference to `用户`.
- Keep `subject_name` and `object_name` in their original surface form by default, but write user self-reference as `用户` and replace other stably resolvable references with their resolved entity names.
- `predicate_description` must directly reuse the corresponding Chinese definition of `predicate`.
- Do not use ordinary time expressions as triplet objects.
- Do not force relations just to encode one-off plans, schedules, or actions.
@@ -320,6 +588,7 @@ Do not let auxiliary fields drive the extraction process.
1. `alias -> 别名属于 -> canonical entity`
2. `caller -> 使用称呼 -> alias`
- 如果施称方在句中明确出现且对语义重要,不要省略它。
- 在命名关系中,新出现的称呼、别名、昵称、产品名必须保持原样,不要被替换成其所指实体名。
{% else %}
- Distinguish between a naming fact and a naming act when the statement expresses both.
- If the statement says that some entity or group calls or addresses another entity by a name, and the caller is explicitly mentioned in `statement_text`, extract the caller as an entity.
@@ -328,6 +597,7 @@ Do not let auxiliary fields drive the extraction process.
1. `alias -> 别名属于 -> canonical entity`
2. `caller -> 使用称呼 -> alias`
- Do not drop the caller entity if it is explicitly stated and semantically important to the naming relation.
- In naming relations, newly introduced names, aliases, nicknames, or product names must stay in their original form rather than being replaced by their referent.
{% endif %}
**subject_name / object_name Consistency:**
@@ -352,29 +622,28 @@ Output:
{"subject_name": "用户", "subject_id": 0, "predicate": "居住于", "predicate_description": "人物居住在某地点", "object_name": "巴黎", "object_id": 1}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "居住在巴黎的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "巴黎", "type": "地点", "type_description": "具有地理空间意义的位置", "description": "用户居住的城市", "is_explicit_memory": false}
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "居住在巴黎的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "巴黎", "type": "地点设施", "type_description": "具有地理意义或功能性空间意义的位置与场所", "description": "用户居住的城市", "is_explicit_memory": false}
]
}
**示例 2**
Statement: "张明在腾讯工作,负责 AI 产品开发。"
Statement: "他在腾讯工作。"
Input condition: supporting context has already made it clear that “他” refers to “张明”.
Output:
{
"triplets": [
{"subject_name": "张明", "subject_id": 0, "predicate": "任职于", "predicate_description": "主体在某组织中工作或任职", "object_name": "腾讯", "object_id": 1},
{"subject_name": "张明", "subject_id": 0, "predicate": "负责", "predicate_description": "主体负责某项工作、职责或领域", "object_name": "AI 产品开发", "object_id": 2}
{"subject_name": "张明", "subject_id": 0, "predicate": "任职于", "predicate_description": "主体在某组织中工作或任职", "object_name": "腾讯", "object_id": 1}
],
"entities": [
{"entity_idx": 0, "name": "张明", "type": "人物", "type_description": "现实中的具体个人", "description": "在腾讯负责 AI 产品开发的人员", "is_explicit_memory": false},
{"entity_idx": 1, "name": "腾讯", "type": "组织", "type_description": "公司、机构、团队、社群等组织性主体", "description": "张明任职的公司", "is_explicit_memory": false},
{"entity_idx": 2, "name": "AI 产品开发", "type": "知识主题", "type_description": "主题、领域、方法、理论或知识概念", "description": "张明负责的工作方向", "is_explicit_memory": true}
{"entity_idx": 0, "name": "张明", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "在腾讯工作的人员", "is_explicit_memory": false},
{"entity_idx": 1, "name": "腾讯", "type": "组织", "type_description": "公司、机构、学校、实验室、团队、社群等组织性主体", "description": "张明任职的公司", "is_explicit_memory": false}
]
}
**示例 3**
Statement: "我明天下午三点去图书馆复习微积分。"
Statement: "我经常去图书馆学习微积分。"
Output:
{
@@ -383,9 +652,9 @@ Output:
{"subject_name": "用户", "subject_id": 0, "predicate": "学习", "predicate_description": "主体正在学习某知识主题或技能", "object_name": "微积分", "object_id": 2}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "提到自己安排的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "图书馆", "type": "设施", "type_description": "建筑、场馆、房间、实验室等功能性空间", "description": "用户提到要去的地点", "is_explicit_memory": false},
{"entity_idx": 2, "name": "微积分", "type": "知识主题", "type_description": "主题、领域、方法、理论或知识概念", "description": "用户提到的学习主题", "is_explicit_memory": true}
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "经常去图书馆学习微积分的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "图书馆", "type": "地点设施", "type_description": "具有地理意义或功能性空间意义的位置与场所。", "description": "用户经常前往学习的地点", "is_explicit_memory": false},
{"entity_idx": 2, "name": "微积分", "type": "知识能力", "type_description": "可学习、掌握、使用或讨论的知识主题、技能、学科或语言。", "description": "用户经常学习的主题", "is_explicit_memory": true}
]
}
@@ -409,9 +678,86 @@ Output:
{"subject_name": "我的朋友", "subject_id": 1, "predicate": "使用称呼", "predicate_description": "主体使用某个名字来称呼另一实体", "object_name": "山哥", "object_id": 2}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "现实中的具体个人", "description": "被朋友称作山哥的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "我的朋友", "type": "群体", "type_description": "未具名或泛指的一组人", "description": "使用山哥这一称呼的人群", "is_explicit_memory": false},
{"entity_idx": 2, "name": "山哥", "type": "称呼", "type_description": "用于指代或称呼实体的名字", "description": "朋友用来称呼用户的昵称", "is_explicit_memory": false}
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "被朋友称作山哥的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "我的朋友", "type": "群体", "type_description": "边界相对稳定、可被当作整体引用的一组人", "description": "使用山哥这一称呼的人群", "is_explicit_memory": false},
{"entity_idx": 2, "name": "山哥", "type": "称呼别名", "type_description": "用于指代或称呼实体的名字", "description": "朋友用来称呼用户的昵称", "is_explicit_memory": false}
]
}
**示例 6**
Statement: "我认为努力就会有回报。"
Output:
{
"triplets": [],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "认为努力就会有回报的说话者", "is_explicit_memory": false}
]
}
**示例 7**
Statement: "我想要成功。"
Output:
{
"triplets": [],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "想要成功的说话者", "is_explicit_memory": false}
]
}
**示例 8**
Statement: "我最近有点紧张,不过这很正常。"
Output:
{
"triplets": [],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "最近有些紧张并认为这很正常的说话者", "is_explicit_memory": false}
]
}
**示例 9**
Statement: "王教授是导师。"
Output:
{
"triplets": [
{"subject_name": "王教授", "subject_id": 0, "predicate": "担任角色", "predicate_description": "主体承担某个角色", "object_name": "导师", "object_id": 1}
],
"entities": [
{"entity_idx": 0, "name": "王教授", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "承担导师角色的具体个人", "is_explicit_memory": false},
{"entity_idx": 1, "name": "导师", "type": "角色职业", "type_description": "人物承担的社会角色、功能身份或职业身份。", "description": "王教授承担的角色身份", "is_explicit_memory": false}
]
}
**示例 10**
Statement: "我的GitHub账号用户名是chen4。"
Output:
{
"triplets": [
{"subject_name": "用户", "subject_id": 0, "predicate": "拥有账号", "predicate_description": "实体具有某账号", "object_name": "GitHub账号", "object_id": 1},
{"subject_name": "GitHub账号", "subject_id": 1, "predicate": "标识为", "predicate_description": "实体由某标识符标识", "object_name": "chen4", "object_id": 2}
],
"entities": [
{"entity_idx": 0, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "拥有该 GitHub 账号的说话者", "is_explicit_memory": false},
{"entity_idx": 1, "name": "GitHub账号", "type": "账号", "type_description": "账户、账号、用户档案类实体。", "description": "用户拥有的 GitHub 账号", "is_explicit_memory": false},
{"entity_idx": 2, "name": "chen4", "type": "标识符", "type_description": "用于识别实体的编号、ID、用户名、学号、工号等标识。", "description": "该 GitHub 账号对应的用户名标识", "is_explicit_memory": false}
]
}
**示例 11**
Statement: "机器人查票员和我沟通。"
Output:
{
"triplets": [
{"subject_name": "机器人查票员", "subject_id": 0, "predicate": "沟通于", "predicate_description": "两个实体之间发生沟通或交流", "object_name": "用户", "object_id": 1}
],
"entities": [
{"entity_idx": 0, "name": "机器人查票员", "type": "智能体", "type_description": "具有行动、交互或执行能力的非人主体如机器人、AI 或其他智慧体。", "description": "与用户发生沟通的机器人主体", "is_explicit_memory": false},
{"entity_idx": 1, "name": "用户", "type": "人物", "type_description": "可稳定指向、可被当作具体个体区分和归并的个人实体。", "description": "与机器人查票员沟通的说话者", "is_explicit_memory": false}
]
}
===End of Examples===
@@ -424,10 +770,11 @@ JSON 要求:
- 字符串内部引号必须转义为 `\"`
- 不要使用中文引号
- 字符串值中不要换行
- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,不要翻译;但用户自指必须规范成 `用户`
- `name`、`subject_name`、`object_name` 默认保持原文中的表面形式,但用户自指必须规范成 `用户`,可稳定解析的其他代词必须替换为具体指代实体名
- `description` 必须使用中文
- `type`、`predicate`、`type_description`、`predicate_description` 必须使用上方预定义的中文标签和中文说明
- 如果 `has_unsolved_reference` 是 `true`,输出必须是 `{"entities": [], "triplets": []}`
- 如果存在无法稳定解析的代词或指示表达,输出也必须是 `{"entities": [], "triplets": []}`
- 如果没有有效 triplet,返回 `"triplets": []`
{% else %}
JSON Requirements:
@@ -435,10 +782,11 @@ JSON 要求:
- Escape internal quotes using `\"`
- No Chinese quotation marks
- No line breaks inside string values
- `name`, `subject_name`, and `object_name` must keep the original surface form from the source text, except user self-reference which must be normalized to `用户`
- `name`, `subject_name`, and `object_name` keep their original surface forms by default, but user self-reference must be normalized to `用户` and other stably resolvable references must be replaced by their resolved entity names
- `description` must be in English
- `type`, `predicate`, `type_description`, and `predicate_description` must use the predefined Chinese labels and Chinese definitions above
- If `has_unsolved_reference` is `true`, the output must be `{"entities": [], "triplets": []}`
- If unresolved references still remain, the output must also be `{"entities": [], "triplets": []}`
- If no valid triplet exists, return `"triplets": []`
{% endif %}

View File

@@ -46,6 +46,12 @@ async def create_fulltext_indexes():
OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
""")
# 创建 AssistantPruned 剪枝文本全文索引
await connector.execute_query("""
CREATE FULLTEXT INDEX assistantPrunedFulltext IF NOT EXISTS FOR (p:AssistantPruned) ON EACH [p.text]
OPTIONS { indexConfig: { `fulltext.analyzer`: 'cjk' } }
""")
finally:
await connector.close()
@@ -135,6 +141,17 @@ async def create_vector_indexes():
`vector.similarity_function`: 'cosine'
}}
""")
# AssistantPruned text embedding index (optional, for semantic search on pruned hints)
await connector.execute_query("""
CREATE VECTOR INDEX assistant_pruned_embedding_index IF NOT EXISTS
FOR (p:AssistantPruned)
ON p.text_embedding
OPTIONS {indexConfig: {
`vector.dimensions`: 1024,
`vector.similarity_function`: 'cosine'
}}
""")
finally:
await connector.close()
@@ -179,6 +196,22 @@ async def create_unique_constraints():
"""
)
# AssistantOriginal.id unique
await connector.execute_query(
"""
CREATE CONSTRAINT assistant_original_id_unique IF NOT EXISTS
FOR (o:AssistantOriginal) REQUIRE o.id IS UNIQUE
"""
)
# AssistantPruned.id unique
await connector.execute_query(
"""
CREATE CONSTRAINT assistant_pruned_id_unique IF NOT EXISTS
FOR (p:AssistantPruned) REQUIRE p.id IS UNIQUE
"""
)
finally:
await connector.close()

View File

@@ -1363,154 +1363,60 @@ ORDER BY score DESC
LIMIT $limit
"""
SEARCH_STATEMENTS_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score
WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
RETURN s.id AS id,
s.statement AS statement,
s.end_user_id AS end_user_id,
s.chunk_id AS chunk_id,
s.created_at AS created_at,
s.expired_at AS expired_at,
s.valid_at AS valid_at,
properties(s)['invalid_at'] AS invalid_at,
c.id AS chunk_id_from_rel,
collect(DISTINCT e.id) AS entity_ids,
COALESCE(s.activation_value, s.importance_score, 0.5) AS activation_value,
COALESCE(s.importance_score, 0.5) AS importance_score,
s.last_access_time AS last_access_time,
COALESCE(s.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""
SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
WITH e, score
With collect({entity: e, score: score}) AS fulltextResults
# ── Assistant Pruning Nodes & Edges ──
OPTIONAL MATCH (ae:ExtractedEntity)
WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id)
AND ae.aliases IS NOT NULL
AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query))
WITH fulltextResults, collect(ae) AS aliasEntities
UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score:
CASE
WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0
WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9
ELSE 0.8
END
}]) AS row
WITH row.entity AS e, row.score AS score
WITH DISTINCT e, MAX(score) AS score
OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
RETURN e.id AS id,
e.name AS name,
e.end_user_id AS end_user_id,
e.entity_type AS entity_type,
e.created_at AS created_at,
e.expired_at AS expired_at,
e.entity_idx AS entity_idx,
e.statement_id AS statement_id,
e.description AS description,
e.aliases AS aliases,
e.name_embedding AS name_embedding,
e.connect_strength AS connect_strength,
collect(DISTINCT s.id) AS statement_ids,
collect(DISTINCT c.id) AS chunk_ids,
COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value,
COALESCE(e.importance_score, 0.5) AS importance_score,
e.last_access_time AS last_access_time,
COALESCE(e.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""
SEARCH_CHUNKS_BY_CONTENT = """
CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement)
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
RETURN c.id AS id,
c.end_user_id AS end_user_id,
c.content AS content,
c.dialog_id AS dialog_id,
c.sequence_number AS sequence_number,
collect(DISTINCT s.id) AS statement_ids,
collect(DISTINCT e.id) AS entity_ids,
COALESCE(c.activation_value, 0.5) AS activation_value,
c.last_access_time AS last_access_time,
COALESCE(c.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""
# MemorySummary keyword search using fulltext index
SEARCH_MEMORY_SUMMARIES_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("summariesFulltext", $query) YIELD node AS m, score
WHERE ($end_user_id IS NULL OR m.end_user_id = $end_user_id)
OPTIONAL MATCH (m)-[:DERIVED_FROM_STATEMENT]->(s:Statement)
RETURN m.id AS id,
m.name AS name,
m.end_user_id AS end_user_id,
m.dialog_id AS dialog_id,
m.chunk_ids AS chunk_ids,
m.content AS content,
m.created_at AS created_at,
COALESCE(m.activation_value, m.importance_score, 0.5) AS activation_value,
COALESCE(m.importance_score, 0.5) AS importance_score,
m.last_access_time AS last_access_time,
COALESCE(m.access_count, 0) AS access_count,
score
ORDER BY score DESC
LIMIT $limit
"""
# Community keyword search: matches name or summary via fulltext index
SEARCH_COMMUNITIES_BY_KEYWORD = """
CALL db.index.fulltext.queryNodes("communitiesFulltext", $query) YIELD node AS c, score
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
RETURN c.community_id AS id,
c.name AS name,
c.summary AS content,
c.core_entities AS core_entities,
c.member_count AS member_count,
c.end_user_id AS end_user_id,
c.updated_at AS updated_at,
score
ORDER BY score DESC
LIMIT $limit
"""
FULLTEXT_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_KEYWORD,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_NAME_OR_ALIAS,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_CONTENT,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_KEYWORD,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_KEYWORD,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUALS_BY_KEYWORD
ASSISTANT_ORIGINAL_NODE_SAVE = """
UNWIND $originals AS orig
MERGE (o:AssistantOriginal {id: orig.id})
SET o += {
end_user_id: orig.end_user_id,
run_id: orig.run_id,
dialog_id: orig.dialog_id,
pair_id: orig.pair_id,
text: orig.text,
created_at: orig.created_at,
expired_at: orig.expired_at
}
USER_ID_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_USER_ID,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_USER_ID,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_USER_ID,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_USER_ID,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_USER_ID,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_USER_ID
}
NODE_ID_QUERY_CYPHER_MAPPING = {
Neo4jNodeType.STATEMENT: SEARCH_STATEMENTS_BY_IDS,
Neo4jNodeType.EXTRACTEDENTITY: SEARCH_ENTITIES_BY_IDS,
Neo4jNodeType.CHUNK: SEARCH_CHUNKS_BY_IDS,
Neo4jNodeType.MEMORYSUMMARY: SEARCH_MEMORY_SUMMARIES_BY_IDS,
Neo4jNodeType.COMMUNITY: SEARCH_COMMUNITIES_BY_IDS,
Neo4jNodeType.PERCEPTUAL: SEARCH_PERCEPTUAL_BY_IDS
RETURN o.id AS uuid
"""
# Batch upsert of AssistantPruned nodes.
# Parameter: $pruneds, a list of maps; each map supplies the keys read below
# (id, end_user_id, run_id, dialog_id, pair_id, text, memory_type,
# text_embedding, created_at, expired_at).
# MERGE keys on `id` only, so re-running with the same id updates the existing
# node instead of creating a duplicate. `SET pr += {...}` overwrites just the
# listed properties and leaves any other property on the node untouched; per
# Neo4j SET += semantics a null value in the map removes that property.
# Returns one row per input item with the node id aliased as `uuid`.
ASSISTANT_PRUNED_NODE_SAVE = """
UNWIND $pruneds AS p
MERGE (pr:AssistantPruned {id: p.id})
SET pr += {
end_user_id: p.end_user_id,
run_id: p.run_id,
dialog_id: p.dialog_id,
pair_id: p.pair_id,
text: p.text,
memory_type: p.memory_type,
text_embedding: p.text_embedding,
created_at: p.created_at,
expired_at: p.expired_at
}
RETURN pr.id AS uuid
"""
# Batch upsert of PRUNED_TO relationships (AssistantOriginal -> AssistantPruned).
# Parameter: $edges, a list of maps with keys source, target, pair_id,
# end_user_id, run_id, created_at.
# Both endpoints are looked up with MATCH, so an edge whose source or target
# node does not exist produces no row and is silently skipped; callers can
# detect this by comparing the number of returned rows with len($edges).
# MERGE matches on the endpoint pair only, so at most one PRUNED_TO exists per
# (original, pruned) pair and a re-run overwrites its properties.
# NOTE: the `uuid` column is Neo4j's elementId(r), not an application-level id.
ASSISTANT_PRUNED_EDGE_SAVE = """
UNWIND $edges AS edge
MATCH (o:AssistantOriginal {id: edge.source})
MATCH (p:AssistantPruned {id: edge.target})
MERGE (o)-[r:PRUNED_TO]->(p)
SET r.pair_id = edge.pair_id,
r.end_user_id = edge.end_user_id,
r.run_id = edge.run_id,
r.created_at = edge.created_at
RETURN elementId(r) AS uuid
"""
# Batch upsert of BELONGS_TO_DIALOG relationships (AssistantOriginal -> Dialogue).
# Parameter: $edges, a list of maps with keys source, target, end_user_id,
# run_id, created_at.
# Both endpoints are looked up with MATCH, so an edge is silently skipped (no
# row returned) when either the AssistantOriginal or the Dialogue node with
# the given id does not exist.
# MERGE matches on the endpoint pair only, so a re-run updates the existing
# relationship's properties rather than creating a second one.
# NOTE: the `uuid` column is Neo4j's elementId(r), not an application-level id.
ASSISTANT_DIALOG_EDGE_SAVE = """
UNWIND $edges AS edge
MATCH (o:AssistantOriginal {id: edge.source})
MATCH (d:Dialogue {id: edge.target})
MERGE (o)-[r:BELONGS_TO_DIALOG]->(d)
SET r.end_user_id = edge.end_user_id,
r.run_id = edge.run_id,
r.created_at = edge.created_at
RETURN elementId(r) AS uuid
"""

View File

@@ -24,6 +24,10 @@ from app.core.memory.models.graph_models import (
EntityEntityEdge,
PerceptualNode,
PerceptualEdge,
AssistantOriginalNode,
AssistantPrunedNode,
AssistantPrunedEdge,
AssistantDialogEdge,
)
import logging
@@ -166,6 +170,10 @@ async def save_dialog_and_statements_to_neo4j(
statement_entity_edges: List[StatementEntityEdge],
perceptual_edges: List[PerceptualEdge],
connector: Neo4jConnector,
assistant_original_nodes: Optional[List[AssistantOriginalNode]] = None,
assistant_pruned_nodes: Optional[List[AssistantPrunedNode]] = None,
assistant_pruned_edges: Optional[List[AssistantPrunedEdge]] = None,
assistant_dialog_edges: Optional[List[AssistantDialogEdge]] = None,
) -> bool:
"""Save dialogue nodes, chunk nodes, statement nodes, entities, and all relationships to Neo4j using graph models.
@@ -368,6 +376,55 @@ async def save_dialog_and_statements_to_neo4j(
results['perceptual_chunk_edges'] = perceptual_edges_uuids
logger.info(f"Successfully saved {len(perceptual_edges_uuids)} perceptual-chunk edges to Neo4j")
# 8. Save assistant original nodes
if assistant_original_nodes:
from app.repositories.neo4j.cypher_queries import ASSISTANT_ORIGINAL_NODE_SAVE
original_data = [node.model_dump() for node in assistant_original_nodes]
result = await tx.run(ASSISTANT_ORIGINAL_NODE_SAVE, originals=original_data)
original_uuids = [record["uuid"] async for record in result]
results['assistant_originals'] = original_uuids
logger.info(f"Successfully saved {len(original_uuids)} assistant original nodes to Neo4j")
# 9. Save assistant pruned nodes
if assistant_pruned_nodes:
from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_NODE_SAVE
pruned_data = [node.model_dump() for node in assistant_pruned_nodes]
result = await tx.run(ASSISTANT_PRUNED_NODE_SAVE, pruneds=pruned_data)
pruned_uuids = [record["uuid"] async for record in result]
results['assistant_pruneds'] = pruned_uuids
logger.info(f"Successfully saved {len(pruned_uuids)} assistant pruned nodes to Neo4j")
# 10. Save PRUNED_TO edges (Original → Pruned)
if assistant_pruned_edges:
from app.repositories.neo4j.cypher_queries import ASSISTANT_PRUNED_EDGE_SAVE
edge_data = [{
"source": edge.source,
"target": edge.target,
"pair_id": edge.pair_id,
"end_user_id": edge.end_user_id,
"run_id": edge.run_id,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
} for edge in assistant_pruned_edges]
result = await tx.run(ASSISTANT_PRUNED_EDGE_SAVE, edges=edge_data)
pruned_edge_uuids = [record["uuid"] async for record in result]
results['assistant_pruned_edges'] = pruned_edge_uuids
logger.info(f"Successfully saved {len(pruned_edge_uuids)} PRUNED_TO edges to Neo4j")
# 11. Save BELONGS_TO_DIALOG edges (Original → Dialogue)
if assistant_dialog_edges:
from app.repositories.neo4j.cypher_queries import ASSISTANT_DIALOG_EDGE_SAVE
edge_data = [{
"source": edge.source,
"target": edge.target,
"end_user_id": edge.end_user_id,
"run_id": edge.run_id,
"created_at": edge.created_at.isoformat() if edge.created_at else None,
} for edge in assistant_dialog_edges]
result = await tx.run(ASSISTANT_DIALOG_EDGE_SAVE, edges=edge_data)
dialog_edge_uuids = [record["uuid"] async for record in result]
results['assistant_dialog_edges'] = dialog_edge_uuids
logger.info(f"Successfully saved {len(dialog_edge_uuids)} BELONGS_TO_DIALOG edges to Neo4j")
return results
try: