From d12ad213e04659f6c251440f5672f5a85ec5b733 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 19 Mar 2026 11:49:59 +0800 Subject: [PATCH] [changes] Optimize the semantic pruning judgment rules --- .../core/memory/agent/utils/get_dialogs.py | 2 +- api/app/core/memory/models/config_models.py | 30 +- .../data_preprocessing/data_pruning.py | 306 ++++++++++++++++-- .../prompt/prompts/extracat_Pruning.jinja2 | 93 ++++-- api/app/schemas/memory_config_schema.py | 2 +- api/app/services/memory_config_service.py | 23 +- api/app/services/pilot_run_service.py | 2 +- 7 files changed, 393 insertions(+), 65 deletions(-) diff --git a/api/app/core/memory/agent/utils/get_dialogs.py b/api/app/core/memory/agent/utils/get_dialogs.py index ea44d0a5..a301a5ef 100644 --- a/api/app/core/memory/agent/utils/get_dialogs.py +++ b/api/app/core/memory/agent/utils/get_dialogs.py @@ -84,7 +84,7 @@ async def get_chunked_dialogs( pruning_scene=memory_config.pruning_scene or "education", pruning_threshold=memory_config.pruning_threshold, scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, - ontology_classes=memory_config.ontology_classes, + ontology_class_infos=memory_config.ontology_class_infos, ) logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}") diff --git a/api/app/core/memory/models/config_models.py b/api/app/core/memory/models/config_models.py index c2d62ac1..5ed50b7f 100644 --- a/api/app/core/memory/models/config_models.py +++ b/api/app/core/memory/models/config_models.py @@ -6,6 +6,7 @@ of the memory system including LLM, chunking, pruning, and search. 
Classes: LLMConfig: Configuration for LLM client ChunkerConfig: Configuration for dialogue chunking + OntologyClassInfo: Single ontology class with name and description PruningConfig: Configuration for semantic pruning TemporalSearchParams: Parameters for temporal search queries """ @@ -50,30 +51,41 @@ class ChunkerConfig(BaseModel): min_characters_per_chunk: Optional[int] = Field(24, ge=0, description="The minimum number of characters in each chunk.") +class OntologyClassInfo(BaseModel): + """本体类型的名称与语义描述,用于剪枝提示词注入。 + + Attributes: + class_name: 本体类型名称(如"患者"、"课程") + class_description: 本体类型语义描述,告知 LLM 该类型在当前场景下的含义 + """ + class_name: str = Field(..., description="本体类型名称") + class_description: str = Field(default="", description="本体类型语义描述") + + class PruningConfig(BaseModel): """Configuration for semantic pruning of dialogue content. Attributes: pruning_switch: Enable or disable semantic pruning - pruning_scene: Scene name for pruning, either a built-in key - ('education', 'online_service', 'outbound') or a custom scene_name - from ontology_scene table + pruning_scene: Scene name for pruning from ontology_scene table pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal) - scene_id: Optional ontology scene UUID, used to load custom ontology classes - ontology_classes: List of class_name strings from ontology_class table, - injected into the prompt when pruning_scene is not a built-in scene + scene_id: Optional ontology scene UUID + ontology_class_infos: Full ontology class info (name + description) from + ontology_class table, injected into the pruning prompt to drive + scene-aware preservation decisions """ pruning_switch: bool = Field(False, description="Enable semantic pruning when True.") pruning_scene: str = Field( "education", - description="Scene for pruning: built-in key or custom scene_name from ontology_scene.", + description="Scene name from ontology_scene table.", ) pruning_threshold: float = Field( 0.5, ge=0.0, le=0.9, 
description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).") scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).") - ontology_classes: Optional[List[str]] = Field( - None, description="Class names from ontology_class table for custom scenes." + ontology_class_infos: List[OntologyClassInfo] = Field( + default_factory=list, + description="Full ontology class info (name + description) injected into pruning prompt." ) diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 28f7d8e0..28e2f96b 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -20,7 +20,6 @@ from pydantic import BaseModel, Field from app.core.memory.models.message_models import DialogData, ConversationMessage, ConversationContext from app.core.memory.models.config_models import PruningConfig -from app.core.memory.utils.config.config_utils import get_pruning_config from app.core.memory.utils.prompt.prompt_utils import prompt_env, log_prompt_rendering, log_template_rendering from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import ( SceneConfigRegistry, @@ -34,6 +33,8 @@ class DialogExtractionResponse(BaseModel): - is_related:对话与场景的相关性判定。 - times / ids / amounts / contacts / addresses / keywords:重要信息片段,用来在不相关对话中保留关键消息。 - preserve_keywords:情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。 + - scene_unrelated_snippets:与当前场景无关且无语义关联的消息片段(原文截取), + 用于高阈值阶段精准删除跨场景内容。 """ is_related: bool = Field(...) 
times: List[str] = Field(default_factory=list) @@ -43,6 +44,7 @@ class DialogExtractionResponse(BaseModel): addresses: List[str] = Field(default_factory=list) keywords: List[str] = Field(default_factory=list) preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留") + scene_unrelated_snippets: List[str] = Field(default_factory=list,description="与当前场景无关且无语义关联的消息原文片段,高阈值阶段用于精准删除跨场景内容") class MessageImportanceResponse(BaseModel): @@ -91,12 +93,14 @@ class SemanticPruner: # 加载统一填充词库 self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene) - # 本体类型列表(用于注入提示词,所有场景均支持) - self._ontology_classes = getattr(self.config, "ontology_classes", None) or [] + # 本体类型列表:直接使用 ontology_class_infos(name + description) + self._ontology_class_infos = getattr(self.config, "ontology_class_infos", None) or [] + # _ontology_classes 仅用于日志统计 + self._ontology_classes = [info.class_name for info in self._ontology_class_infos] self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}") - if self._ontology_classes: - self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}") + if self._ontology_class_infos: + self._log(f"[剪枝-初始化] 注入本体类型({len(self._ontology_class_infos)}个): {self._ontology_classes}") else: self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词") @@ -121,7 +125,8 @@ class SemanticPruner: 1. 空消息 2. 场景特定填充词库精确匹配 3. 常见寒暄精确匹配 - 4. 纯表情/标点 + 4. 组合寒暄模式(前缀+后缀组合,如"好的谢谢"、"同学你好"、"明白了") + 5. 
纯表情/标点 """ t = message.msg.strip() if not t: @@ -143,6 +148,55 @@ class SemanticPruner: if t in common_greetings: return True + # 组合寒暄模式:短消息(≤15字)且完全由寒暄成分构成 + # 策略:将消息拆分后,每个片段都能在填充词库或常见寒暄中找到,则整体为填充 + if len(t) <= 15: + # 确认+称呼/感谢组合,如"好的谢谢"、"明白了"、"知道了谢谢" + _confirm_prefixes = {"好的", "好", "嗯", "嗯嗯", "哦", "明白", "明白了", "知道了", "了解", "收到", "没问题"} + _thanks_suffixes = {"谢谢", "谢谢你", "谢谢您", "多谢", "感谢", "谢了"} + _greeting_suffixes = {"你好", "您好", "老师好", "同学好", "大家好"} + _greeting_prefixes = {"同学", "老师", "您好", "你好"} + _close_patterns = { + "没有了", "没事了", "没问题了", "好了", "行了", "可以了", + "不用了", "不需要了", "就这样", "就这样吧", "那就这样", + } + _polite_responses = { + "不客气", "不用谢", "没关系", "没事", "应该的", "这是我应该做的", + } + + # 规则1:确认词 + 感谢词(如"好的谢谢"、"嗯谢谢") + for cp in _confirm_prefixes: + for ts in _thanks_suffixes: + if t == cp + ts or t == cp + "," + ts or t == cp + "," + ts: + return True + + # Rule 2: address prefix + greeting (e.g. "同学你好", "老师好"); the WHOLE message must match so that substantive text merely ending in "好" is not dropped as filler + for gp in _greeting_prefixes: + for gs in _greeting_suffixes: + if t.startswith(gp) and t[len(gp):].lstrip(",,、 ") in (gs, "好"): + return True + + # 规则3:结束语 + 感谢(如"没有了,谢谢老师"、"没有了谢谢") + for cp in _close_patterns: + if t.startswith(cp): + remainder = t[len(cp):].lstrip(",,、 ") + if not remainder or any(remainder.startswith(ts) for ts in _thanks_suffixes): + return True + + # 规则4:礼貌回应(如"不客气,祝你考试顺利"——前缀是礼貌词,后半是祝福套话) + for pr in _polite_responses: + if t.startswith(pr): + remainder = t[len(pr):].lstrip(",,、 ") + # 后半是祝福/套话(不含实质信息) + if not remainder or re.match(r"^(祝|希望|期待|加油|顺利|好好|保重)", remainder): + return True + + # 规则5:纯确认词加"了"后缀(如"明白了"、"知道了"、"好了") + _confirm_base = {"明白", "知道", "了解", "收到", "好", "行", "可以", "没问题"} + for cb in _confirm_base: + if t == cb + "了" or t == cb + "了。" or t == cb + "了!": + return True + # 检查是否为纯表情符号(方括号包裹) if re.fullmatch(r"(\[[^\]]+\])+", t): return True @@ -331,13 +385,13 @@ class SemanticPruner: rendered = self.template.render( pruning_scene=self.config.pruning_scene, - ontology_classes=self._ontology_classes, + ontology_class_infos=self._ontology_class_infos, 
dialog_text=dialog_text, language=self.language ) log_template_rendering("extracat_Pruning.jinja2", { "pruning_scene": self.config.pruning_scene, - "ontology_classes_count": len(self._ontology_classes), + "ontology_class_infos_count": len(self._ontology_class_infos), "language": self.language }) log_prompt_rendering("pruning-extract", rendered) @@ -377,6 +431,189 @@ class SemanticPruner: ) return fallback_response + def _get_pruning_mode(self) -> str: + """根据 pruning_threshold 返回当前剪枝阶段。 + + - 低阈值 [0.0, 0.3):conservative 只删填充,保留所有实质内容 + - 中阈值 [0.3, 0.6):semantic 保留场景相关 + 有语义关联的内容,删除无关联内容 + - 高阈值 [0.6, 0.9]:strict 只保留场景相关内容,跨场景内容可被删除 + """ + t = float(self.config.pruning_threshold) + if t < 0.3: + return "conservative" + elif t < 0.6: + return "semantic" + else: + return "strict" + + def _apply_related_dialog_pruning( + self, + msgs: List[ConversationMessage], + extraction: "DialogExtractionResponse", + dialog_label: str, + pruning_mode: str, + ) -> List[ConversationMessage]: + """相关对话统一剪枝入口,消除 prune_dialog / prune_dataset 中的重复逻辑。 + + - conservative:只删填充 + - semantic / strict:场景感知剪枝 + """ + if pruning_mode == "conservative": + preserve_tokens = self._build_preserve_tokens(extraction) + return self._prune_fillers_only(msgs, preserve_tokens, dialog_label) + else: + return self._prune_with_scene_filter(msgs, extraction, dialog_label, pruning_mode) + + def _prune_fillers_only( + self, + msgs: List[ConversationMessage], + preserve_tokens: List[str], + dialog_label: str, + ) -> List[ConversationMessage]: + """相关对话专用:只删填充消息,LLM 保护消息和实质内容一律保留。 + + 不受 pruning_threshold 约束,删多少算多少(填充有多少删多少)。 + 至少保留 1 条消息。 + 注意:填充检测优先于 preserve_tokens 保护——填充消息本身无信息价值, + 即使 LLM 误将其关键词放入 preserve_tokens 也应删除。 + """ + to_delete_ids: set = set() + for m in msgs: + # 填充检测优先:先判断是否为填充,再看 LLM 保护 + if self._is_filler_message(m): + to_delete_ids.add(id(m)) + self._log(f" [填充] '{m.msg[:40]}' → 删除") + continue + if self._msg_matches_tokens(m, preserve_tokens): + self._log(f" [保护] '{m.msg[:40]}' → LLM保护,跳过") 
+ + kept = [m for m in msgs if id(m) not in to_delete_ids] + if not kept and msgs: + kept = [msgs[0]] + + deleted = len(msgs) - len(kept) + self._log( + f"[剪枝-相关] {dialog_label} 总消息={len(msgs)} " + f"填充删除={deleted} 保留={len(kept)}" + ) + return kept + + def _prune_with_scene_filter( + self, + msgs: List[ConversationMessage], + extraction: "DialogExtractionResponse", + dialog_label: str, + mode: str, + ) -> List[ConversationMessage]: + """场景感知剪枝,供 semantic / strict 两个阈值档位调用。 + + 本函数体现剪枝系统的三层递进逻辑: + + 第一层(conservative,阈值 < 0.3): + 不进入本函数,由 _prune_fillers_only 处理。 + 保留标准:只问"有没有信息量",填充消息(嗯/好的/哈哈等)删除,其余一律保留。 + + 第二层(semantic,阈值 [0.3, 0.6)): + 保留标准:内容价值优先,场景相关性是参考而非唯一标准。 + - 填充消息 → 删除(最高优先级) + - 场景相关消息 → 保留 + - 场景无关消息 → 有两次豁免机会: + 1. 命中 scene_preserve_tokens(LLM 标记的关键词/时间/金额等)→ 保留 + 2. 含情感词(感觉/压力/开心等)→ 保留(情感内容有记忆价值) + 3. 两次豁免均未命中 → 删除 + + 第三层(strict,阈值 [0.6, 0.9]): + 保留标准:场景相关性优先,豁免权极度收窄。 + - 填充消息 → 删除(最高优先级) + - 场景相关消息 → 保留 + - 场景无关消息 → 直接删除,仅保留一个例外: + LLM 同时将该消息放入 preserve_keywords(自相矛盾时以情感标记为准)→ 保留 + 注意:strict 模式下情感词兜底不再生效,场景相关性是最终裁决标准。 + + 至少保留 1 条消息(兜底取第一条)。 + """ + # strict 模式收窄保护范围:只保护结构化关键信息(时间/编号/金额/联系方式/地址), + # 不保护 keywords / preserve_keywords,让场景过滤能删掉更多内容。 + # semantic 模式完整保护:包含 LLM 抽取的所有重要片段(含 keywords 和 preserve_keywords)。 + if mode == "strict": + scene_preserve_tokens = ( + extraction.times + extraction.ids + extraction.amounts + + extraction.contacts + extraction.addresses + ) + else: + scene_preserve_tokens = self._build_preserve_tokens(extraction) + + unrelated_snippets = extraction.scene_unrelated_snippets or [] + + to_delete_ids: set = set() + for m in msgs: + msg_text = m.msg.strip() + + # 第一优先级:填充消息无论模式直接删除,不参与后续场景判断 + if self._is_filler_message(m): + to_delete_ids.add(id(m)) + self._log(f" [填充] '{msg_text[:40]}' → 删除") + continue + + # 双向包含匹配:处理 LLM 返回片段与原始消息文本长度不完全一致的情况 + is_scene_unrelated = any( + snip and (snip in msg_text or msg_text in snip) + for snip in unrelated_snippets + ) + + if is_scene_unrelated: + if mode == "strict": + # strict:场景无关 
→ 删除 + # 唯一例外:LLM 同时将该消息标记为 preserve_keywords, + # 说明 LLM 自相矛盾(既认为场景无关又认为值得保留),以 preserve_keywords 为准 + if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords): + self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留") + else: + to_delete_ids.add(id(m)) + self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除") + elif mode == "semantic": + # semantic:场景无关但有内容价值 → 保留 + # 豁免第一层:命中 scene_preserve_tokens(关键词/结构化信息保护) + if self._msg_matches_tokens(m, scene_preserve_tokens): + self._log(f" [保护] '{msg_text[:40]}' → 场景关键词保护,保留") + else: + # 豁免第二层:含情感词,认为有情境记忆价值,即使场景无关也保留 + has_contextual_emotion = any( + word in msg_text + for word in ["感觉", "觉得", "心情", "开心", "难过", "高兴", "沮丧", + "喜欢", "讨厌", "爱", "恨", "担心", "害怕", "兴奋", + "压力", "累", "疲惫", "烦", "焦虑", "委屈", "感动"] + ) + if not has_contextual_emotion: + to_delete_ids.add(id(m)) + self._log(f" [场景无关-语义] '{msg_text[:40]}' → 删除(无情感关联)") + else: + self._log(f" [场景关联-保留] '{msg_text[:40]}' → 有情感关联,保留") + else: + # 不在 scene_unrelated_snippets 中 → 场景相关,直接保留 + if self._msg_matches_tokens(m, scene_preserve_tokens): + self._log(f" [保护] '{msg_text[:40]}' → LLM保护,跳过") + # else: 普通场景相关消息,保留,不输出日志 + + kept = [m for m in msgs if id(m) not in to_delete_ids] + if not kept and msgs: + kept = [msgs[0]] + + deleted = len(msgs) - len(kept) + self._log( + f"[剪枝-{mode}] {dialog_label} 总消息={len(msgs)} " + f"删除={deleted} 保留={len(kept)}" + ) + return kept + + def _build_preserve_tokens(self, extraction: "DialogExtractionResponse") -> List[str]: + """统一构建 preserve_tokens,合并 LLM 抽取的所有重要片段。""" + return ( + extraction.times + extraction.ids + extraction.amounts + + extraction.contacts + extraction.addresses + extraction.keywords + + extraction.preserve_keywords + ) + def _msg_matches_tokens(self, message: ConversationMessage, tokens: List[str]) -> bool: """判断消息是否包含任意抽取到的重要片段。""" if not tokens: @@ -397,16 +634,18 @@ class SemanticPruner: proportion = float(self.config.pruning_threshold) extraction = await 
self._extract_dialog_important(dialog.content) + pruning_mode = self._get_pruning_mode() + self._log(f"[剪枝-模式] 阈值={proportion} → 模式={pruning_mode}") + if extraction.is_related: - # 相关对话不剪枝 + kept = self._apply_related_dialog_pruning( + dialog.context.msgs, extraction, f"对话ID={dialog.id}", pruning_mode + ) + dialog.context = ConversationContext(msgs=kept) return dialog # 在不相关对话中,LLM 已通过 preserve_tokens 标记需要保护的内容 - preserve_tokens = ( - extraction.times + extraction.ids + extraction.amounts + - extraction.contacts + extraction.addresses + extraction.keywords + - extraction.preserve_keywords - ) + preserve_tokens = self._build_preserve_tokens(extraction) msgs = dialog.context.msgs # 分类:填充 / 其他可删(LLM保护消息通过不加入任何桶来隐式保护) @@ -482,6 +721,9 @@ class SemanticPruner: f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断" ) + pruning_mode = self._get_pruning_mode() + self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}") + result: List[DialogData] = [] total_original_msgs = 0 total_deleted_msgs = 0 @@ -505,12 +747,19 @@ class SemanticPruner: original_count = len(msgs) total_original_msgs += original_count + # 相关对话:根据阶段决定处理力度 + if extraction.is_related: + kept = self._apply_related_dialog_pruning( + msgs, extraction, f"对话 {d_idx+1}", pruning_mode + ) + deleted_count = original_count - len(kept) + total_deleted_msgs += deleted_count + dd.context.msgs = kept + result.append(dd) + continue + # 从 LLM 抽取结果中获取所有需要保留的 token - preserve_tokens = ( - extraction.times + extraction.ids + extraction.amounts + - extraction.contacts + extraction.addresses + extraction.keywords + - extraction.preserve_keywords # 情绪/兴趣/爱好关键词 - ) + preserve_tokens = self._build_preserve_tokens(extraction) # 判断是否需要详细日志 should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog @@ -605,6 +854,18 @@ class SemanticPruner: self._log(f"[剪枝-数据集] 剩余对话数={len(result)}") + # 补充统计日志(供 
_parse_logs_to_structured 正则解析) + related_count = sum(1 for ex in extraction_results if ex.is_related) + unrelated_count = len(dialogs) - related_count + related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related] + unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related] + self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}") + self._log( + f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;" + f"不相关对话:第[{', '.join(unrelated_indices)}]段" + ) + self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条") + # 保存日志 try: from app.core.config import settings @@ -686,7 +947,7 @@ class SemanticPruner: re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)") re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)") re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段") - re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+)\s+分配删除=(\d+)\s+实删=(\d+)\s+保留=(\d+)") + re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b") re_total_del = re.compile(r"总删除\s+(\d+)\s+条") re_remaining = re.compile(r"剩余对话数=(\d+)") @@ -720,9 +981,8 @@ class SemanticPruner: dialogs.append({ "index": parse_int(m.group(1)), "total_messages": parse_int(m.group(2)), - "quota_delete": parse_int(m.group(3)), - "actual_deleted": parse_int(m.group(4)), - "kept": parse_int(m.group(5)), + "deleted": parse_int(m.group(3)), + "kept": parse_int(m.group(4)), }) continue diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 index e204b7f9..3061e663 100644 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 @@ -1,6 +1,7 @@ {# 对话级抽取与相关性判定模板(用于剪枝加速) - 输入:pruning_scene, ontology_classes, dialog_text, language + 输入:pruning_scene, ontology_class_infos, dialog_text, language + - ontology_class_infos: List[{class_name: str, 
class_description: str}] 输出:严格 JSON(不要包含任何多余文本),字段: - is_related: bool,是否与所选场景相关 - times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等) @@ -18,20 +19,16 @@ #} {# ── 确定场景说明 ── #} -{% if ontology_classes and ontology_classes | length > 0 %} +{% if ontology_class_infos and ontology_class_infos | length > 0 %} {% if language == 'en' %} - {% set custom_types_str = ontology_classes | join(', ') %} - {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %} + {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %} {% else %} - {% set custom_types_str = ontology_classes | join('、') %} - {% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} + {% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %} {% endif %} {% else %} {% if language == 'en' %} - {% set custom_types_str = '' %} {% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} {% else %} - {% set custom_types_str = '' %} {% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} {% endif %} {% endif %} @@ -42,8 +39,17 @@ 2. 
从对话中抽取所有需要保留的重要信息片段。 场景说明:{{ instruction }} -{% if custom_types_str %} -重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。 + +{% if ontology_class_infos and ontology_class_infos | length > 0 %} +【本场景实体类型定义】 +以下实体类型定义了本场景中哪些内容是重要的。 +凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段: + +{% for info in ontology_class_infos %} +- {{ info.class_name }}:{{ info.class_description }} +{% endfor %} + +重要提示:只要对话中出现与上述任意实体类型相关的内容,即判定为相关(is_related=true)。 {% endif %} --- @@ -51,13 +57,40 @@ 以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段: - 时间信息:日期、时间点、时间段、有效期 → times 字段 - 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段 -- 金额信息:价格、费用、金额(含货币符号或单位) → amounts 字段 +- 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段) - 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段 - 地址信息:地点、地址、位置 → addresses 字段 -- 场景关键词:与场景强相关的专业术语、事件名称 → keywords 字段 +- 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段) - **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段 - **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段 -- **个人观点与态度**:对某事物的明确看法、评价、立场 → preserve_keywords 字段 +- **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了")→ preserve_keywords 字段 +- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段 + +【场景无关内容标记】 +请从对话中识别出与当前场景({{ pruning_scene }})**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。 +判断标准: +- 与场景实体类型完全无关 +- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联) +- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等) +注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。 + +**重要:scene_unrelated_snippets 必须认真填写,不要遗漏任何场景无关内容。** +如果对话中存在与场景无关的内容,必须将其原文片段提取出来;如果确实不存在,则返回空数组 [],不要为了填充而编造。 + +示例(场景=在线教育): +- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets +- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets +- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets +- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets +- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets +- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets + +示例(场景=情感陪伴): +- "我最近心情很差,跟室友闹矛盾了" → 
与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets +- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets +- "期末考试12月25日,3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets +- "我上次高数作业87分,这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets +- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets 【可以删除的内容】 以下类型的内容属于低价值信息,可以在剪枝时删除: @@ -88,7 +121,8 @@ "contacts": [...], "addresses": [...], "keywords": [...], - "preserve_keywords": [...] + "preserve_keywords": [...], + "scene_unrelated_snippets": [...] } {% else %} You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks: @@ -96,8 +130,17 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue 2. Extract all important information fragments that must be preserved. Scenario Description: {{ instruction }} -{% if custom_types_str %} -Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). + +{% if ontology_class_infos and ontology_class_infos | length > 0 %} +[Scene Entity Type Definitions] +The following entity types define what content is important in this scene. +Content related to ANY of these types must be preserved and extracted into the keywords field: + +{% for info in ontology_class_infos %} +- {{ info.class_name }}: {{ info.class_description }} +{% endfor %} + +Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true). {% endif %} --- @@ -105,13 +148,22 @@ Important: If the dialogue contains content related to any of the entity types a The following types of content must always be preserved regardless of scene relevance. 
Extract their keywords/phrases into the corresponding fields: - Time information: dates, time points, durations, expiry dates → times field - ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field -- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field +- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here) - Contact information: phone numbers, emails, WeChat, QQ → contacts field - Address information: locations, addresses, places → addresses field -- Scene keywords: professional terms and event names strongly related to the scene → keywords field +- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here) - **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. 
→ preserve_keywords field - **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field -- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field +- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field +- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords + +[Scene-Unrelated Content Marking] +Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field. +Criteria: +- Completely unrelated to the scene's entity types +- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated) +- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene) +Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully. [CAN BE DELETED] The following types of content are low-value and can be removed during pruning: @@ -141,6 +193,7 @@ Output strict JSON only (fixed keys, order doesn't matter): "contacts": [...], "addresses": [...], "keywords": [...], - "preserve_keywords": [...] + "preserve_keywords": [...], + "scene_unrelated_snippets": [...] 
} {% endif %} diff --git a/api/app/schemas/memory_config_schema.py b/api/app/schemas/memory_config_schema.py index 0c359d70..8d7490fe 100644 --- a/api/app/schemas/memory_config_schema.py +++ b/api/app/schemas/memory_config_schema.py @@ -417,7 +417,7 @@ class MemoryConfig: # Ontology scene association scene_id: Optional[UUID] = None - ontology_classes: Optional[list] = field(default=None) + ontology_class_infos: list[dict] = field(default_factory=list) def __post_init__(self): """Validate configuration after initialization.""" diff --git a/api/app/services/memory_config_service.py b/api/app/services/memory_config_service.py index 4d67673f..a3751c07 100644 --- a/api/app/services/memory_config_service.py +++ b/api/app/services/memory_config_service.py @@ -107,28 +107,29 @@ def _validate_config_id(config_id, db: Session = None): ) -def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]: - """从 ontology_class 表加载场景类型名称列表,用于注入提示词。 +def _load_ontology_class_infos(db: Session, scene_id) -> list: + """从 ontology_class 表加载完整本体类型信息(name + description),用于注入剪枝提示词。 Args: db: 数据库会话 scene_id: 本体场景 UUID - pruning_scene: 语义剪枝场景名称(保留参数,暂未使用) Returns: - class_name 字符串列表,或 None(无数据时) + [{"class_name": ..., "class_description": ...}, ...] 
或空列表 """ if not scene_id: - return None + return [] try: from app.repositories.ontology_class_repository import OntologyClassRepository repo = OntologyClassRepository(db) classes = repo.get_classes_by_scene(scene_id) - names = [c.class_name for c in classes if c.class_name] - return names if names else None + return [ + {"class_name": c.class_name, "class_description": c.class_description or ""} + for c in classes if c.class_name + ] except Exception as e: - logger.warning(f"Failed to load ontology classes for scene_id={scene_id}: {e}") - return None + logger.warning(f"Failed to load ontology class infos for scene_id={scene_id}: {e}") + return [] class MemoryConfigService: @@ -383,7 +384,7 @@ class MemoryConfigService: pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5, # Ontology scene association scene_id=memory_config.scene_id, - ontology_classes=_load_ontology_classes(self.db, memory_config.scene_id, memory_config.pruning_scene), + ontology_class_infos=_load_ontology_class_infos(self.db, memory_config.scene_id), ) elapsed_ms = (time.time() - start_time) * 1000 @@ -550,11 +551,13 @@ class MemoryConfigService: - pruning_switch: bool - pruning_scene: str - pruning_threshold: float + - ontology_class_infos: list of {class_name, class_description} dicts """ return { "pruning_switch": memory_config.pruning_enabled, "pruning_scene": memory_config.pruning_scene, "pruning_threshold": memory_config.pruning_threshold, + "ontology_class_infos": memory_config.ontology_class_infos or [], } def get_ontology_types(self, memory_config: MemoryConfig): diff --git a/api/app/services/pilot_run_service.py b/api/app/services/pilot_run_service.py index b63bc0db..b473140d 100644 --- a/api/app/services/pilot_run_service.py +++ b/api/app/services/pilot_run_service.py @@ -121,7 +121,7 @@ async def run_pilot_extraction( "pruning_scene": memory_config.pruning_scene, "pruning_threshold": memory_config.pruning_threshold, 
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, - "ontology_classes": memory_config.ontology_classes, + "ontology_class_infos": memory_config.ontology_classes, } config = PruningConfig(**pruning_config_dict)