From d12ad213e04659f6c251440f5672f5a85ec5b733 Mon Sep 17 00:00:00 2001
From: lanceyq <1982376970@qq.com>
Date: Thu, 19 Mar 2026 11:49:59 +0800
Subject: [PATCH] [changes] Optimize the semantic pruning judgment rules

---
 .../core/memory/agent/utils/get_dialogs.py    |   2 +-
 api/app/core/memory/models/config_models.py   |  30 +-
 .../data_preprocessing/data_pruning.py        | 306 ++++++++++++++++--
 .../prompt/prompts/extracat_Pruning.jinja2    |  93 ++++--
 api/app/schemas/memory_config_schema.py       |   2 +-
 api/app/services/memory_config_service.py     |  23 +-
 api/app/services/pilot_run_service.py         |   2 +-
 7 files changed, 393 insertions(+), 65 deletions(-)

diff --git a/api/app/core/memory/agent/utils/get_dialogs.py b/api/app/core/memory/agent/utils/get_dialogs.py
index ea44d0a5..a301a5ef 100644
--- a/api/app/core/memory/agent/utils/get_dialogs.py
+++ b/api/app/core/memory/agent/utils/get_dialogs.py
@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
                             pruning_scene=memory_config.pruning_scene or "education",
                             pruning_threshold=memory_config.pruning_threshold,
                             scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
-                            ontology_classes=memory_config.ontology_classes,
+                            ontology_class_infos=memory_config.ontology_class_infos,
                         )
                         logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")
                         
diff --git a/api/app/core/memory/models/config_models.py b/api/app/core/memory/models/config_models.py
index c2d62ac1..5ed50b7f 100644
--- a/api/app/core/memory/models/config_models.py
+++ b/api/app/core/memory/models/config_models.py
@@ -6,6 +6,7 @@ of the memory system including LLM, chunking, pruning, and search.
 Classes:
     LLMConfig: Configuration for LLM client
     ChunkerConfig: Configuration for dialogue chunking
+    OntologyClassInfo: Single ontology class with name and description
     PruningConfig: Configuration for semantic pruning
     TemporalSearchParams: Parameters for temporal search queries
 """
@@ -50,30 +51,41 @@ class ChunkerConfig(BaseModel):
     min_characters_per_chunk: Optional[int] = Field(24, ge=0, description="The minimum number of characters in each chunk.")
 
 
+class OntologyClassInfo(BaseModel):
+    """本体类型的名称与语义描述，用于剪枝提示词注入。
+
+    Attributes:
+        class_name: 本体类型名称（如"患者"、"课程"）
+        class_description: 本体类型语义描述，告知 LLM 该类型在当前场景下的含义
+    """
+    class_name: str = Field(..., description="本体类型名称")
+    class_description: str = Field(default="", description="本体类型语义描述")
+
+
 class PruningConfig(BaseModel):
     """Configuration for semantic pruning of dialogue content.
 
     Attributes:
         pruning_switch: Enable or disable semantic pruning
-        pruning_scene: Scene name for pruning, either a built-in key
-            ('education', 'online_service', 'outbound') or a custom scene_name
-            from ontology_scene table
+        pruning_scene: Scene name for pruning from ontology_scene table
         pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal)
-        scene_id: Optional ontology scene UUID, used to load custom ontology classes
-        ontology_classes: List of class_name strings from ontology_class table,
-            injected into the prompt when pruning_scene is not a built-in scene
+        scene_id: Optional ontology scene UUID
+        ontology_class_infos: Full ontology class info (name + description) from
+            ontology_class table, injected into the pruning prompt to drive
+            scene-aware preservation decisions
     """
     pruning_switch: bool = Field(False, description="Enable semantic pruning when True.")
     pruning_scene: str = Field(
         "education",
-        description="Scene for pruning: built-in key or custom scene_name from ontology_scene.",
+        description="Scene name from ontology_scene table.",
     )
     pruning_threshold: float = Field(
         0.5, ge=0.0, le=0.9,
         description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).")
     scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).")
-    ontology_classes: Optional[List[str]] = Field(
-        None, description="Class names from ontology_class table for custom scenes."
+    ontology_class_infos: List[OntologyClassInfo] = Field(
+        default_factory=list,
+        description="Full ontology class info (name + description) injected into pruning prompt."
     )
 
 
diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
index 28f7d8e0..28e2f96b 100644
--- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
+++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
@@ -20,7 +20,6 @@ from pydantic import BaseModel, Field
 
 from app.core.memory.models.message_models import DialogData, ConversationMessage, ConversationContext
 from app.core.memory.models.config_models import PruningConfig
-from app.core.memory.utils.config.config_utils import get_pruning_config
 from app.core.memory.utils.prompt.prompt_utils import prompt_env, log_prompt_rendering, log_template_rendering
 from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import (
     SceneConfigRegistry,
@@ -34,6 +33,8 @@ class DialogExtractionResponse(BaseModel):
     - is_related：对话与场景的相关性判定。
     - times / ids / amounts / contacts / addresses / keywords：重要信息片段，用来在不相关对话中保留关键消息。
     - preserve_keywords：情绪/兴趣/爱好/个人观点相关词，包含这些词的消息必须强制保留。
+    - scene_unrelated_snippets：与当前场景无关且无语义关联的消息片段（原文截取），
+      用于高阈值阶段精准删除跨场景内容。
     """
     is_related: bool = Field(...)
     times: List[str] = Field(default_factory=list)
@@ -43,6 +44,7 @@ class DialogExtractionResponse(BaseModel):
     addresses: List[str] = Field(default_factory=list)
     keywords: List[str] = Field(default_factory=list)
     preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词，包含这些词的消息强制保留")
+    scene_unrelated_snippets: List[str] = Field(default_factory=list,description="与当前场景无关且无语义关联的消息原文片段，高阈值阶段用于精准删除跨场景内容")
 
 
 class MessageImportanceResponse(BaseModel):
@@ -91,12 +93,14 @@ class SemanticPruner:
         # 加载统一填充词库
         self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene)
         
-        # 本体类型列表（用于注入提示词，所有场景均支持）
-        self._ontology_classes = getattr(self.config, "ontology_classes", None) or []
+        # 本体类型列表：直接使用 ontology_class_infos（name + description）
+        self._ontology_class_infos = getattr(self.config, "ontology_class_infos", None) or []
+        # _ontology_classes 仅用于日志统计
+        self._ontology_classes = [info.class_name for info in self._ontology_class_infos]
         
         self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}")
-        if self._ontology_classes:
-            self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}")
+        if self._ontology_class_infos:
+            self._log(f"[剪枝-初始化] 注入本体类型({len(self._ontology_class_infos)}个): {self._ontology_classes}")
         else:
             self._log(f"[剪枝-初始化] 未找到本体类型，将使用通用提示词")
         
@@ -121,7 +125,8 @@ class SemanticPruner:
         1. 空消息
         2. 场景特定填充词库精确匹配
         3. 常见寒暄精确匹配
-        4. 纯表情/标点
+        4. 组合寒暄模式（前缀+后缀组合，如"好的谢谢"、"同学你好"、"明白了"）
+        5. 纯表情/标点
         """
         t = message.msg.strip()
         if not t:
@@ -143,6 +148,55 @@ class SemanticPruner:
         if t in common_greetings:
             return True
 
+        # 组合寒暄模式：短消息（≤15字）且完全由寒暄成分构成
+        # 策略：将消息拆分后，每个片段都能在填充词库或常见寒暄中找到，则整体为填充
+        if len(t) <= 15:
+            # 确认+称呼/感谢组合，如"好的谢谢"、"明白了"、"知道了谢谢"
+            _confirm_prefixes = {"好的", "好", "嗯", "嗯嗯", "哦", "明白", "明白了", "知道了", "了解", "收到", "没问题"}
+            _thanks_suffixes = {"谢谢", "谢谢你", "谢谢您", "多谢", "感谢", "谢了"}
+            _greeting_suffixes = {"你好", "您好", "老师好", "同学好", "大家好"}
+            _greeting_prefixes = {"同学", "老师", "您好", "你好"}
+            _close_patterns = {
+                "没有了", "没事了", "没问题了", "好了", "行了", "可以了",
+                "不用了", "不需要了", "就这样", "就这样吧", "那就这样",
+            }
+            _polite_responses = {
+                "不客气", "不用谢", "没关系", "没事", "应该的", "这是我应该做的",
+            }
+
+            # Rule 1: confirm word + thanks word, joined directly or by a full-width/ASCII comma (e.g. "好的谢谢", "嗯，谢谢")
+            for cp in _confirm_prefixes:
+                for ts in _thanks_suffixes:
+                    if t in (cp + ts, cp + "，" + ts, cp + "," + ts):
+                        return True
+
+            # Rule 2: address term + greeting, exact forms only (e.g. "同学你好", "老师好") so substantive text ending in "好" is kept
+            for gp in _greeting_prefixes:
+                for gs in _greeting_suffixes:
+                    if t == gp + gs or t == gp + "好":
+                        return True
+
+            # 规则3：结束语 + 感谢（如"没有了，谢谢老师"、"没有了谢谢"）
+            for cp in _close_patterns:
+                if t.startswith(cp):
+                    remainder = t[len(cp):].lstrip("，,、 ")
+                    if not remainder or any(remainder.startswith(ts) for ts in _thanks_suffixes):
+                        return True
+
+            # 规则4：礼貌回应（如"不客气，祝你考试顺利"——前缀是礼貌词，后半是祝福套话）
+            for pr in _polite_responses:
+                if t.startswith(pr):
+                    remainder = t[len(pr):].lstrip("，,、 ")
+                    # 后半是祝福/套话（不含实质信息）
+                    if not remainder or re.match(r"^(祝|希望|期待|加油|顺利|好好|保重)", remainder):
+                        return True
+
+            # 规则5：纯确认词加"了"后缀（如"明白了"、"知道了"、"好了"）
+            _confirm_base = {"明白", "知道", "了解", "收到", "好", "行", "可以", "没问题"}
+            for cb in _confirm_base:
+                if t == cb + "了" or t == cb + "了。" or t == cb + "了！":
+                    return True
+
         # 检查是否为纯表情符号（方括号包裹）
         if re.fullmatch(r"(\[[^\]]+\])+", t):
             return True
@@ -331,13 +385,13 @@ class SemanticPruner:
 
         rendered = self.template.render(
             pruning_scene=self.config.pruning_scene,
-            ontology_classes=self._ontology_classes,
+            ontology_class_infos=self._ontology_class_infos,
             dialog_text=dialog_text,
             language=self.language
         )
         log_template_rendering("extracat_Pruning.jinja2", {
             "pruning_scene": self.config.pruning_scene,
-            "ontology_classes_count": len(self._ontology_classes),
+            "ontology_class_infos_count": len(self._ontology_class_infos),
             "language": self.language
         })
         log_prompt_rendering("pruning-extract", rendered)
@@ -377,6 +431,189 @@ class SemanticPruner:
                     )
                     return fallback_response
 
+    def _get_pruning_mode(self) -> str:
+        """根据 pruning_threshold 返回当前剪枝阶段。
+
+        - 低阈值 [0.0, 0.3)：conservative  只删填充，保留所有实质内容
+        - 中阈值 [0.3, 0.6)：semantic      保留场景相关 + 有语义关联的内容，删除无关联内容
+        - 高阈值 [0.6, 0.9]：strict        只保留场景相关内容，跨场景内容可被删除
+        """
+        t = float(self.config.pruning_threshold)
+        if t < 0.3:
+            return "conservative"
+        elif t < 0.6:
+            return "semantic"
+        else:
+            return "strict"
+
+    def _apply_related_dialog_pruning(
+        self,
+        msgs: List[ConversationMessage],
+        extraction: "DialogExtractionResponse",
+        dialog_label: str,
+        pruning_mode: str,
+    ) -> List[ConversationMessage]:
+        """相关对话统一剪枝入口，消除 prune_dialog / prune_dataset 中的重复逻辑。
+
+        - conservative：只删填充
+        - semantic / strict：场景感知剪枝
+        """
+        if pruning_mode == "conservative":
+            preserve_tokens = self._build_preserve_tokens(extraction)
+            return self._prune_fillers_only(msgs, preserve_tokens, dialog_label)
+        else:
+            return self._prune_with_scene_filter(msgs, extraction, dialog_label, pruning_mode)
+
+    def _prune_fillers_only(
+        self,
+        msgs: List[ConversationMessage],
+        preserve_tokens: List[str],
+        dialog_label: str,
+    ) -> List[ConversationMessage]:
+        """相关对话专用：只删填充消息，LLM 保护消息和实质内容一律保留。
+
+        不受 pruning_threshold 约束，删多少算多少（填充有多少删多少）。
+        至少保留 1 条消息。
+        注意：填充检测优先于 preserve_tokens 保护——填充消息本身无信息价值，
+        即使 LLM 误将其关键词放入 preserve_tokens 也应删除。
+        """
+        to_delete_ids: set = set()
+        for m in msgs:
+            # 填充检测优先：先判断是否为填充，再看 LLM 保护
+            if self._is_filler_message(m):
+                to_delete_ids.add(id(m))
+                self._log(f"  [填充] '{m.msg[:40]}' → 删除")
+                continue
+            if self._msg_matches_tokens(m, preserve_tokens):
+                self._log(f"  [保护] '{m.msg[:40]}' → LLM保护，跳过")
+
+        kept = [m for m in msgs if id(m) not in to_delete_ids]
+        if not kept and msgs:
+            kept = [msgs[0]]
+
+        deleted = len(msgs) - len(kept)
+        self._log(
+            f"[剪枝-相关] {dialog_label} 总消息={len(msgs)} "
+            f"填充删除={deleted} 保留={len(kept)}"
+        )
+        return kept
+
+    def _prune_with_scene_filter(
+        self,
+        msgs: List[ConversationMessage],
+        extraction: "DialogExtractionResponse",
+        dialog_label: str,
+        mode: str,
+    ) -> List[ConversationMessage]:
+        """场景感知剪枝，供 semantic / strict 两个阈值档位调用。
+
+        本函数体现剪枝系统的三层递进逻辑：
+
+        第一层（conservative，阈值 < 0.3）：
+            不进入本函数，由 _prune_fillers_only 处理。
+            保留标准：只问"有没有信息量"，填充消息（嗯/好的/哈哈等）删除，其余一律保留。
+
+        第二层（semantic，阈值 [0.3, 0.6)）：
+            保留标准：内容价值优先，场景相关性是参考而非唯一标准。
+            - 填充消息 → 删除（最高优先级）
+            - 场景相关消息 → 保留
+            - 场景无关消息 → 有两次豁免机会：
+                1. 命中 scene_preserve_tokens（LLM 标记的关键词/时间/金额等）→ 保留
+                2. 含情感词（感觉/压力/开心等）→ 保留（情感内容有记忆价值）
+                3. 两次豁免均未命中 → 删除
+
+        第三层（strict，阈值 [0.6, 0.9]）：
+            保留标准：场景相关性优先，豁免权极度收窄。
+            - 填充消息 → 删除（最高优先级）
+            - 场景相关消息 → 保留
+            - 场景无关消息 → 直接删除，仅保留一个例外：
+                LLM 同时将该消息放入 preserve_keywords（自相矛盾时以情感标记为准）→ 保留
+            注意：strict 模式下情感词兜底不再生效，场景相关性是最终裁决标准。
+
+        至少保留 1 条消息（兜底取第一条）。
+        """
+        # NOTE: scene_preserve_tokens only alters a deletion decision in semantic mode, where it
+        # exempts scene-unrelated messages. In strict mode it is narrowed to the structured fields
+        # (times/ids/amounts/contacts/addresses) and only selects the "[保护]" log line below.
+        if mode == "strict":
+            scene_preserve_tokens = (
+                extraction.times + extraction.ids + extraction.amounts +
+                extraction.contacts + extraction.addresses
+            )
+        else:
+            scene_preserve_tokens = self._build_preserve_tokens(extraction)
+
+        unrelated_snippets = extraction.scene_unrelated_snippets or []
+
+        to_delete_ids: set = set()
+        for m in msgs:
+            msg_text = m.msg.strip()
+
+            # 第一优先级：填充消息无论模式直接删除，不参与后续场景判断
+            if self._is_filler_message(m):
+                to_delete_ids.add(id(m))
+                self._log(f"  [填充] '{msg_text[:40]}' → 删除")
+                continue
+
+            # 双向包含匹配：处理 LLM 返回片段与原始消息文本长度不完全一致的情况
+            is_scene_unrelated = any(
+                snip and (snip in msg_text or msg_text in snip)
+                for snip in unrelated_snippets
+            )
+
+            if is_scene_unrelated:
+                if mode == "strict":
+                    # strict：场景无关 → 删除
+                    # 唯一例外：LLM 同时将该消息标记为 preserve_keywords，
+                    # 说明 LLM 自相矛盾（既认为场景无关又认为值得保留），以 preserve_keywords 为准
+                    if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords):
+                        self._log(f"  [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护，保留")
+                    else:
+                        to_delete_ids.add(id(m))
+                        self._log(f"  [场景无关-严格] '{msg_text[:40]}' → 删除")
+                elif mode == "semantic":
+                    # semantic：场景无关但有内容价值 → 保留
+                    # 豁免第一层：命中 scene_preserve_tokens（关键词/结构化信息保护）
+                    if self._msg_matches_tokens(m, scene_preserve_tokens):
+                        self._log(f"  [保护] '{msg_text[:40]}' → 场景关键词保护，保留")
+                    else:
+                        # 豁免第二层：含情感词，认为有情境记忆价值，即使场景无关也保留
+                        has_contextual_emotion = any(
+                            word in msg_text
+                            for word in ["感觉", "觉得", "心情", "开心", "难过", "高兴", "沮丧",
+                                         "喜欢", "讨厌", "爱", "恨", "担心", "害怕", "兴奋",
+                                         "压力", "累", "疲惫", "烦", "焦虑", "委屈", "感动"]
+                        )
+                        if not has_contextual_emotion:
+                            to_delete_ids.add(id(m))
+                            self._log(f"  [场景无关-语义] '{msg_text[:40]}' → 删除（无情感关联）")
+                        else:
+                            self._log(f"  [场景关联-保留] '{msg_text[:40]}' → 有情感关联，保留")
+            else:
+                # 不在 scene_unrelated_snippets 中 → 场景相关，直接保留
+                if self._msg_matches_tokens(m, scene_preserve_tokens):
+                    self._log(f"  [保护] '{msg_text[:40]}' → LLM保护，跳过")
+                # else: 普通场景相关消息，保留，不输出日志
+
+        kept = [m for m in msgs if id(m) not in to_delete_ids]
+        if not kept and msgs:
+            kept = [msgs[0]]
+
+        deleted = len(msgs) - len(kept)
+        self._log(
+            f"[剪枝-{mode}] {dialog_label} 总消息={len(msgs)} "
+            f"删除={deleted} 保留={len(kept)}"
+        )
+        return kept
+
+    def _build_preserve_tokens(self, extraction: "DialogExtractionResponse") -> List[str]:
+        """统一构建 preserve_tokens，合并 LLM 抽取的所有重要片段。"""
+        return (
+            extraction.times + extraction.ids + extraction.amounts +
+            extraction.contacts + extraction.addresses + extraction.keywords +
+            extraction.preserve_keywords
+        )
+
     def _msg_matches_tokens(self, message: ConversationMessage, tokens: List[str]) -> bool:
         """判断消息是否包含任意抽取到的重要片段。"""
         if not tokens:
@@ -397,16 +634,18 @@ class SemanticPruner:
 
         proportion = float(self.config.pruning_threshold)
         extraction = await self._extract_dialog_important(dialog.content)
+        pruning_mode = self._get_pruning_mode()
+        self._log(f"[剪枝-模式] 阈值={proportion} → 模式={pruning_mode}")
+
         if extraction.is_related:
-            # 相关对话不剪枝
+            kept = self._apply_related_dialog_pruning(
+                dialog.context.msgs, extraction, f"对话ID={dialog.id}", pruning_mode
+            )
+            dialog.context = ConversationContext(msgs=kept)
             return dialog
 
         # 在不相关对话中，LLM 已通过 preserve_tokens 标记需要保护的内容
-        preserve_tokens = (
-            extraction.times + extraction.ids + extraction.amounts +
-            extraction.contacts + extraction.addresses + extraction.keywords +
-            extraction.preserve_keywords
-        )
+        preserve_tokens = self._build_preserve_tokens(extraction)
         msgs = dialog.context.msgs
 
         # 分类：填充 / 其他可删（LLM保护消息通过不加入任何桶来隐式保护）
@@ -482,6 +721,9 @@ class SemanticPruner:
             f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断"
         )
         
+        pruning_mode = self._get_pruning_mode()
+        self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}")
+        
         result: List[DialogData] = []
         total_original_msgs = 0
         total_deleted_msgs = 0
@@ -505,12 +747,19 @@ class SemanticPruner:
             original_count = len(msgs)
             total_original_msgs += original_count
 
+            # 相关对话：根据阶段决定处理力度
+            if extraction.is_related:
+                kept = self._apply_related_dialog_pruning(
+                    msgs, extraction, f"对话 {d_idx+1}", pruning_mode
+                )
+                deleted_count = original_count - len(kept)
+                total_deleted_msgs += deleted_count
+                dd.context.msgs = kept
+                result.append(dd)
+                continue
+
             # 从 LLM 抽取结果中获取所有需要保留的 token
-            preserve_tokens = (
-                extraction.times + extraction.ids + extraction.amounts +
-                extraction.contacts + extraction.addresses + extraction.keywords +
-                extraction.preserve_keywords  # 情绪/兴趣/爱好关键词
-            )
+            preserve_tokens = self._build_preserve_tokens(extraction)
 
             # 判断是否需要详细日志
             should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
@@ -605,6 +854,18 @@ class SemanticPruner:
         
         self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
 
+        # 补充统计日志（供 _parse_logs_to_structured 正则解析）
+        related_count = sum(1 for ex in extraction_results if ex.is_related)
+        unrelated_count = len(dialogs) - related_count
+        related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
+        unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
+        self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
+        self._log(
+            f"[剪枝-数据集] 相关对话：第[{', '.join(related_indices)}]段；"
+            f"不相关对话：第[{', '.join(unrelated_indices)}]段"
+        )
+        self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条")
+
         # 保存日志
         try:
             from app.core.config import settings
@@ -686,7 +947,7 @@ class SemanticPruner:
         re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
         re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
         re_indices = re.compile(r"相关对话：第\[(.*?)\]段；不相关对话：第\[(.*?)\]段")
-        re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+)\s+分配删除=(\d+)\s+实删=(\d+)\s+保留=(\d+)")
+        re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?(?:删除|实删)=(\d+)\s+保留=(\d+)\b")
         re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
         re_remaining = re.compile(r"剩余对话数=(\d+)")
 
@@ -720,9 +981,8 @@ class SemanticPruner:
                 dialogs.append({
                     "index": parse_int(m.group(1)),
                     "total_messages": parse_int(m.group(2)),
-                    "quota_delete": parse_int(m.group(3)),
-                    "actual_deleted": parse_int(m.group(4)),
-                    "kept": parse_int(m.group(5)),
+                    "deleted": parse_int(m.group(3)),
+                    "kept": parse_int(m.group(4)),
                 })
                 continue
 
diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2
index e204b7f9..3061e663 100644
--- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2
@@ -1,6 +1,7 @@
 {#
   对话级抽取与相关性判定模板（用于剪枝加速）
-  输入：pruning_scene, ontology_classes, dialog_text, language
+  输入：pruning_scene, ontology_class_infos, dialog_text, language
+    - ontology_class_infos: List[{class_name: str, class_description: str}]
   输出：严格 JSON（不要包含任何多余文本），字段：
     - is_related: bool，是否与所选场景相关
     - times: [string]，从对话中抽取的时间相关文本（日期、时间、时间段、有效期等）
@@ -18,20 +19,16 @@
 #}
 
 {# ── 确定场景说明 ── #}
-{% if ontology_classes and ontology_classes | length > 0 %}
+{% if ontology_class_infos and ontology_class_infos | length > 0 %}
   {% if language == 'en' %}
-    {% set custom_types_str = ontology_classes | join(', ') %}
-    {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %}
+    {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %}
   {% else %}
-    {% set custom_types_str = ontology_classes | join('、') %}
-    {% set instruction = '场景「' ~ pruning_scene ~ '」：对话涉及以下任意实体类型时视为相关：' ~ custom_types_str ~ '。' %}
+    {% set instruction = '场景「' ~ pruning_scene ~ '」：对话涉及以下任意实体类型时视为相关。' %}
   {% endif %}
 {% else %}
   {% if language == 'en' %}
-    {% set custom_types_str = '' %}
     {% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
   {% else %}
-    {% set custom_types_str = '' %}
     {% set instruction = '场景「' ~ pruning_scene ~ '」：根据对话整体内容判断是否与该场景相关。' %}
   {% endif %}
 {% endif %}
@@ -42,8 +39,17 @@
 2. 从对话中抽取所有需要保留的重要信息片段。
 
 场景说明：{{ instruction }}
-{% if custom_types_str %}
-重要提示：只要对话中出现与上述实体类型（{{ custom_types_str }}）相关的内容，即判定为相关（is_related=true）。
+
+{% if ontology_class_infos and ontology_class_infos | length > 0 %}
+【本场景实体类型定义】
+以下实体类型定义了本场景中哪些内容是重要的。
+凡是与以下任意类型相关的内容，都必须保留，并将关键词/短语提取到 keywords 字段：
+
+{% for info in ontology_class_infos %}
+- {{ info.class_name }}：{{ info.class_description }}
+{% endfor %}
+
+重要提示：只要对话中出现与上述任意实体类型相关的内容，即判定为相关（is_related=true）。
 {% endif %}
 
 ---
@@ -51,13 +57,40 @@
 以下类型的内容无论是否与场景直接相关，都必须保留，请将其关键词/短语抽取到对应字段：
 - 时间信息：日期、时间点、时间段、有效期 → times 字段
 - 编号信息：学号、工号、订单号、申请号、账号、ID → ids 字段
-- 金额信息：价格、费用、金额（含货币符号或单位） → amounts 字段
+- 金额信息：价格、费用、金额（含货币符号或单位，如"100元"、"¥200"）→ amounts 字段（注意：考试分数、成绩分数不属于金额，不要放入此字段）
 - 联系方式：电话、手机号、邮箱、微信、QQ → contacts 字段
 - 地址信息：地点、地址、位置 → addresses 字段
-- 场景关键词：与场景强相关的专业术语、事件名称 → keywords 字段
+- 场景关键词：与**当前场景**强相关的专业术语、事件名称 → keywords 字段（注意：只放与当前场景直接相关的词，跨场景的内容不要放入此字段）
 - **情绪与情感**：喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
 - **兴趣与爱好**：喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
-- **个人观点与态度**：对某事物的明确看法、评价、立场 → preserve_keywords 字段
+- **个人情感态度**：对人际关系、情感状态的明确表达（如"我跟室友闹矛盾了"、"我都快抑郁了"）→ preserve_keywords 字段
+- 注意：学业目标（如"我想考研"）、成绩（如"87分"）、学科偏好（如"喜欢数学"）属于学业信息，不属于情绪/情感，不要放入 preserve_keywords 字段
+
+【场景无关内容标记】
+请从对话中识别出与当前场景（{{ pruning_scene }}）**既不相关、也无语义关联**的消息片段，将其原文（或关键片段）提取到 scene_unrelated_snippets 字段。
+判断标准：
+- 与场景实体类型完全无关
+- 与场景话题没有因果/时间/情境上的关联（例如：不是"因为上课所以累"这种关联）
+- 纯粹是另一个话题的内容（如在教育场景中讨论购物、娱乐等）
+注意：有情绪/感受表达的消息即使话题不同，也可能有语义关联，请谨慎标记。
+
+**重要：scene_unrelated_snippets 必须认真填写，不要因省事而留空；仅当对话中确实不存在场景无关内容时才返回空数组。**
+如果对话中存在与场景无关的内容，必须将其原文片段提取出来。
+
+示例（场景=在线教育）：
+- "我最近心情很差，跟室友闹矛盾了" → 与教育场景无关，加入 scene_unrelated_snippets
+- "她总是很晚回来吵到我睡觉" → 与教育场景无关，加入 scene_unrelated_snippets
+- "对，我都快抑郁了" → 与教育场景无关，加入 scene_unrelated_snippets
+- "期末考试12月25日" → 与教育场景相关，不加入 scene_unrelated_snippets
+- "我上次高数作业87分" → 与教育场景相关，不加入 scene_unrelated_snippets
+- "我的目标是考研" → 与教育场景相关，不加入 scene_unrelated_snippets
+
+示例（场景=情感陪伴）：
+- "我最近心情很差，跟室友闹矛盾了" → 与情感陪伴场景相关（情绪+关系），不加入 scene_unrelated_snippets
+- "对，我都快抑郁了" → 与情感陪伴场景相关（情绪），不加入 scene_unrelated_snippets
+- "期末考试12月25日，3号教学楼201室" → 与情感陪伴场景无关（教育信息），加入 scene_unrelated_snippets
+- "我上次高数作业87分，这次能考好吗" → 与情感陪伴场景无关（学业信息），加入 scene_unrelated_snippets
+- "我的目标是考研，想读应用数学" → 与情感陪伴场景无关（学业目标），加入 scene_unrelated_snippets
 
 【可以删除的内容】
 以下类型的内容属于低价值信息，可以在剪枝时删除：
@@ -88,7 +121,8 @@
   "contacts": [<string>...],
   "addresses": [<string>...],
   "keywords": [<string>...],
-  "preserve_keywords": [<string>...]
+  "preserve_keywords": [<string>...],
+  "scene_unrelated_snippets": [<string>...]
 }
 {% else %}
 You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
@@ -96,8 +130,17 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue
 2. Extract all important information fragments that must be preserved.
 
 Scenario Description: {{ instruction }}
-{% if custom_types_str %}
-Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true).
+
+{% if ontology_class_infos and ontology_class_infos | length > 0 %}
+[Scene Entity Type Definitions]
+The following entity types define what content is important in this scene.
+Content related to ANY of these types must be preserved and extracted into the keywords field:
+
+{% for info in ontology_class_infos %}
+- {{ info.class_name }}: {{ info.class_description }}
+{% endfor %}
+
+Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true).
 {% endif %}
 
 ---
@@ -105,13 +148,22 @@ Important: If the dialogue contains content related to any of the entity types a
 The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
 - Time information: dates, time points, durations, expiry dates → times field
 - ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
-- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field
+- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here)
 - Contact information: phone numbers, emails, WeChat, QQ → contacts field
 - Address information: locations, addresses, places → addresses field
-- Scene keywords: professional terms and event names strongly related to the scene → keywords field
+- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here)
 - **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
 - **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
-- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field
+- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field
+- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords
+
+[Scene-Unrelated Content Marking]
+Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field.
+Criteria:
+- Completely unrelated to the scene's entity types
+- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated)
+- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene)
+Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully.
 
 [CAN BE DELETED]
 The following types of content are low-value and can be removed during pruning:
@@ -141,6 +193,7 @@ Output strict JSON only (fixed keys, order doesn't matter):
   "contacts": [<string>...],
   "addresses": [<string>...],
   "keywords": [<string>...],
-  "preserve_keywords": [<string>...]
+  "preserve_keywords": [<string>...],
+  "scene_unrelated_snippets": [<string>...]
 }
 {% endif %}
diff --git a/api/app/schemas/memory_config_schema.py b/api/app/schemas/memory_config_schema.py
index 0c359d70..8d7490fe 100644
--- a/api/app/schemas/memory_config_schema.py
+++ b/api/app/schemas/memory_config_schema.py
@@ -417,7 +417,7 @@ class MemoryConfig:
     
     # Ontology scene association
     scene_id: Optional[UUID] = None
-    ontology_classes: Optional[list] = field(default=None)
+    ontology_class_infos: list[dict] = field(default_factory=list)
     
     def __post_init__(self):
         """Validate configuration after initialization."""
diff --git a/api/app/services/memory_config_service.py b/api/app/services/memory_config_service.py
index 4d67673f..a3751c07 100644
--- a/api/app/services/memory_config_service.py
+++ b/api/app/services/memory_config_service.py
@@ -107,28 +107,29 @@ def _validate_config_id(config_id, db: Session = None):
     )
 
 
-def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]:
-    """从 ontology_class 表加载场景类型名称列表，用于注入提示词。
+def _load_ontology_class_infos(db: Session, scene_id) -> list:
+    """从 ontology_class 表加载完整本体类型信息（name + description），用于注入剪枝提示词。
 
     Args:
         db: 数据库会话
         scene_id: 本体场景 UUID
-        pruning_scene: 语义剪枝场景名称（保留参数，暂未使用）
 
     Returns:
-        class_name 字符串列表，或 None（无数据时）
+        [{"class_name": ..., "class_description": ...}, ...] 或空列表
     """
     if not scene_id:
-        return None
+        return []
     try:
         from app.repositories.ontology_class_repository import OntologyClassRepository
         repo = OntologyClassRepository(db)
         classes = repo.get_classes_by_scene(scene_id)
-        names = [c.class_name for c in classes if c.class_name]
-        return names if names else None
+        return [
+            {"class_name": c.class_name, "class_description": c.class_description or ""}
+            for c in classes if c.class_name
+        ]
     except Exception as e:
-        logger.warning(f"Failed to load ontology classes for scene_id={scene_id}: {e}")
-        return None
+        logger.warning(f"Failed to load ontology class infos for scene_id={scene_id}: {e}")
+        return []
 
 
 class MemoryConfigService:
@@ -383,7 +384,7 @@ class MemoryConfigService:
                 pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5,
                 # Ontology scene association
                 scene_id=memory_config.scene_id,
-                ontology_classes=_load_ontology_classes(self.db, memory_config.scene_id, memory_config.pruning_scene),
+                ontology_class_infos=_load_ontology_class_infos(self.db, memory_config.scene_id),
             )
 
             elapsed_ms = (time.time() - start_time) * 1000
@@ -550,11 +551,13 @@ class MemoryConfigService:
             - pruning_switch: bool
             - pruning_scene: str
             - pruning_threshold: float
+            - ontology_class_infos: list of {class_name, class_description} dicts
         """
         return {
             "pruning_switch": memory_config.pruning_enabled,
             "pruning_scene": memory_config.pruning_scene,
             "pruning_threshold": memory_config.pruning_threshold,
+            "ontology_class_infos": memory_config.ontology_class_infos or [],
         }
 
     def get_ontology_types(self, memory_config: MemoryConfig):
diff --git a/api/app/services/pilot_run_service.py b/api/app/services/pilot_run_service.py
index b63bc0db..b473140d 100644
--- a/api/app/services/pilot_run_service.py
+++ b/api/app/services/pilot_run_service.py
@@ -121,7 +121,7 @@ async def run_pilot_extraction(
                     "pruning_scene": memory_config.pruning_scene,
                     "pruning_threshold": memory_config.pruning_threshold,
                     "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
-                    "ontology_classes": memory_config.ontology_classes,
+                    "ontology_class_infos": memory_config.ontology_class_infos,
                 }
                 config = PruningConfig(**pruning_config_dict)