Merge pull request #528 from SuanmoSuanyangTechnology/feature/pruning-optimize

Feature/pruning optimize
2026-03-10 17:37:43 +08:00
parent 1e1675ec12 97eabc0c36
commit fcb3845543
3 changed files with 150 additions and 62 deletions
--- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
+++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
@@ -33,6 +33,7 @@ class DialogExtractionResponse(BaseModel):
    - is_related：对话与场景的相关性判定。
    - times / ids / amounts / contacts / addresses / keywords：重要信息片段，用来在不相关对话中保留关键消息。
    - preserve_keywords：情绪/兴趣/爱好/个人观点相关词，包含这些词的消息必须强制保留。
    """
    is_related: bool = Field(...)
    times: List[str] = Field(default_factory=list)
@@ -41,6 +42,7 @@ class DialogExtractionResponse(BaseModel):
    contacts: List[str] = Field(default_factory=list)
    addresses: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词，包含这些词的消息强制保留")
 class MessageImportanceResponse(BaseModel):
@@ -198,17 +200,16 @@ class SemanticPruner:
        return min(score, 10)  # 最高10分
    # 情绪/兴趣/爱好安全防线正则已移除，改由 extracat_Pruning.jinja2 提示词中的 preserve_keywords 机制处理
    def _is_filler_message(self, message: ConversationMessage) -> bool:
        """检测典型寒暄/口头禅/确认类短消息。
-        改进版：更严格的填充消息判断，避免误删场景相关内容
+        判断顺序：
-        满足以下之一视为填充消息：
+        1. 空消息
-        - 纯标点或空白
+        2. 场景特定填充词库精确匹配
-        - 在场景特定填充词库中（精确匹配）
+        3. 常见寒暄精确匹配
-        - 纯表情符号
+        4. 纯表情/标点
        - 常见寒暄（精确匹配短语）
        注意：不再使用长度判断，避免误删短但重要的消息
        """
        t = message.msg.strip()
        if not t:
@@ -234,20 +235,6 @@ class SemanticPruner:
        if re.fullmatch(r"(\[[^\]]+\])+", t):
            return True
        # 检查是否为纯emoji（Unicode表情）
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # 表情符号
            "\U0001F300-\U0001F5FF"  # 符号和象形文字
            "\U0001F680-\U0001F6FF"  # 交通和地图符号
            "\U0001F1E0-\U0001F1FF"  # 旗帜
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        if emoji_pattern.fullmatch(t):
            return True
        # 纯标点符号
        if re.fullmatch(r"[。！？,.!?…·\s]+", t):
            return True
@@ -595,43 +582,62 @@ class SemanticPruner:
        total_original_msgs = 0
        total_deleted_msgs = 0
-        for d_idx, dd in enumerate(dialogs):
+        # 并发执行所有对话的 LLM 抽取（获取 preserve_keywords 等保护信息）
        semaphore = asyncio.Semaphore(self.max_concurrent)
        async def extract_with_semaphore(dd: DialogData) -> DialogExtractionResponse:
            """Run the LLM extraction for one dialog while holding the shared semaphore.

            If the extraction raises, the error is logged (message truncated to
            100 characters) and a fallback response built with only
            ``is_related=True`` is returned instead of propagating the exception.
            """
            async with semaphore:
                try:
                    extracted = await self._extract_dialog_important(dd.content)
                except Exception as e:
                    # Degraded path: log the truncated reason and fall back to a
                    # response that marks the dialog as related.
                    reason = str(e)[:100]
                    self._log(f"[剪枝-LLM] 对话抽取失败，使用降级策略: {reason}")
                    extracted = DialogExtractionResponse(is_related=True)
            return extracted
        extraction_tasks = [extract_with_semaphore(dd) for dd in dialogs]
        extraction_results: List[DialogExtractionResponse] = await asyncio.gather(*extraction_tasks)
        for d_idx, (dd, extraction) in enumerate(zip(dialogs, extraction_results)):
            msgs = dd.context.msgs
            original_count = len(msgs)
            total_original_msgs += original_count
-            # ========== 问答对保护（已注释，暂不启用，留作观察） ==========
+            # 从 LLM 抽取结果中获取所有需要保留的 token
-            # qa_pairs = self._identify_qa_pairs(msgs)
+            preserve_tokens = (
-            # protected_indices = self._get_protected_indices(msgs, qa_pairs, window_size=0)
+                extraction.times + extraction.ids + extraction.amounts +
-            # ========================================================
+                extraction.contacts + extraction.addresses + extraction.keywords +
                extraction.preserve_keywords  # 情绪/兴趣/爱好关键词
            )
-            # 消息级分类：每条消息独立判断
+            # 判断是否需要详细日志
            important_msgs = []  # 重要消息（保留）
            unimportant_msgs = []  # 不重要消息（可删除）
            filler_msgs = []  # 填充消息（优先删除）
            # 判断是否需要详细日志（仅对前N条消息记录）
            should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
            if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog:
                self._log(f"  对话[{d_idx}]消息数={original_count}，仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志")
            if extraction.preserve_keywords:
                self._log(f"  对话[{d_idx}] LLM抽取到情绪/兴趣保护词: {extraction.preserve_keywords}")
            # 消息级分类：每条消息独立判断
            llm_protected_msgs = []  # LLM 保护消息（情绪/兴趣/重要token）：绝对不可删除
            rule_important_msgs = [] # 规则层重要消息（场景规则）：配额不足时可少量删除
            unimportant_msgs = []    # 不重要消息（可删除）
            filler_msgs = []         # 填充消息（优先删除）
            for idx, m in enumerate(msgs):
                msg_text = m.msg.strip()
-                # ========== 问答对保护判断（已注释） ==========
+                # LLM 保护：消息包含 preserve_keywords（情绪/兴趣词）或其他重要 token → 绝对不可删除
-                # if idx in protected_indices:
+                if self._msg_matches_tokens(m, preserve_tokens):
-                #     important_msgs.append((idx, m))
+                    llm_protected_msgs.append((idx, m))
-                #     self._log(f"  [{idx}] '{msg_text[:30]}...' → 重要（问答对保护）")
+                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
-                # ==========================================
+                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 重要（LLM保护，不可删）")
                # 填充消息（寒暄、表情等）
-                if self._is_filler_message(m):
+                elif self._is_filler_message(m):
                    filler_msgs.append((idx, m))
                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 填充")
-                # 重要信息（学号、成绩、时间、金额等）
+                # 规则层重要信息（学号、成绩、时间、金额等）
                elif self._is_important_message(m):
-                    important_msgs.append((idx, m))
+                    rule_important_msgs.append((idx, m))
                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 重要（场景规则）")
                # 其他消息
@@ -640,6 +646,9 @@ class SemanticPruner:
                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 不重要")
            # important_msgs 仅用于日志统计（兼容下方日志输出）
            important_msgs = llm_protected_msgs + rule_important_msgs
            # 计算删除配额
            delete_target = int(original_count * proportion)
            if proportion > 0 and original_count > 0 and delete_target == 0:
@@ -669,17 +678,17 @@ class SemanticPruner:
                    to_delete_indices.add(idx)
                    deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'")
-            # 第三步：如果还需要删除，按重要性分数删除重要消息
+            # 第三步：如果还需要删除，按重要性分数删除规则层重要消息（LLM保护消息绝对不删）
            remaining_quota = delete_target - len(to_delete_indices)
-            if remaining_quota > 0 and important_msgs:
+            if remaining_quota > 0 and rule_important_msgs:
                # 按重要性分数排序（分数低的优先删除）
-                imp_sorted = sorted(important_msgs, key=lambda x: self._importance_score(x[1]))
+                imp_sorted = sorted(rule_important_msgs, key=lambda x: self._importance_score(x[1]))
                imp_to_delete = min(len(imp_sorted), remaining_quota)
                for i in range(imp_to_delete):
                    idx, msg = imp_sorted[i]
                    to_delete_indices.add(idx)
                    score = self._importance_score(msg)
-                    deleted_details.append(f"[{idx}] 重要(分数{score}): '{msg.msg[:50]}'")
+                    deleted_details.append(f"[{idx}] 规则重要(分数{score}): '{msg.msg[:50]}'")
            # 执行删除
            kept_msgs = []
--- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py
+++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py
@@ -51,6 +51,22 @@ class SceneConfigRegistry:
        (r"今天|明天|后天|昨天|前天", 3),  # 相对时间（提高权重）
        (r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3),
        (r"今年|去年|明年", 3),
        # ---- 情绪内容（所有场景通用，用于情绪提取） ----
        (r"开心|高兴|快乐|兴奋|愉快|幸福|满足|喜悦|欣喜", 4),
        (r"难过|悲伤|伤心|痛苦|委屈|失落|沮丧|郁闷|忧郁|绝望", 4),
        (r"生气|愤怒|烦躁|焦虑|紧张|害怕|恐惧|担心|担忧|压力", 4),
        (r"感动|温暖|感激|感谢|惊喜|期待|憧憬|向往", 3),
        (r"无聊|无奈|尴尬|后悔|遗憾|羞愧|惭愧", 3),
        (r"好[开高快]心|很[开高快]心|超[开高快]心|非常[开高快]心", 4),
        (r"好难过|好伤心|好悲伤|好委屈|好痛苦", 4),
        (r"好开心|好高兴|好快乐|好幸福|好感动", 4),
        # ---- 兴趣/爱好内容（所有场景通用，用于兴趣提取） ----
        (r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|痴迷", 4),
        (r"不喜欢|讨厌|厌恶|反感|排斥", 3),
        (r"羽毛球|篮球|足球|排球|乒乓球|网球|棒球|高尔夫", 4),
        (r"游泳|跑步|健身|瑜伽|舞蹈|武术|骑行|登山|徒步", 4),
        (r"音乐|唱歌|吉他|钢琴|绘画|摄影|书法|手工|烹饪", 4),
        (r"游戏|电影|动漫|小说|阅读|旅游|美食|宠物", 3),
    ]
    BASE_LOW_PRIORITY = [
@@ -58,6 +74,8 @@ class SceneConfigRegistry:
        (r"\d{1,2}点\d{0,2}分?", 2),  # 时间点 X点Y分 或 X点
        (r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2),  # 时段（提高权重并扩充）
        (r"AM|PM|am|pm", 1),
        # ---- 情绪程度副词（辅助情绪识别） ----
        (r"特别|非常|超级|极其|十分|很|好[开高快]|太.*了", 1),
    ]
    BASE_FILLERS = {
--- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2
@@ -9,10 +9,11 @@
    - contacts: [string]，联系方式（电话/手机号/邮箱/微信/QQ等）
    - addresses: [string]，地址/地点相关文本
    - keywords: [string]，其它有助于保留的重要关键词（与场景强相关的术语）
    - preserve_keywords: [string]，必须保留的情绪/兴趣/爱好/个人偏好相关词或短语片段
  要求：
  - 必须只输出上述 JSON，且键名一致；不得输出解释、前后缀；不得包含注释。
-  - times/ids/amounts/contacts/addresses/keywords 仅抽取原文片段或规范化后的简单字符串。
+  - times/ids/amounts/contacts/addresses/keywords/preserve_keywords 仅抽取原文片段或规范化后的简单字符串。
  - 仅输出上述键；避免多余解释或字段。
 #}
@@ -34,13 +35,11 @@
 {# ── 确定最终使用的场景说明 ── #}
 {% if is_builtin_scene %}
  {# 内置专门场景：使用固定说明 #}
  {% set scene_key = pruning_scene %}
  {% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %}
  {% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %}
  {% set custom_types_str = '' %}
 {% else %}
  {# 自定义场景：使用场景名称 + 本体类型列表构建说明 #}
  {% if ontology_classes and ontology_classes | length > 0 %}
    {% if language == 'en' %}
      {% set custom_types_str = ontology_classes | join(', ') %}
@@ -50,7 +49,6 @@
      {% set instruction = '自定义场景「' ~ pruning_scene ~ '」：对话涉及以下任意实体类型时视为相关：' ~ custom_types_str ~ '。' %}
    {% endif %}
  {% else %}
    {# 无本体类型时退化为通用说明 #}
    {% if language == 'en' %}
      {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
    {% else %}
@@ -61,12 +59,43 @@
 {% endif %}
 {% if language == "zh" %}
-请在下方对话全文基础上，按该场景进行一次性抽取并判定相关性：
+你是一个对话内容分析助手。请对下方对话全文进行一次性分析，完成两项任务：
 1. 判断对话是否与指定场景相关；
 2. 从对话中抽取所有需要保留的重要信息片段。
 场景说明：{{ instruction }}
 {% if not is_builtin_scene and custom_types_str %}
 重要提示：只要对话中出现与上述实体类型（{{ custom_types_str }}）相关的内容，即判定为相关（is_related=true）。
 {% endif %}
 ---
 【必须保留的内容（不可删除）】
 以下类型的内容无论是否与场景直接相关，都必须保留，请将其关键词/短语抽取到对应字段：
 - 时间信息：日期、时间点、时间段、有效期 → times 字段
 - 编号信息：学号、工号、订单号、申请号、账号、ID → ids 字段
 - 金额信息：价格、费用、金额（含货币符号或单位） → amounts 字段
 - 联系方式：电话、手机号、邮箱、微信、QQ → contacts 字段
 - 地址信息：地点、地址、位置 → addresses 字段
 - 场景关键词：与场景强相关的专业术语、事件名称 → keywords 字段
 - **情绪与情感**：喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
 - **兴趣与爱好**：喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
 - **个人观点与态度**：对某事物的明确看法、评价、立场 → preserve_keywords 字段
 【可以删除的内容】
 以下类型的内容属于低价值信息，可以在剪枝时删除：
 - 纯寒暄问候：如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语
 - 纯表情/符号：如"[微笑]"、"😊"、"哈哈"等
 - 重复确认：如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复
 - 无意义填充：如"啊"、"呢"、"嘛"等语气词单独成句
 **注意：即使消息很短，只要包含情绪、兴趣、爱好、个人观点等有价值信息，就必须保留，不得删除。**
 例如：
 - "我好开心呀" → 包含情绪（开心），必须保留，preserve_keywords 中加入"开心"
 - "好喜欢打羽毛球呀" → 包含兴趣爱好（喜欢打羽毛球），必须保留，preserve_keywords 中加入"喜欢打羽毛球"
 - "我好难过" → 包含情绪（难过），必须保留，preserve_keywords 中加入"难过"
 - "太好啦！看到你开心，我也跟着心情亮起来" → 包含情绪，必须保留，preserve_keywords 中加入"开心"
 ---
 对话全文：
 """
 {{ dialog_text }}
@@ -80,15 +109,46 @@
  "amounts": [<string>...],
  "contacts": [<string>...],
  "addresses": [<string>...],
-  "keywords": [<string>...]
+  "keywords": [<string>...],
  "preserve_keywords": [<string>...]
 }
 {% else %}
-Based on the full dialogue below, perform one-time extraction and relevance determination according to this scenario:
+You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
 1. Determine whether the dialogue is relevant to the specified scene;
 2. Extract all important information fragments that must be preserved.
 Scenario Description: {{ instruction }}
 {% if not is_builtin_scene and custom_types_str %}
 Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true).
 {% endif %}
 ---
 [MUST PRESERVE (cannot be deleted)]
 The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
 - Time information: dates, time points, durations, expiry dates → times field
 - ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
 - Amount information: prices, fees, amounts (with currency symbols or units) → amounts field
 - Contact information: phone numbers, emails, WeChat, QQ → contacts field
 - Address information: locations, addresses, places → addresses field
 - Scene keywords: professional terms and event names strongly related to the scene → keywords field
 - **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, feeling upset, feeling wronged, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
 - **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
 - **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field
 [CAN BE DELETED]
 The following types of content are low-value and can be removed during pruning:
 - Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content
 - Pure emojis/symbols: e.g., "[smile]", "😊", "haha"
 - Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information
 - Meaningless fillers: standalone interjections like "ah", "well", "hmm"
 **Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.**
 Examples:
 - "I'm so happy!" → contains emotion (happy), must preserve; add "happy" to preserve_keywords
 - "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords
 - "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords
 ---
 Full Dialogue:
 """
 {{ dialog_text }}
@@ -102,6 +162,7 @@ Output strict JSON only (fixed keys, order doesn't matter):
  "amounts": [<string>...],
  "contacts": [<string>...],
  "addresses": [<string>...],
-  "keywords": [<string>...]
+  "keywords": [<string>...],
  "preserve_keywords": [<string>...]
 }
 {% endif %}