diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index ecbe0411..9e8bc05e 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -33,6 +33,7 @@ class DialogExtractionResponse(BaseModel): - is_related:对话与场景的相关性判定。 - times / ids / amounts / contacts / addresses / keywords:重要信息片段,用来在不相关对话中保留关键消息。 + - preserve_keywords:情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。 """ is_related: bool = Field(...) times: List[str] = Field(default_factory=list) @@ -41,6 +42,7 @@ class DialogExtractionResponse(BaseModel): contacts: List[str] = Field(default_factory=list) addresses: List[str] = Field(default_factory=list) keywords: List[str] = Field(default_factory=list) + preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留") class MessageImportanceResponse(BaseModel): @@ -198,26 +200,37 @@ class SemanticPruner: return min(score, 10) # 最高10分 + # 情绪/兴趣/爱好安全防线正则(类级别,避免重复编译) + _EMOTION_INTEREST_GUARD = re.compile( + r"开心|高兴|快乐|幸福|感动|难过|悲伤|伤心|委屈|失落|沮丧|郁闷|" + r"生气|愤怒|烦躁|焦虑|害怕|担心|压力|兴奋|期待|惊喜|惊讶|" + r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|讨厌|厌恶|" + r"happy|sad|angry|excited|anxious|love|hate|enjoy|like|dislike" + ) + def _is_filler_message(self, message: ConversationMessage) -> bool: """检测典型寒暄/口头禅/确认类短消息。 - 改进版:更严格的填充消息判断,避免误删场景相关内容 - 满足以下之一视为填充消息: - - 纯标点或空白 - - 在场景特定填充词库中(精确匹配) - - 纯表情符号 - - 常见寒暄(精确匹配短语) - - 注意:不再使用长度判断,避免误删短但重要的消息 + 判断顺序: + 1. 情绪/兴趣安全防线(最高优先级):包含情绪词或兴趣词的消息,无论多短都不视为填充 + 2. 空消息 + 3. 场景特定填充词库精确匹配 + 4. 常见寒暄精确匹配 + 5. 
纯表情/标点 """ t = message.msg.strip() if not t: return True - + + # ── 最高优先级:情绪/兴趣安全防线 ── + # "我好开心呀"、"好喜欢打羽毛球呀"、"我好难过" 等一律不视为填充 + if self._EMOTION_INTEREST_GUARD.search(t): + return False + # 检查是否在场景特定填充词库中(精确匹配) if t in self.scene_config.filler_phrases: return True - + # 常见寒暄和问候(精确匹配,避免误删) common_greetings = { "在吗", "在不在", "在呢", "在的", @@ -229,39 +242,29 @@ class SemanticPruner: } if t in common_greetings: return True - + # 检查是否为纯表情符号(方括号包裹) if re.fullmatch(r"(\[[^\]]+\])+", t): return True - + # 检查是否为纯emoji(Unicode表情) emoji_pattern = re.compile( "[" - "\U0001F600-\U0001F64F" # 表情符号 - "\U0001F300-\U0001F5FF" # 符号和象形文字 - "\U0001F680-\U0001F6FF" # 交通和地图符号 - "\U0001F1E0-\U0001F1FF" # 旗帜 + "\U0001F600-\U0001F64F" + "\U0001F300-\U0001F5FF" + "\U0001F680-\U0001F6FF" + "\U0001F1E0-\U0001F1FF" "\U00002702-\U000027B0" "\U000024C2-\U0001F251" "]+", flags=re.UNICODE ) if emoji_pattern.fullmatch(t): return True - + # 纯标点符号 if re.fullmatch(r"[。!?,.!?…·\s]+", t): return True - - # 安全防线:包含情绪词或兴趣词的消息,无论多短都不视为填充 - # 避免"我好开心呀"、"好喜欢打羽毛球呀"等被误删 - _emotion_interest_guard = re.compile( - r"开心|高兴|快乐|幸福|感动|难过|悲伤|伤心|委屈|失落|沮丧|郁闷|" - r"生气|愤怒|烦躁|焦虑|害怕|担心|压力|兴奋|期待|" - r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|讨厌|厌恶" - ) - if _emotion_interest_guard.search(t): - return False - + return False async def _batch_evaluate_importance_with_llm( @@ -604,44 +607,63 @@ class SemanticPruner: result: List[DialogData] = [] total_original_msgs = 0 total_deleted_msgs = 0 - - for d_idx, dd in enumerate(dialogs): + + # 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息) + semaphore = asyncio.Semaphore(self.max_concurrent) + + async def extract_with_semaphore(dd: DialogData) -> DialogExtractionResponse: + async with semaphore: + try: + return await self._extract_dialog_important(dd.content) + except Exception as e: + self._log(f"[剪枝-LLM] 对话抽取失败,使用降级策略: {str(e)[:100]}") + return DialogExtractionResponse(is_related=True) + + extraction_tasks = [extract_with_semaphore(dd) for dd in dialogs] + extraction_results: 
List[DialogExtractionResponse] = await asyncio.gather(*extraction_tasks) + + for d_idx, (dd, extraction) in enumerate(zip(dialogs, extraction_results)): msgs = dd.context.msgs original_count = len(msgs) total_original_msgs += original_count - - # ========== 问答对保护(已注释,暂不启用,留作观察) ========== - # qa_pairs = self._identify_qa_pairs(msgs) - # protected_indices = self._get_protected_indices(msgs, qa_pairs, window_size=0) - # ======================================================== - - # 消息级分类:每条消息独立判断 - important_msgs = [] # 重要消息(保留) - unimportant_msgs = [] # 不重要消息(可删除) - filler_msgs = [] # 填充消息(优先删除) - - # 判断是否需要详细日志(仅对前N条消息记录) + + # 从 LLM 抽取结果中获取所有需要保留的 token + preserve_tokens = ( + extraction.times + extraction.ids + extraction.amounts + + extraction.contacts + extraction.addresses + extraction.keywords + + extraction.preserve_keywords # 情绪/兴趣/爱好关键词 + ) + + # 判断是否需要详细日志 should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog: self._log(f" 对话[{d_idx}]消息数={original_count},仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志") - + + if extraction.preserve_keywords: + self._log(f" 对话[{d_idx}] LLM抽取到情绪/兴趣保护词: {extraction.preserve_keywords}") + + # 消息级分类:每条消息独立判断 + llm_protected_msgs = [] # LLM 保护消息(情绪/兴趣/重要token):绝对不可删除 + rule_important_msgs = [] # 规则层重要消息(场景规则):配额不足时可少量删除 + unimportant_msgs = [] # 不重要消息(可删除) + filler_msgs = [] # 填充消息(优先删除) + for idx, m in enumerate(msgs): msg_text = m.msg.strip() - - # ========== 问答对保护判断(已注释) ========== - # if idx in protected_indices: - # important_msgs.append((idx, m)) - # self._log(f" [{idx}] '{msg_text[:30]}...' 
→ 重要(问答对保护)") - # ========================================== - + + # LLM 保护:消息包含 preserve_keywords(情绪/兴趣词)或其他重要 token → 绝对不可删除 + if self._msg_matches_tokens(m, preserve_tokens): + llm_protected_msgs.append((idx, m)) + if should_log_details or idx < self._max_debug_msgs_per_dialog: + self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(LLM保护,不可删)") # 填充消息(寒暄、表情等) - if self._is_filler_message(m): + elif self._is_filler_message(m): filler_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: self._log(f" [{idx}] '{msg_text[:30]}...' → 填充") - # 重要信息(学号、成绩、时间、金额等) + # 规则层重要信息(学号、成绩、时间、金额等) elif self._is_important_message(m): - important_msgs.append((idx, m)) + rule_important_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)") # 其他消息 @@ -649,6 +671,9 @@ class SemanticPruner: unimportant_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: self._log(f" [{idx}] '{msg_text[:30]}...' 
→ 不重要") + + # important_msgs 仅用于日志统计(兼容下方日志输出) + important_msgs = llm_protected_msgs + rule_important_msgs # 计算删除配额 delete_target = int(original_count * proportion) @@ -679,17 +704,17 @@ class SemanticPruner: to_delete_indices.add(idx) deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'") - # 第三步:如果还需要删除,按重要性分数删除重要消息 + # 第三步:如果还需要删除,按重要性分数删除规则层重要消息(LLM保护消息绝对不删) remaining_quota = delete_target - len(to_delete_indices) - if remaining_quota > 0 and important_msgs: + if remaining_quota > 0 and rule_important_msgs: # 按重要性分数排序(分数低的优先删除) - imp_sorted = sorted(important_msgs, key=lambda x: self._importance_score(x[1])) + imp_sorted = sorted(rule_important_msgs, key=lambda x: self._importance_score(x[1])) imp_to_delete = min(len(imp_sorted), remaining_quota) for i in range(imp_to_delete): idx, msg = imp_sorted[i] to_delete_indices.add(idx) score = self._importance_score(msg) - deleted_details.append(f"[{idx}] 重要(分数{score}): '{msg.msg[:50]}'") + deleted_details.append(f"[{idx}] 规则重要(分数{score}): '{msg.msg[:50]}'") # 执行删除 kept_msgs = [] diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 index 6b620df9..47b3badb 100644 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 @@ -9,10 +9,11 @@ - contacts: [string],联系方式(电话/手机号/邮箱/微信/QQ等) - addresses: [string],地址/地点相关文本 - keywords: [string],其它有助于保留的重要关键词(与场景强相关的术语) + - preserve_keywords: [string],必须保留的情绪/兴趣/爱好/个人偏好相关词或短语片段 要求: - 必须只输出上述 JSON,且键名一致;不得输出解释、前后缀;不得包含注释。 - - times/ids/amounts/contacts/addresses/keywords 仅抽取原文片段或规范化后的简单字符串。 + - times/ids/amounts/contacts/addresses/keywords/preserve_keywords 仅抽取原文片段或规范化后的简单字符串。 - 仅输出上述键;避免多余解释或字段。 #} @@ -34,13 +35,11 @@ {# ── 确定最终使用的场景说明 ── #} {% if is_builtin_scene %} - {# 内置专门场景:使用固定说明 #} {% set scene_key = pruning_scene %} {% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' 
%}{% endif %} {% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %} {% set custom_types_str = '' %} {% else %} - {# 自定义场景:使用场景名称 + 本体类型列表构建说明 #} {% if ontology_classes and ontology_classes | length > 0 %} {% if language == 'en' %} {% set custom_types_str = ontology_classes | join(', ') %} @@ -50,7 +49,6 @@ {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} {% endif %} {% else %} - {# 无本体类型时退化为通用说明 #} {% if language == 'en' %} {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} {% else %} @@ -61,12 +59,43 @@ {% endif %} {% if language == "zh" %} -请在下方对话全文基础上,按该场景进行一次性抽取并判定相关性: +你是一个对话内容分析助手。请对下方对话全文进行一次性分析,完成两项任务: +1. 判断对话是否与指定场景相关; +2. 从对话中抽取所有需要保留的重要信息片段。 + 场景说明:{{ instruction }} {% if not is_builtin_scene and custom_types_str %} 重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。 {% endif %} +--- +【必须保留的内容(不可删除)】 +以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段: +- 时间信息:日期、时间点、时间段、有效期 → times 字段 +- 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段 +- 金额信息:价格、费用、金额(含货币符号或单位) → amounts 字段 +- 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段 +- 地址信息:地点、地址、位置 → addresses 字段 +- 场景关键词:与场景强相关的专业术语、事件名称 → keywords 字段 +- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段 +- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段 +- **个人观点与态度**:对某事物的明确看法、评价、立场 → preserve_keywords 字段 + +【可以删除的内容】 +以下类型的内容属于低价值信息,可以在剪枝时删除: +- 纯寒暄问候:如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语 +- 纯表情/符号:如"[微笑]"、"😊"、"哈哈"等 +- 重复确认:如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复 +- 无意义填充:如"啊"、"呢"、"嘛"等语气词单独成句 + +**注意:即使消息很短,只要包含情绪、兴趣、爱好、个人观点等有价值信息,就必须保留,不得删除。** +例如: +- "我好开心呀" → 包含情绪(开心),必须保留,preserve_keywords 中加入"开心" +- "好喜欢打羽毛球呀" → 包含兴趣爱好(喜欢打羽毛球),必须保留,preserve_keywords 中加入"喜欢打羽毛球" +- "我好难过" → 包含情绪(难过),必须保留,preserve_keywords 中加入"难过" +- "太好啦!看到你开心,我也跟着心情亮起来" → 
包含情绪,必须保留,preserve_keywords 中加入"开心" + +--- 对话全文: """ {{ dialog_text }} @@ -80,15 +109,46 @@ "amounts": [...], "contacts": [...], "addresses": [...], - "keywords": [...] + "keywords": [...], + "preserve_keywords": [...] } {% else %} -Based on the full dialogue below, perform one-time extraction and relevance determination according to this scenario: +You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks: +1. Determine whether the dialogue is relevant to the specified scene; +2. Extract all important information fragments that must be preserved. + Scenario Description: {{ instruction }} {% if not is_builtin_scene and custom_types_str %} Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). {% endif %} +--- +[MUST PRESERVE (cannot be deleted)] +The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields: +- Time information: dates, time points, durations, expiry dates → times field +- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field +- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field +- Contact information: phone numbers, emails, WeChat, QQ → contacts field +- Address information: locations, addresses, places → addresses field +- Scene keywords: professional terms and event names strongly related to the scene → keywords field +- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, feeling upset, feeling wronged, excitement, fear, worry, stress, being moved, etc. 
→ preserve_keywords field +- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field +- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field + +[CAN BE DELETED] +The following types of content are low-value and can be removed during pruning: +- Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content +- Pure emojis/symbols: e.g., "[smile]", "😊", "haha" +- Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information +- Meaningless fillers: standalone interjections like "ah", "well", "hmm" + +**Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.** +Examples: +- "I'm so happy!" → contains emotion (happy), must preserve; add "happy" to preserve_keywords +- "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords +- "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords + +--- Full Dialogue: """ {{ dialog_text }} @@ -102,6 +162,7 @@ Output strict JSON only (fixed keys, order doesn't matter): "amounts": [...], "contacts": [...], "addresses": [...], - "keywords": [...] + "keywords": [...], + "preserve_keywords": [...] } {% endif %}