diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 567c0347..28f7d8e0 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -88,26 +88,17 @@ class SemanticPruner: self._detailed_prune_logging = True # 是否启用详细日志 self._max_debug_msgs_per_dialog = 20 # 每个对话最多记录前N条消息的详细日志 - # 加载场景特定配置(内置场景走专门规则,自定义场景 fallback 到通用规则) - self.scene_config: ScenePatterns = SceneConfigRegistry.get_config( - self.config.pruning_scene, - fallback_to_generic=True - ) + # 加载统一填充词库 + self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene) - # 判断是否为内置专门场景 - self._is_builtin_scene = SceneConfigRegistry.is_scene_supported(self.config.pruning_scene) - - # 自定义场景的本体类型列表(用于注入提示词) + # 本体类型列表(用于注入提示词,所有场景均支持) self._ontology_classes = getattr(self.config, "ontology_classes", None) or [] - if self._is_builtin_scene: - self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 使用内置专门配置") + self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}") + if self._ontology_classes: + self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}") else: - self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 为自定义场景,使用通用规则 + 本体类型提示词注入") - if self._ontology_classes: - self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}") - else: - self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词") + self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词") # Load Jinja2 template self.template = prompt_env.get_template("extracat_Pruning.jinja2") @@ -119,88 +110,9 @@ class SemanticPruner: # 运行日志:收集关键终端输出,便于写入 JSON self.run_logs: List[str] = [] - def _is_important_message(self, message: ConversationMessage) -> bool: - """基于启发式规则识别重要信息消息,优先保留。 - - 改进版:使用场景特定的模式进行识别 - - 根据 pruning_scene 动态加载对应的识别规则 - - 支持教育、在线服务、外呼三个场景的特定模式 - """ - text = message.msg.strip() - if not text: - return False - - # 使用场景特定的模式 - all_patterns = ( - self.scene_config.high_priority_patterns + - self.scene_config.medium_priority_patterns + - self.scene_config.low_priority_patterns - ) - - for pattern, _ in all_patterns: - if re.search(pattern, text, flags=re.IGNORECASE): - return True - - # 检查是否为问句(以问号结尾或包含疑问词) - if text.endswith("?") or text.endswith("?"): - return True - - # 检查是否包含问句关键词 - if any(keyword in text for keyword in self.scene_config.question_keywords): - return True - - # 检查是否包含决策性关键词 - if any(keyword in text for keyword in self.scene_config.decision_keywords): - return True - - return False - - def _importance_score(self, message: ConversationMessage) -> int: - """为重要消息打分,用于在保留比例内优先保留更关键的内容。 - - 改进版:使用场景特定的权重体系(0-10分) - - 根据场景动态调整不同信息类型的权重 - - 高优先级模式:4-6分 - - 中优先级模式:2-3分 - - 低优先级模式:1分 - """ - text = message.msg.strip() - score = 0 - - # 使用场景特定的权重 - for pattern, weight in self.scene_config.high_priority_patterns: - if re.search(pattern, text, flags=re.IGNORECASE): - score += weight - - for pattern, weight in self.scene_config.medium_priority_patterns: - if re.search(pattern, text, flags=re.IGNORECASE): - score += weight - - for pattern, weight in self.scene_config.low_priority_patterns: - if re.search(pattern, text, flags=re.IGNORECASE): - score += weight - - # 问句加分 - if text.endswith("?") or text.endswith("?"): - score += 2 - - # 包含问句关键词加分 - if any(keyword in text for keyword in self.scene_config.question_keywords): - score += 1 - - # 包含决策性关键词加分 - if any(keyword in text for keyword in self.scene_config.decision_keywords): - score += 2 - - # 长度加分(较长的消息通常包含更多信息) - if len(text) > 50: - score += 1 - if len(text) > 100: - score += 1 - - return min(score, 10) # 最高10分 - - # 情绪/兴趣/爱好安全防线正则已移除,改由 extracat_Pruning.jinja2 提示词中的 preserve_keywords 机制处理 + # _is_important_message 和 _importance_score 已移除: + # 重要性判断完全由 extracat_Pruning.jinja2 提示词 + LLM 的 preserve_tokens 机制承担。 + # LLM 根据注入的本体工程类型语义识别需要保护的内容,无需硬编码正则规则。 def _is_filler_message(self, message: ConversationMessage) -> bool: """检测典型寒暄/口头禅/确认类短消息。 @@ -419,14 +331,12 @@ class SemanticPruner: rendered = self.template.render( pruning_scene=self.config.pruning_scene, - is_builtin_scene=self._is_builtin_scene, ontology_classes=self._ontology_classes, dialog_text=dialog_text, language=self.language ) log_template_rendering("extracat_Pruning.jinja2", { "pruning_scene": self.config.pruning_scene, - "is_builtin_scene": self._is_builtin_scene, "ontology_classes_count": len(self._ontology_classes), "language": self.language }) @@ -491,62 +401,56 @@ class SemanticPruner: # 相关对话不剪枝 return dialog - # 在不相关对话中,识别重要/不重要消息 - tokens = extraction.times + extraction.ids + extraction.amounts + extraction.contacts + extraction.addresses + extraction.keywords + # 在不相关对话中,LLM 已通过 preserve_tokens 标记需要保护的内容 + preserve_tokens = ( + extraction.times + extraction.ids + extraction.amounts + + extraction.contacts + extraction.addresses + extraction.keywords + + extraction.preserve_keywords + ) msgs = dialog.context.msgs - imp_unrel_msgs: List[ConversationMessage] = [] - unimp_unrel_msgs: List[ConversationMessage] = [] + + # 分类:填充 / 其他可删(LLM保护消息通过不加入任何桶来隐式保护) + filler_ids: set = set() + deletable: List[ConversationMessage] = [] + for m in msgs: - if self._msg_matches_tokens(m, tokens) or self._is_important_message(m): - imp_unrel_msgs.append(m) + if self._msg_matches_tokens(m, preserve_tokens): + pass # 保护消息:不加入任何桶,不会被删除 + elif self._is_filler_message(m): + filler_ids.add(id(m)) else: - unimp_unrel_msgs.append(m) - # 计算总删除目标数量 + deletable.append(m) + + # 计算删除目标 total_unrel = len(msgs) delete_target = int(total_unrel * proportion) if proportion > 0 and total_unrel > 0 and delete_target == 0: delete_target = 1 - imp_del_cap = min(int(len(imp_unrel_msgs) * proportion), len(imp_unrel_msgs)) - unimp_del_cap = len(unimp_unrel_msgs) - max_capacity = max(0, len(msgs) - 1) - max_deletable = min(imp_del_cap + unimp_del_cap, max_capacity) + max_deletable = min(len(filler_ids) + len(deletable), max(0, total_unrel - 1)) delete_target = min(delete_target, max_deletable) - # 删除配额分配 - del_unimp = min(delete_target, unimp_del_cap) - rem = delete_target - del_unimp - del_imp = min(rem, imp_del_cap) - # 选取删除集合 - unimp_delete_ids = [] - imp_delete_ids = [] - if del_unimp > 0: - # 按出现顺序选取前 del_unimp 条不重要消息进行删除(确定性、可复现) - unimp_delete_ids = [id(m) for m in unimp_unrel_msgs[:del_unimp]] - if del_imp > 0: - imp_sorted = sorted(imp_unrel_msgs, key=lambda m: self._importance_score(m)) - imp_delete_ids = [id(m) for m in imp_sorted[:del_imp]] - - # 统计实际删除数量(重要/不重要) - actual_unimp_deleted = 0 - actual_imp_deleted = 0 - kept_msgs = [] - delete_targets = set(unimp_delete_ids) | set(imp_delete_ids) + # 优先删填充,再删其他可删消息(按出现顺序) + to_delete_ids: set = set() for m in msgs: - mid = id(m) - if mid in delete_targets: - if mid in set(unimp_delete_ids) and actual_unimp_deleted < del_unimp: - actual_unimp_deleted += 1 - continue - if mid in set(imp_delete_ids) and actual_imp_deleted < del_imp: - actual_imp_deleted += 1 - continue - kept_msgs.append(m) + if len(to_delete_ids) >= delete_target: + break + if id(m) in filler_ids: + to_delete_ids.add(id(m)) + for m in deletable: + if len(to_delete_ids) >= delete_target: + break + to_delete_ids.add(id(m)) + + kept_msgs = [m for m in msgs if id(m) not in to_delete_ids] if not kept_msgs and msgs: kept_msgs = [msgs[0]] - deleted_total = actual_unimp_deleted + actual_imp_deleted + deleted_total = len(msgs) - len(kept_msgs) + protected_count = len(msgs) - len(filler_ids) - len(deletable) self._log( - f"[剪枝-对话] 对话ID={dialog.id} 总消息={len(msgs)} 删除目标={delete_target} 实删={deleted_total} 保留={len(kept_msgs)}" + f"[剪枝-对话] 对话ID={dialog.id} 总消息={len(msgs)} " + f"(保护={protected_count} 填充={len(filler_ids)} 可删={len(deletable)}) " + f"删除目标={delete_target} 实删={deleted_total} 保留={len(kept_msgs)}" ) dialog.context = ConversationContext(msgs=kept_msgs) @@ -616,38 +520,29 @@ class SemanticPruner: if extraction.preserve_keywords: self._log(f" 对话[{d_idx}] LLM抽取到情绪/兴趣保护词: {extraction.preserve_keywords}") - # 消息级分类:每条消息独立判断 - llm_protected_msgs = [] # LLM 保护消息(情绪/兴趣/重要token):绝对不可删除 - rule_important_msgs = [] # 规则层重要消息(场景规则):配额不足时可少量删除 - unimportant_msgs = [] # 不重要消息(可删除) + # 消息级分类:LLM保护 / 填充 / 其他可删 + llm_protected_msgs = [] # LLM 保护消息(preserve_tokens 命中):绝对不可删除 filler_msgs = [] # 填充消息(优先删除) + deletable_msgs = [] # 其余消息(按比例删除) for idx, m in enumerate(msgs): msg_text = m.msg.strip() - # LLM 保护:消息包含 preserve_keywords(情绪/兴趣词)或其他重要 token → 绝对不可删除 if self._msg_matches_tokens(m, preserve_tokens): llm_protected_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: - self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(LLM保护,不可删)") - # 填充消息(寒暄、表情等) + self._log(f" [{idx}] '{msg_text[:30]}...' → 保护(LLM,不可删)") elif self._is_filler_message(m): filler_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: self._log(f" [{idx}] '{msg_text[:30]}...' → 填充") - # 规则层重要信息(学号、成绩、时间、金额等) - elif self._is_important_message(m): - rule_important_msgs.append((idx, m)) - if should_log_details or idx < self._max_debug_msgs_per_dialog: - self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)") - # 其他消息 else: - unimportant_msgs.append((idx, m)) + deletable_msgs.append((idx, m)) if should_log_details or idx < self._max_debug_msgs_per_dialog: - self._log(f" [{idx}] '{msg_text[:30]}...' → 不重要") + self._log(f" [{idx}] '{msg_text[:30]}...' → 可删") - # important_msgs 仅用于日志统计(兼容下方日志输出) - important_msgs = llm_protected_msgs + rule_important_msgs + # important_msgs 仅用于日志统计 + important_msgs = llm_protected_msgs # 计算删除配额 delete_target = int(original_count * proportion) @@ -658,37 +553,23 @@ class SemanticPruner: max_deletable = max(0, original_count - 1) delete_target = min(delete_target, max_deletable) - # 删除策略:优先删除填充消息,再删除不重要消息 + # 删除策略:优先删填充消息,再按出现顺序删其余可删消息 to_delete_indices = set() - deleted_details = [] # 记录删除的消息详情 - + deleted_details = [] + # 第一步:删除填充消息 - filler_to_delete = min(len(filler_msgs), delete_target) - for i in range(filler_to_delete): - idx, msg = filler_msgs[i] + for idx, msg in filler_msgs: + if len(to_delete_indices) >= delete_target: + break to_delete_indices.add(idx) deleted_details.append(f"[{idx}] 填充: '{msg.msg[:50]}'") - - # 第二步:如果还需要删除,删除不重要消息 - remaining_quota = delete_target - len(to_delete_indices) - if remaining_quota > 0: - unimp_to_delete = min(len(unimportant_msgs), remaining_quota) - for i in range(unimp_to_delete): - idx, msg = unimportant_msgs[i] - to_delete_indices.add(idx) - deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'") - - # 第三步:如果还需要删除,按重要性分数删除规则层重要消息(LLM保护消息绝对不删) - remaining_quota = delete_target - len(to_delete_indices) - if remaining_quota > 0 and rule_important_msgs: - # 按重要性分数排序(分数低的优先删除) - imp_sorted = sorted(rule_important_msgs, key=lambda x: self._importance_score(x[1])) - imp_to_delete = min(len(imp_sorted), remaining_quota) - for i in range(imp_to_delete): - idx, msg = imp_sorted[i] - to_delete_indices.add(idx) - score = self._importance_score(msg) - deleted_details.append(f"[{idx}] 规则重要(分数{score}): '{msg.msg[:50]}'") + + # 第二步:如果还需要删除,按出现顺序删可删消息 + for idx, msg in deletable_msgs: + if len(to_delete_indices) >= delete_target: + break + to_delete_indices.add(idx) + deleted_details.append(f"[{idx}] 可删: '{msg.msg[:50]}'") # 执行删除 kept_msgs = [] @@ -716,7 +597,7 @@ class SemanticPruner: self._log( f"[剪枝-对话] 对话 {d_idx+1} 总消息={original_count} " - f"(重要={len(important_msgs)} 不重要={len(unimportant_msgs)} 填充={len(filler_msgs)}) " + f"(保护={len(important_msgs)} 填充={len(filler_msgs)} 可删={len(deletable_msgs)}) " f"删除={deleted_count} 保留={len(kept_msgs)}" ) diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py index a79ebea5..8e97163e 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py @@ -1,84 +1,25 @@ """ -场景特定配置 - 为不同场景提供定制化的剪枝规则 +场景特定配置 - 统一填充词库 -功能: -- 场景特定的重要信息识别模式 -- 场景特定的重要性评分权重 -- 场景特定的填充词库 -- 场景特定的问答对识别规则 +重要性判断已完全交由 extracat_Pruning.jinja2 提示词 + LLM preserve_tokens 机制承担。 +本模块仅保留统一填充词库(filler_phrases),用于识别无意义寒暄/表情/口头禅。 +所有场景共用同一份词库,场景差异由 LLM 语义判断处理。 """ -from typing import Dict, List, Set, Tuple +from typing import List, Set from dataclasses import dataclass, field @dataclass class ScenePatterns: - """场景特定的识别模式""" - - # 重要信息的正则模式(优先级从高到低) - high_priority_patterns: List[Tuple[str, int]] = field(default_factory=list) # (pattern, weight) - medium_priority_patterns: List[Tuple[str, int]] = field(default_factory=list) - low_priority_patterns: List[Tuple[str, int]] = field(default_factory=list) - - # 填充词库(无意义对话) + """场景特定的识别模式(仅保留填充词库)""" filler_phrases: Set[str] = field(default_factory=set) - - # 问句关键词(用于识别问答对) - question_keywords: Set[str] = field(default_factory=set) - - # 决策性/承诺性关键词 - decision_keywords: Set[str] = field(default_factory=set) class SceneConfigRegistry: - """场景配置注册表 - 管理所有场景的特定配置""" - - # 基础通用模式(所有场景共享) - BASE_HIGH_PRIORITY = [ - (r"订单号|工单|申请号|编号|ID|账号|账户", 5), - (r"金额|费用|价格|¥|¥|\d+元", 5), - (r"\d{11}", 4), # 手机号 - (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", 4), # 邮箱 - ] - - BASE_MEDIUM_PRIORITY = [ - (r"\d{4}-\d{1,2}-\d{1,2}", 3), # 日期 - (r"\d{4}年\d{1,2}月\d{1,2}日", 3), - (r"电话|手机号|微信|QQ|联系方式", 3), - (r"地址|地点|位置", 2), - (r"时间|日期|有效期|截止", 2), - (r"今天|明天|后天|昨天|前天", 3), # 相对时间(提高权重) - (r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3), - (r"今年|去年|明年", 3), - # ---- 情绪内容(所有场景通用,用于情绪提取) ---- - (r"开心|高兴|快乐|兴奋|愉快|幸福|满足|喜悦|欣喜", 4), - (r"难过|悲伤|伤心|痛苦|委屈|失落|沮丧|郁闷|忧郁|绝望", 4), - (r"生气|愤怒|烦躁|焦虑|紧张|害怕|恐惧|担心|担忧|压力", 4), - (r"感动|温暖|感激|感谢|惊喜|期待|憧憬|向往", 3), - (r"无聊|无奈|尴尬|后悔|遗憾|羞愧|惭愧", 3), - (r"好[开高快]心|很[开高快]心|超[开高快]心|非常[开高快]心", 4), - (r"好难过|好伤心|好悲伤|好委屈|好痛苦", 4), - (r"好开心|好高兴|好快乐|好幸福|好感动", 4), - # ---- 兴趣/爱好内容(所有场景通用,用于兴趣提取) ---- - (r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|痴迷", 4), - (r"不喜欢|讨厌|厌恶|反感|排斥", 3), - (r"羽毛球|篮球|足球|排球|乒乓球|网球|棒球|高尔夫", 4), - (r"游泳|跑步|健身|瑜伽|舞蹈|武术|骑行|登山|徒步", 4), - (r"音乐|唱歌|吉他|钢琴|绘画|摄影|书法|手工|烹饪", 4), - (r"游戏|电影|动漫|小说|阅读|旅游|美食|宠物", 3), - ] - - BASE_LOW_PRIORITY = [ - (r"\d{1,2}:\d{2}", 2), # 时间点 HH:MM - (r"\d{1,2}点\d{0,2}分?", 2), # 时间点 X点Y分 或 X点 - (r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2), # 时段(提高权重并扩充) - (r"AM|PM|am|pm", 1), - # ---- 情绪程度副词(辅助情绪识别) ---- - (r"特别|非常|超级|极其|十分|很|好[开高快]|太.*了", 1), - ] - - BASE_FILLERS = { + """场景配置注册表 - 所有场景共用统一填充词库""" + + BASE_FILLERS: Set[str] = { # 基础寒暄 "你好", "您好", "在吗", "在的", "在呢", "嗯", "嗯嗯", "哦", "哦哦", "好的", "好", "行", "可以", "不可以", "谢谢", "多谢", "感谢", @@ -87,7 +28,26 @@ class SceneConfigRegistry: "哈哈", "呵呵", "哈哈哈", "嘿嘿", "嘻嘻", "hiahia", "额", "呃", "啊", "诶", "唉", "哎", "嗯哼", # 确认词 - "是的", "对", "对的", "没错", "嗯嗯", "好嘞", "收到", "明白", "了解", "知道了", + "是的", "对", "对的", "没错", "好嘞", "收到", "明白", "了解", "知道了", + # 服务类套话 + "请问", "请稍等", "稍等", "马上", "立即", + "正在查询", "正在处理", "正在为您", "帮您查一下", + "还有其他问题吗", "还需要什么帮助", "很高兴为您服务", + "感谢您的耐心等待", "抱歉让您久等了", + "已记录", "已反馈", "已转接", "已升级", + "祝您生活愉快", "欢迎下次咨询", + # 外呼套话 + "喂", "hello", "打扰了", "不好意思", + "方便接电话吗", "现在方便吗", "占用您一点时间", + "我是", "我们是", "我们公司", "我们这边", + "了解一下", "介绍一下", "简单说一下", + "考虑考虑", "想一想", "再说", "再看看", + "不需要", "不感兴趣", "没兴趣", "不用了", + "没问题", "那就这样", "再联系", "回头聊", "有需要再说", + # 教育场景套话 + "老师好", "同学们好", "上课", "下课", "起立", "坐下", + "举手", "请坐", "很好", "不错", "继续", + "下一个", "下一题", "下一位", "还有吗", "还有问题吗", # 标点和符号 "。。。", "...", "???", "???", "!!!", "!!!", # 表情符号 @@ -99,246 +59,8 @@ class SceneConfigRegistry: "hhh", "hhhh", "2333", "666", "gg", "ok", "OK", "okok", "emmm", "emm", "em", "mmp", "wtf", "omg", } - - BASE_QUESTION_KEYWORDS = { - "什么", "为什么", "怎么", "如何", "哪里", "哪个", "谁", "多少", "几点", "何时", "吗" - } - - BASE_DECISION_KEYWORDS = { - "必须", "一定", "务必", "需要", "要求", "规定", "应该", - "承诺", "保证", "确保", "负责", "同意", "答应" - } - + @classmethod - def get_education_config(cls) -> ScenePatterns: - """教育场景配置""" - return ScenePatterns( - high_priority_patterns=cls.BASE_HIGH_PRIORITY + [ - # 成绩相关(最高优先级) - (r"成绩|分数|得分|满分|及格|不及格", 6), - (r"GPA|绩点|学分|平均分", 6), - (r"\d+分|\d+\.?\d*分", 5), # 具体分数 - (r"排名|名次|第.{1,3}名", 5), # 支持"第三名"、"第1名"等 - - # 学籍信息 - (r"学号|学生证|教师工号|工号", 5), - (r"班级|年级|专业|院系", 4), - - # 课程相关 - (r"课程|科目|学科|必修|选修", 4), - (r"教材|课本|教科书|参考书", 4), - (r"章节|第.{1,3}章|第.{1,3}节", 3), # 支持"第三章"、"第1章"等 - - # 学科内容(新增) - (r"微积分|导数|积分|函数|极限|微分", 4), - (r"代数|几何|三角|概率|统计", 4), - (r"物理|化学|生物|历史|地理", 4), - (r"英语|语文|数学|政治|哲学", 4), - (r"定义|定理|公式|概念|原理|法则", 3), - (r"例题|解题|证明|推导|计算", 3), - ], - medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [ - # 教学活动 - (r"作业|练习|习题|题目", 3), - (r"考试|测验|测试|考核|期中|期末", 3), - (r"上课|下课|课堂|讲课", 2), - (r"提问|回答|发言|讨论", 2), - (r"问一下|请教|咨询|询问", 2), # 新增:问询相关 - (r"理解|明白|懂|掌握|学会", 2), # 新增:学习状态 - - # 时间安排 - (r"课表|课程表|时间表", 3), - (r"第.{1,3}节课|第.{1,3}周", 2), # 支持"第三节课"、"第1周"等 - ], - low_priority_patterns=cls.BASE_LOW_PRIORITY + [ - (r"老师|教师|同学|学生", 1), - (r"教室|实验室|图书馆", 1), - ], - filler_phrases=cls.BASE_FILLERS | { - # 教育场景特有填充词(移除了"明白了"、"懂了"、"不懂"等,这些在教育场景中有意义) - "老师好", "同学们好", "上课", "下课", "起立", "坐下", - "举手", "请坐", "很好", "不错", "继续", - "下一个", "下一题", "下一位", "还有吗", "还有问题吗", - }, - question_keywords=cls.BASE_QUESTION_KEYWORDS | { - "为啥", "咋", "咋办", "怎样", "如何做", - "能不能", "可不可以", "行不行", "对不对", "是不是", - }, - decision_keywords=cls.BASE_DECISION_KEYWORDS | { - "必考", "重点", "考点", "难点", "关键", - "记住", "背诵", "掌握", "理解", "复习", - } - ) - - @classmethod - def get_online_service_config(cls) -> ScenePatterns: - """在线服务场景配置""" - return ScenePatterns( - high_priority_patterns=cls.BASE_HIGH_PRIORITY + [ - # 工单相关(最高优先级) - (r"工单号|工单编号|ticket|TK\d+", 6), - (r"工单状态|处理中|已解决|已关闭|待处理", 5), - (r"优先级|紧急|高优先级|P0|P1|P2", 5), - - # 产品信息 - (r"产品型号|型号|SKU|产品编号", 5), - (r"序列号|SN|设备号", 5), - (r"版本号|软件版本|固件版本", 4), - - # 问题描述 - (r"故障|错误|异常|bug|问题", 4), - (r"错误代码|故障代码|error code", 5), - (r"无法|不能|失败|报错", 3), - ], - medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [ - # 服务相关 - (r"退款|退货|换货|补发", 4), - (r"发票|收据|凭证", 3), - (r"物流|快递|运单号", 3), - (r"保修|质保|售后", 3), - - # 时效相关 - (r"SLA|响应时间|处理时长", 4), - (r"超时|延迟|等待", 2), - ], - low_priority_patterns=cls.BASE_LOW_PRIORITY + [ - (r"客服|工程师|技术支持", 1), - (r"用户|客户|会员", 1), - ], - filler_phrases=cls.BASE_FILLERS | { - # 在线服务特有填充词 - "您好", "请问", "请稍等", "稍等", "马上", "立即", - "正在查询", "正在处理", "正在为您", "帮您查一下", - "还有其他问题吗", "还需要什么帮助", "很高兴为您服务", - "感谢您的耐心等待", "抱歉让您久等了", - "已记录", "已反馈", "已转接", "已升级", - "祝您生活愉快", "再见", "欢迎下次咨询", - }, - question_keywords=cls.BASE_QUESTION_KEYWORDS | { - "能否", "可否", "是否", "有没有", "能不能", - "怎么办", "如何处理", "怎么解决", - }, - decision_keywords=cls.BASE_DECISION_KEYWORDS | { - "立即处理", "马上解决", "尽快", "优先", - "升级", "转接", "派单", "跟进", - "补偿", "赔偿", "退款", "换货", - } - ) - - @classmethod - def get_outbound_config(cls) -> ScenePatterns: - """外呼场景配置""" - return ScenePatterns( - high_priority_patterns=cls.BASE_HIGH_PRIORITY + [ - # 意向相关(最高优先级) - (r"意向|意愿|兴趣|感兴趣", 6), - (r"A类|B类|C类|D类|高意向|低意向", 6), - (r"成交|签约|下单|购买|确认", 6), - - # 联系信息(外呼场景中更重要) - (r"预约|约定|安排|确定时间", 5), - (r"下次联系|回访|跟进", 5), - (r"方便|有空|可以|时间", 4), - - # 通话状态 - (r"接通|未接通|占线|关机|停机", 4), - (r"通话时长|通话时间", 3), - ], - medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [ - # 客户信息 - (r"姓名|称呼|先生|女士", 3), - (r"公司|单位|职位|职务", 3), - (r"需求|要求|期望", 3), - - # 跟进状态 - (r"跟进状态|进展|进度", 3), - (r"已联系|待联系|联系中", 2), - (r"拒绝|不感兴趣|考虑|再说", 3), - ], - low_priority_patterns=cls.BASE_LOW_PRIORITY + [ - (r"销售|客户经理|业务员", 1), - (r"产品|服务|方案", 1), - ], - filler_phrases=cls.BASE_FILLERS | { - # 外呼场景特有填充词 - "您好", "喂", "hello", "打扰了", "不好意思", - "方便接电话吗", "现在方便吗", "占用您一点时间", - "我是", "我们是", "我们公司", "我们这边", - "了解一下", "介绍一下", "简单说一下", - "考虑考虑", "想一想", "再说", "再看看", - "不需要", "不感兴趣", "没兴趣", "不用了", - "好的", "行", "可以", "没问题", "那就这样", - "再联系", "回头聊", "有需要再说", - }, - question_keywords=cls.BASE_QUESTION_KEYWORDS | { - "有没有", "需不需要", "要不要", "考虑不考虑", - "了解吗", "知道吗", "听说过吗", - "方便吗", "有空吗", "在吗", - }, - decision_keywords=cls.BASE_DECISION_KEYWORDS | { - "确定", "决定", "选择", "购买", "下单", - "预约", "安排", "约定", "确认", - "跟进", "回访", "联系", "沟通", - } - ) - - @classmethod - def get_config(cls, scene: str, fallback_to_generic: bool = True) -> ScenePatterns: - """根据场景名称获取配置 - - Args: - scene: 场景名称 ('education', 'online_service', 'outbound' 或其他) - fallback_to_generic: 如果场景不存在,是否降级到通用配置 - - Returns: - 对应场景的配置,如果场景不存在: - - fallback_to_generic=True: 返回通用配置(仅基础规则) - - fallback_to_generic=False: 抛出异常 - """ - scene_map = { - 'education': cls.get_education_config, - 'online_service': cls.get_online_service_config, - 'outbound': cls.get_outbound_config, - } - - if scene in scene_map: - return scene_map[scene]() - - if fallback_to_generic: - # 返回通用配置(仅包含基础规则,不包含场景特定规则) - return cls.get_generic_config() - else: - raise ValueError(f"不支持的场景: {scene},支持的场景: {list(scene_map.keys())}") - - @classmethod - def get_generic_config(cls) -> ScenePatterns: - """通用场景配置 - 仅包含基础规则,适用于未定义的场景 - - 这是一个保守的配置,只使用最通用的规则,避免误删重要信息 - """ - return ScenePatterns( - high_priority_patterns=cls.BASE_HIGH_PRIORITY, - medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY, - low_priority_patterns=cls.BASE_LOW_PRIORITY, - filler_phrases=cls.BASE_FILLERS, - question_keywords=cls.BASE_QUESTION_KEYWORDS, - decision_keywords=cls.BASE_DECISION_KEYWORDS - ) - - @classmethod - def get_all_scenes(cls) -> List[str]: - """获取所有预定义场景的列表""" - return ['education', 'online_service', 'outbound'] - - @classmethod - def is_scene_supported(cls, scene: str) -> bool: - """检查场景是否有专门的配置支持 - - Args: - scene: 场景名称 - - Returns: - True: 有专门配置 - False: 将使用通用配置 - """ - return scene in cls.get_all_scenes() + def get_config(cls, scene: str = "") -> ScenePatterns: + """所有场景统一返回同一份填充词库""" + return ScenePatterns(filler_phrases=cls.BASE_FILLERS) diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 index 47b3badb..e204b7f9 100644 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 @@ -1,6 +1,6 @@ {# 对话级抽取与相关性判定模板(用于剪枝加速) - 输入:pruning_scene, is_builtin_scene, ontology_classes, dialog_text, language + 输入:pruning_scene, ontology_classes, dialog_text, language 输出:严格 JSON(不要包含任何多余文本),字段: - is_related: bool,是否与所选场景相关 - times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等) @@ -17,44 +17,22 @@ - 仅输出上述键;避免多余解释或字段。 #} -{# ── 内置场景的固定说明 ── #} -{% set builtin_scene_instructions = { - 'education': { - 'zh': '教育场景:教学、课程、考试、作业、老师/学生互动、学习资源、学校管理等。', - 'en': 'Education Scenario: Teaching, courses, exams, homework, teacher/student interaction, learning resources, school management, etc.' - }, - 'online_service': { - 'zh': '在线客服场景:客户咨询、问题排查、服务工单、售后支持、订单/退款、工单升级等。', - 'en': 'Online Service Scenario: Customer inquiries, troubleshooting, service tickets, after-sales support, orders/refunds, ticket escalation, etc.' - }, - 'outbound': { - 'zh': '外呼场景:电话外呼、邀约、调研问卷、线索跟进、对话脚本、回访记录等。', - 'en': 'Outbound Scenario: Outbound calls, invitations, survey questionnaires, lead follow-up, call scripts, follow-up records, etc.' - } -} %} - -{# ── 确定最终使用的场景说明 ── #} -{% if is_builtin_scene %} - {% set scene_key = pruning_scene %} - {% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %} - {% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %} - {% set custom_types_str = '' %} -{% else %} - {% if ontology_classes and ontology_classes | length > 0 %} - {% if language == 'en' %} - {% set custom_types_str = ontology_classes | join(', ') %} - {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %} - {% else %} - {% set custom_types_str = ontology_classes | join('、') %} - {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} - {% endif %} +{# ── 确定场景说明 ── #} +{% if ontology_classes and ontology_classes | length > 0 %} + {% if language == 'en' %} + {% set custom_types_str = ontology_classes | join(', ') %} + {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %} {% else %} - {% if language == 'en' %} - {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} - {% else %} - {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} - {% endif %} + {% set custom_types_str = ontology_classes | join('、') %} + {% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} + {% endif %} +{% else %} + {% if language == 'en' %} {% set custom_types_str = '' %} + {% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} + {% else %} + {% set custom_types_str = '' %} + {% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} {% endif %} {% endif %} @@ -64,7 +42,7 @@ 2. 从对话中抽取所有需要保留的重要信息片段。 场景说明:{{ instruction }} -{% if not is_builtin_scene and custom_types_str %} +{% if custom_types_str %} 重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。 {% endif %} @@ -118,7 +96,7 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue 2. Extract all important information fragments that must be preserved. Scenario Description: {{ instruction }} -{% if not is_builtin_scene and custom_types_str %} +{% if custom_types_str %} Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). {% endif %} diff --git a/api/app/services/memory_config_service.py b/api/app/services/memory_config_service.py index 00757f8c..4d67673f 100644 --- a/api/app/services/memory_config_service.py +++ b/api/app/services/memory_config_service.py @@ -107,29 +107,19 @@ def _validate_config_id(config_id, db: Session = None): ) -# 专门场景的内置 key 集合,直接从 SceneConfigRegistry 派生,避免重复维护 -# 使用懒加载函数避免模块级循环导入 -def _get_builtin_pruning_scenes() -> set: - from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import SceneConfigRegistry - return set(SceneConfigRegistry.get_all_scenes()) - - def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]: - """当 pruning_scene 不是内置场景时,从 ontology_class 表加载类型名称列表。 + """从 ontology_class 表加载场景类型名称列表,用于注入提示词。 Args: db: 数据库会话 scene_id: 本体场景 UUID - pruning_scene: 语义剪枝场景名称 + pruning_scene: 语义剪枝场景名称(保留参数,暂未使用) Returns: - class_name 字符串列表,或 None(内置场景 / 无数据时) + class_name 字符串列表,或 None(无数据时) """ if not scene_id: return None - # 内置场景走 SceneConfigRegistry,不需要注入类型列表 - if pruning_scene in _get_builtin_pruning_scenes(): - return None try: from app.repositories.ontology_class_repository import OntologyClassRepository repo = OntologyClassRepository(db) diff --git a/api/app/services/pilot_run_service.py b/api/app/services/pilot_run_service.py index 5d00d8a5..b63bc0db 100644 --- a/api/app/services/pilot_run_service.py +++ b/api/app/services/pilot_run_service.py @@ -120,7 +120,8 @@ async def run_pilot_extraction( "pruning_switch": memory_config.pruning_enabled, "pruning_scene": memory_config.pruning_scene, "pruning_threshold": memory_config.pruning_threshold, - "llm_model_id": str(memory_config.llm_model_id), + "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, + "ontology_classes": memory_config.ontology_classes, } config = PruningConfig(**pruning_config_dict) diff --git a/redbear-mem-benchmark b/redbear-mem-benchmark index 8494e824..c3bbc693 160000 --- a/redbear-mem-benchmark +++ b/redbear-mem-benchmark @@ -1 +1 @@ -Subproject commit 8494e82498cb99c70ac67a64a544ff872432363a +Subproject commit c3bbc6931c570e6fac88c0b00658b4f08dc2ac77