Merge pull request #532 from SuanmoSuanyangTechnology/develop

Develop
This commit is contained in:
Ke Sun
2026-03-10 20:17:02 +08:00
committed by GitHub
6 changed files with 124 additions and 552 deletions

View File

@@ -88,26 +88,17 @@ class SemanticPruner:
self._detailed_prune_logging = True # 是否启用详细日志
self._max_debug_msgs_per_dialog = 20 # 每个对话最多记录前N条消息的详细日志
# 加载场景特定配置(内置场景走专门规则,自定义场景 fallback 到通用规则)
self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(
self.config.pruning_scene,
fallback_to_generic=True
)
# 加载统一填充词库
self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene)
# 判断是否为内置专门场景
self._is_builtin_scene = SceneConfigRegistry.is_scene_supported(self.config.pruning_scene)
# 自定义场景的本体类型列表(用于注入提示词)
# 本体类型列表(用于注入提示词,所有场景均支持)
self._ontology_classes = getattr(self.config, "ontology_classes", None) or []
if self._is_builtin_scene:
self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 使用内置专门配置")
self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}")
if self._ontology_classes:
self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}")
else:
self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 为自定义场景,使用通用规则 + 本体类型提示词注入")
if self._ontology_classes:
self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}")
else:
self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词")
self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词")
# Load Jinja2 template
self.template = prompt_env.get_template("extracat_Pruning.jinja2")
@@ -119,88 +110,9 @@ class SemanticPruner:
# 运行日志:收集关键终端输出,便于写入 JSON
self.run_logs: List[str] = []
def _is_important_message(self, message: ConversationMessage) -> bool:
    """Return True when a message looks important under the heuristic rules.

    A message is treated as important when any of the following holds:
    - it matches one of the scene's high/medium/low priority regex
      patterns (case-insensitive search);
    - it ends with a question mark, ASCII ``?`` or full-width ``?``;
    - it contains one of the scene's question keywords or decision keywords.

    Args:
        message: Conversation message whose ``msg`` text is inspected.

    Returns:
        False for blank text or when no rule fires, True otherwise.
    """
    text = message.msg.strip()
    if not text:
        return False

    scene = self.scene_config
    # The weights are irrelevant here: a hit in any priority tier is enough.
    all_patterns = (
        scene.high_priority_patterns
        + scene.medium_priority_patterns
        + scene.low_priority_patterns
    )
    if any(re.search(pattern, text, flags=re.IGNORECASE) for pattern, _ in all_patterns):
        return True

    # Both question-mark forms are listed explicitly; an empty suffix would
    # make endswith() true for every string and mark every message important.
    if text.endswith(("?", "?")):
        return True

    # Empty keywords are skipped because "" is a substring of every string.
    for keywords in (scene.question_keywords, scene.decision_keywords):
        if any(keyword and keyword in text for keyword in keywords):
            return True

    return False
def _importance_score(self, message: ConversationMessage) -> int:
    """Score a message's importance, capped at 10.

    The score is the sum of:
    - the weight of every scene pattern (high, medium and low priority)
      that matches the text, searched case-insensitively;
    - +2 when the text ends with a question mark (ASCII or full-width);
    - +1 when the text contains a question keyword;
    - +2 when the text contains a decision keyword;
    - +1 when the text is longer than 50 characters, and another +1 when
      it is longer than 100.

    Args:
        message: Conversation message whose ``msg`` text is scored.

    Returns:
        The accumulated score, never more than 10.
    """
    text = message.msg.strip()
    scene = self.scene_config

    weighted_patterns = (
        scene.high_priority_patterns
        + scene.medium_priority_patterns
        + scene.low_priority_patterns
    )
    score = sum(
        weight
        for pattern, weight in weighted_patterns
        if re.search(pattern, text, flags=re.IGNORECASE)
    )

    # Both question-mark forms are listed explicitly; an empty suffix would
    # make endswith() true for every string and hand out the bonus always.
    if text.endswith(("?", "?")):
        score += 2

    # Empty keywords are skipped because "" is a substring of every string.
    if any(keyword and keyword in text for keyword in scene.question_keywords):
        score += 1
    if any(keyword and keyword in text for keyword in scene.decision_keywords):
        score += 2

    # Longer messages get a small bonus at each length threshold.
    if len(text) > 50:
        score += 1
    if len(text) > 100:
        score += 1

    return min(score, 10)
# 情绪/兴趣/爱好安全防线正则已移除,改由 extracat_Pruning.jinja2 提示词中的 preserve_keywords 机制处理
# _is_important_message 和 _importance_score 已移除:
# 重要性判断完全由 extracat_Pruning.jinja2 提示词 + LLM 的 preserve_tokens 机制承担。
# LLM 根据注入的本体工程类型语义识别需要保护的内容,无需硬编码正则规则。
def _is_filler_message(self, message: ConversationMessage) -> bool:
"""检测典型寒暄/口头禅/确认类短消息。
@@ -419,14 +331,12 @@ class SemanticPruner:
rendered = self.template.render(
pruning_scene=self.config.pruning_scene,
is_builtin_scene=self._is_builtin_scene,
ontology_classes=self._ontology_classes,
dialog_text=dialog_text,
language=self.language
)
log_template_rendering("extracat_Pruning.jinja2", {
"pruning_scene": self.config.pruning_scene,
"is_builtin_scene": self._is_builtin_scene,
"ontology_classes_count": len(self._ontology_classes),
"language": self.language
})
@@ -491,62 +401,56 @@ class SemanticPruner:
# 相关对话不剪枝
return dialog
# 在不相关对话中,识别重要/不重要消息
tokens = extraction.times + extraction.ids + extraction.amounts + extraction.contacts + extraction.addresses + extraction.keywords
# 在不相关对话中,LLM 已通过 preserve_tokens 标记需要保护的内容
preserve_tokens = (
extraction.times + extraction.ids + extraction.amounts +
extraction.contacts + extraction.addresses + extraction.keywords +
extraction.preserve_keywords
)
msgs = dialog.context.msgs
imp_unrel_msgs: List[ConversationMessage] = []
unimp_unrel_msgs: List[ConversationMessage] = []
# 分类:填充 / 其他可删LLM保护消息通过不加入任何桶来隐式保护
filler_ids: set = set()
deletable: List[ConversationMessage] = []
for m in msgs:
if self._msg_matches_tokens(m, tokens) or self._is_important_message(m):
imp_unrel_msgs.append(m)
if self._msg_matches_tokens(m, preserve_tokens):
pass # 保护消息:不加入任何桶,不会被删除
elif self._is_filler_message(m):
filler_ids.add(id(m))
else:
unimp_unrel_msgs.append(m)
# 计算总删除目标数量
deletable.append(m)
# 计算删除目标
total_unrel = len(msgs)
delete_target = int(total_unrel * proportion)
if proportion > 0 and total_unrel > 0 and delete_target == 0:
delete_target = 1
imp_del_cap = min(int(len(imp_unrel_msgs) * proportion), len(imp_unrel_msgs))
unimp_del_cap = len(unimp_unrel_msgs)
max_capacity = max(0, len(msgs) - 1)
max_deletable = min(imp_del_cap + unimp_del_cap, max_capacity)
max_deletable = min(len(filler_ids) + len(deletable), max(0, total_unrel - 1))
delete_target = min(delete_target, max_deletable)
# 删除配额分配
del_unimp = min(delete_target, unimp_del_cap)
rem = delete_target - del_unimp
del_imp = min(rem, imp_del_cap)
# 选取删除集合
unimp_delete_ids = []
imp_delete_ids = []
if del_unimp > 0:
# 按出现顺序选取前 del_unimp 条不重要消息进行删除(确定性、可复现)
unimp_delete_ids = [id(m) for m in unimp_unrel_msgs[:del_unimp]]
if del_imp > 0:
imp_sorted = sorted(imp_unrel_msgs, key=lambda m: self._importance_score(m))
imp_delete_ids = [id(m) for m in imp_sorted[:del_imp]]
# 统计实际删除数量(重要/不重要)
actual_unimp_deleted = 0
actual_imp_deleted = 0
kept_msgs = []
delete_targets = set(unimp_delete_ids) | set(imp_delete_ids)
# 优先删填充,再删其他可删消息(按出现顺序)
to_delete_ids: set = set()
for m in msgs:
mid = id(m)
if mid in delete_targets:
if mid in set(unimp_delete_ids) and actual_unimp_deleted < del_unimp:
actual_unimp_deleted += 1
continue
if mid in set(imp_delete_ids) and actual_imp_deleted < del_imp:
actual_imp_deleted += 1
continue
kept_msgs.append(m)
if len(to_delete_ids) >= delete_target:
break
if id(m) in filler_ids:
to_delete_ids.add(id(m))
for m in deletable:
if len(to_delete_ids) >= delete_target:
break
to_delete_ids.add(id(m))
kept_msgs = [m for m in msgs if id(m) not in to_delete_ids]
if not kept_msgs and msgs:
kept_msgs = [msgs[0]]
deleted_total = actual_unimp_deleted + actual_imp_deleted
deleted_total = len(msgs) - len(kept_msgs)
protected_count = len(msgs) - len(filler_ids) - len(deletable)
self._log(
f"[剪枝-对话] 对话ID={dialog.id} 总消息={len(msgs)} 删除目标={delete_target} 实删={deleted_total} 保留={len(kept_msgs)}"
f"[剪枝-对话] 对话ID={dialog.id} 总消息={len(msgs)} "
f"(保护={protected_count} 填充={len(filler_ids)} 可删={len(deletable)}) "
f"删除目标={delete_target} 实删={deleted_total} 保留={len(kept_msgs)}"
)
dialog.context = ConversationContext(msgs=kept_msgs)
@@ -616,38 +520,29 @@ class SemanticPruner:
if extraction.preserve_keywords:
self._log(f" 对话[{d_idx}] LLM抽取到情绪/兴趣保护词: {extraction.preserve_keywords}")
# 消息级分类:每条消息独立判断
llm_protected_msgs = [] # LLM 保护消息(情绪/兴趣/重要token绝对不可删除
rule_important_msgs = [] # 规则层重要消息(场景规则):配额不足时可少量删除
unimportant_msgs = [] # 不重要消息(可删除)
# 消息级分类:LLM保护 / 填充 / 其他可删
llm_protected_msgs = [] # LLM 保护消息(preserve_tokens 命中):绝对不可删除
filler_msgs = [] # 填充消息(优先删除)
deletable_msgs = [] # 其余消息(按比例删除)
for idx, m in enumerate(msgs):
msg_text = m.msg.strip()
# LLM 保护:消息包含 preserve_keywords情绪/兴趣词)或其他重要 token → 绝对不可删除
if self._msg_matches_tokens(m, preserve_tokens):
llm_protected_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...'重要LLM保护,不可删)")
# 填充消息(寒暄、表情等)
self._log(f" [{idx}] '{msg_text[:30]}...'保护LLM不可删")
elif self._is_filler_message(m):
filler_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...' → 填充")
# 规则层重要信息(学号、成绩、时间、金额等)
elif self._is_important_message(m):
rule_important_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)")
# 其他消息
else:
unimportant_msgs.append((idx, m))
deletable_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...'不重要")
self._log(f" [{idx}] '{msg_text[:30]}...'可删")
# important_msgs 仅用于日志统计(兼容下方日志输出)
important_msgs = llm_protected_msgs + rule_important_msgs
# important_msgs 仅用于日志统计
important_msgs = llm_protected_msgs
# 计算删除配额
delete_target = int(original_count * proportion)
@@ -658,37 +553,23 @@ class SemanticPruner:
max_deletable = max(0, original_count - 1)
delete_target = min(delete_target, max_deletable)
# 删除策略:优先删填充消息,再删除不重要消息
# 删除策略:优先删填充消息,再按出现顺序删其余可删消息
to_delete_indices = set()
deleted_details = [] # 记录删除的消息详情
deleted_details = []
# 第一步:删除填充消息
filler_to_delete = min(len(filler_msgs), delete_target)
for i in range(filler_to_delete):
idx, msg = filler_msgs[i]
for idx, msg in filler_msgs:
if len(to_delete_indices) >= delete_target:
break
to_delete_indices.add(idx)
deleted_details.append(f"[{idx}] 填充: '{msg.msg[:50]}'")
# 第二步:如果还需要删除,删除不重要消息
remaining_quota = delete_target - len(to_delete_indices)
if remaining_quota > 0:
unimp_to_delete = min(len(unimportant_msgs), remaining_quota)
for i in range(unimp_to_delete):
idx, msg = unimportant_msgs[i]
to_delete_indices.add(idx)
deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'")
# 第三步如果还需要删除按重要性分数删除规则层重要消息LLM保护消息绝对不删
remaining_quota = delete_target - len(to_delete_indices)
if remaining_quota > 0 and rule_important_msgs:
# 按重要性分数排序(分数低的优先删除)
imp_sorted = sorted(rule_important_msgs, key=lambda x: self._importance_score(x[1]))
imp_to_delete = min(len(imp_sorted), remaining_quota)
for i in range(imp_to_delete):
idx, msg = imp_sorted[i]
to_delete_indices.add(idx)
score = self._importance_score(msg)
deleted_details.append(f"[{idx}] 规则重要(分数{score}): '{msg.msg[:50]}'")
# 第二步:如果还需要删除,按出现顺序删可删消息
for idx, msg in deletable_msgs:
if len(to_delete_indices) >= delete_target:
break
to_delete_indices.add(idx)
deleted_details.append(f"[{idx}] 可删: '{msg.msg[:50]}'")
# 执行删除
kept_msgs = []
@@ -716,7 +597,7 @@ class SemanticPruner:
self._log(
f"[剪枝-对话] 对话 {d_idx+1} 总消息={original_count} "
f"(重要={len(important_msgs)} 不重要={len(unimportant_msgs)} 填充={len(filler_msgs)}) "
f"(保护={len(important_msgs)} 填充={len(filler_msgs)} 可删={len(deletable_msgs)}) "
f"删除={deleted_count} 保留={len(kept_msgs)}"
)

View File

@@ -1,84 +1,25 @@
"""
场景特定配置 - 为不同场景提供定制化的剪枝规则
场景特定配置 - 统一填充词库
功能:
- 场景特定的重要信息识别模式
- 场景特定的重要性评分权重
- 场景特定的填充词库
- 场景特定的问答对识别规则
重要性判断已完全交由 extracat_Pruning.jinja2 提示词 + LLM preserve_tokens 机制承担。
本模块仅保留统一填充词库(filler_phrases),用于识别无意义寒暄/表情/口头禅。
所有场景共用同一份词库,场景差异由 LLM 语义判断处理。
"""
from typing import Dict, List, Set, Tuple
from typing import List, Set
from dataclasses import dataclass, field
@dataclass
class ScenePatterns:
"""Recognition patterns that configure pruning for one scene."""
# Regex patterns for important information, grouped from highest to lowest priority
high_priority_patterns: List[Tuple[str, int]] = field(default_factory=list) # (pattern, weight)
medium_priority_patterns: List[Tuple[str, int]] = field(default_factory=list)
low_priority_patterns: List[Tuple[str, int]] = field(default_factory=list)
# Filler phrases (conversation with no informational content)
"""场景特定的识别模式(仅保留填充词库)"""
filler_phrases: Set[str] = field(default_factory=set)
# Question keywords (used to recognise question/answer pairs)
question_keywords: Set[str] = field(default_factory=set)
# Decision / commitment keywords
decision_keywords: Set[str] = field(default_factory=set)
class SceneConfigRegistry:
"""场景配置注册表 - 管理所有场景的特定配置"""
# Base patterns shared by every scene, stored as (regex, weight) pairs.
BASE_HIGH_PRIORITY = [
(r"订单号|工单|申请号|编号|ID|账号|账户", 5),
(r"金额|费用|价格|¥|¥|\d+元", 5),
(r"\d{11}", 4), # mobile phone number (11 consecutive digits)
(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", 4), # e-mail address
]
BASE_MEDIUM_PRIORITY = [
    # (regex, weight) pairs shared by every scene.
    (r"\d{4}-\d{1,2}-\d{1,2}", 3),  # date written as YYYY-M-D
    # Date written as YYYY年M月D日; without the 年 separator the year and
    # month digits run together and ordinary dates never match.
    (r"\d{4}年\d{1,2}月\d{1,2}日", 3),
    (r"电话|手机号|微信|QQ|联系方式", 3),
    (r"地址|地点|位置", 2),
    (r"时间|日期|有效期|截止", 2),
    (r"今天|明天|后天|昨天|前天", 3),  # relative days (weight raised)
    (r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3),
    (r"今年|去年|明年", 3),
    # ---- Emotional content (shared by all scenes, used for emotion extraction) ----
    (r"开心|高兴|快乐|兴奋|愉快|幸福|满足|喜悦|欣喜", 4),
    (r"难过|悲伤|伤心|痛苦|委屈|失落|沮丧|郁闷|忧郁|绝望", 4),
    (r"生气|愤怒|烦躁|焦虑|紧张|害怕|恐惧|担心|担忧|压力", 4),
    (r"感动|温暖|感激|感谢|惊喜|期待|憧憬|向往", 3),
    (r"无聊|无奈|尴尬|后悔|遗憾|羞愧|惭愧", 3),
    (r"好[开高快]心|很[开高快]心|超[开高快]心|非常[开高快]心", 4),
    (r"好难过|好伤心|好悲伤|好委屈|好痛苦", 4),
    (r"好开心|好高兴|好快乐|好幸福|好感动", 4),
    # ---- Interests / hobbies (shared by all scenes, used for interest extraction) ----
    (r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|痴迷", 4),
    (r"不喜欢|讨厌|厌恶|反感|排斥", 3),
    (r"羽毛球|篮球|足球|排球|乒乓球|网球|棒球|高尔夫", 4),
    (r"游泳|跑步|健身|瑜伽|舞蹈|武术|骑行|登山|徒步", 4),
    (r"音乐|唱歌|吉他|钢琴|绘画|摄影|书法|手工|烹饪", 4),
    (r"游戏|电影|动漫|小说|阅读|旅游|美食|宠物", 3),
]
# Low-priority (regex, weight) pairs shared by every scene.
BASE_LOW_PRIORITY = [
(r"\d{1,2}:\d{2}", 2), # clock time written as HH:MM
(r"\d{1,2}点\d{0,2}分?", 2), # clock time written as "X点Y分" or just "X点"
(r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2), # part of day (weight raised, list extended)
(r"AM|PM|am|pm", 1),
# ---- Degree adverbs for emotion (supporting emotion recognition) ----
(r"特别|非常|超级|极其|十分|很|好[开高快]|太.*了", 1),
]
BASE_FILLERS = {
"""场景配置注册表 - 所有场景共用统一填充词库"""
BASE_FILLERS: Set[str] = {
# 基础寒暄
"你好", "您好", "在吗", "在的", "在呢", "", "嗯嗯", "", "哦哦",
"好的", "", "", "可以", "不可以", "谢谢", "多谢", "感谢",
@@ -87,7 +28,26 @@ class SceneConfigRegistry:
"哈哈", "呵呵", "哈哈哈", "嘿嘿", "嘻嘻", "hiahia",
"", "", "", "", "", "", "嗯哼",
# 确认词
"是的", "", "对的", "没错", "嗯嗯", "好嘞", "收到", "明白", "了解", "知道了",
"是的", "", "对的", "没错", "好嘞", "收到", "明白", "了解", "知道了",
# 服务类套话
"请问", "请稍等", "稍等", "马上", "立即",
"正在查询", "正在处理", "正在为您", "帮您查一下",
"还有其他问题吗", "还需要什么帮助", "很高兴为您服务",
"感谢您的耐心等待", "抱歉让您久等了",
"已记录", "已反馈", "已转接", "已升级",
"祝您生活愉快", "欢迎下次咨询",
# 外呼套话
"", "hello", "打扰了", "不好意思",
"方便接电话吗", "现在方便吗", "占用您一点时间",
"我是", "我们是", "我们公司", "我们这边",
"了解一下", "介绍一下", "简单说一下",
"考虑考虑", "想一想", "再说", "再看看",
"不需要", "不感兴趣", "没兴趣", "不用了",
"没问题", "那就这样", "再联系", "回头聊", "有需要再说",
# 教育场景套话
"老师好", "同学们好", "上课", "下课", "起立", "坐下",
"举手", "请坐", "很好", "不错", "继续",
"下一个", "下一题", "下一位", "还有吗", "还有问题吗",
# 标点和符号
"。。。", "...", "???", "", "!!!", "",
# 表情符号
@@ -99,246 +59,8 @@ class SceneConfigRegistry:
"hhh", "hhhh", "2333", "666", "gg", "ok", "OK", "okok",
"emmm", "emm", "em", "mmp", "wtf", "omg",
}
# Question keywords shared by every scene.
# NOTE(review): the "" entries look like characters lost in transit. An empty
# keyword is a substring of every message, so any `keyword in text` check over
# this set succeeds for all text — confirm the intended values.
BASE_QUESTION_KEYWORDS = {
"什么", "为什么", "怎么", "如何", "哪里", "哪个", "", "多少", "几点", "何时", ""
}
# Decision / commitment keywords shared by every scene.
BASE_DECISION_KEYWORDS = {
"必须", "一定", "务必", "需要", "要求", "规定", "应该",
"承诺", "保证", "确保", "负责", "同意", "答应"
}
@classmethod
def get_education_config(cls) -> ScenePatterns:
"""Build the pattern set for the education scene.

Every group extends the matching ``cls.BASE_*`` collection with the
education-specific entries listed below.
"""
return ScenePatterns(
high_priority_patterns=cls.BASE_HIGH_PRIORITY + [
# Grades and scores (highest weights)
(r"成绩|分数|得分|满分|及格|不及格", 6),
(r"GPA|绩点|学分|平均分", 6),
(r"\d+分|\d+\.?\d*分", 5), # a concrete score
(r"排名|名次|第.{1,3}名", 5), # also covers forms such as "第三名" and "第1名"
# Enrolment / identity information
(r"学号|学生证|教师工号|工号", 5),
(r"班级|年级|专业|院系", 4),
# Courses
(r"课程|科目|学科|必修|选修", 4),
(r"教材|课本|教科书|参考书", 4),
(r"章节|第.{1,3}章|第.{1,3}节", 3), # also covers forms such as "第三章" and "第1章"
# Subject matter (added later)
(r"微积分|导数|积分|函数|极限|微分", 4),
(r"代数|几何|三角|概率|统计", 4),
(r"物理|化学|生物|历史|地理", 4),
(r"英语|语文|数学|政治|哲学", 4),
(r"定义|定理|公式|概念|原理|法则", 3),
(r"例题|解题|证明|推导|计算", 3),
],
medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [
# Teaching activities
(r"作业|练习|习题|题目", 3),
(r"考试|测验|测试|考核|期中|期末", 3),
(r"上课|下课|课堂|讲课", 2),
(r"提问|回答|发言|讨论", 2),
(r"问一下|请教|咨询|询问", 2), # added: asking / enquiring
(r"理解|明白|懂|掌握|学会", 2), # added: learning state
# Scheduling
(r"课表|课程表|时间表", 3),
(r"第.{1,3}节课|第.{1,3}周", 2), # also covers forms such as "第三节课" and "第1周"
],
low_priority_patterns=cls.BASE_LOW_PRIORITY + [
(r"老师|教师|同学|学生", 1),
(r"教室|实验室|图书馆", 1),
],
filler_phrases=cls.BASE_FILLERS | {
# Education-specific fillers ("明白了", "懂了", "不懂" and similar were left out because they carry meaning in this scene)
"老师好", "同学们好", "上课", "下课", "起立", "坐下",
"举手", "请坐", "很好", "不错", "继续",
"下一个", "下一题", "下一位", "还有吗", "还有问题吗",
},
question_keywords=cls.BASE_QUESTION_KEYWORDS | {
# NOTE(review): the "" entry looks like a character lost in transit; an empty
# keyword is a substring of every message — confirm the intended value.
"为啥", "", "咋办", "怎样", "如何做",
"能不能", "可不可以", "行不行", "对不对", "是不是",
},
decision_keywords=cls.BASE_DECISION_KEYWORDS | {
"必考", "重点", "考点", "难点", "关键",
"记住", "背诵", "掌握", "理解", "复习",
}
)
@classmethod
def get_online_service_config(cls) -> ScenePatterns:
"""Build the pattern set for the online customer-service scene.

Every group extends the matching ``cls.BASE_*`` collection with the
service-specific entries listed below.
"""
return ScenePatterns(
high_priority_patterns=cls.BASE_HIGH_PRIORITY + [
# Tickets (highest weights)
(r"工单号|工单编号|ticket|TK\d+", 6),
(r"工单状态|处理中|已解决|已关闭|待处理", 5),
(r"优先级|紧急|高优先级|P0|P1|P2", 5),
# Product information
(r"产品型号|型号|SKU|产品编号", 5),
(r"序列号|SN|设备号", 5),
(r"版本号|软件版本|固件版本", 4),
# Problem description
(r"故障|错误|异常|bug|问题", 4),
(r"错误代码|故障代码|error code", 5),
(r"无法|不能|失败|报错", 3),
],
medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [
# Service operations
(r"退款|退货|换货|补发", 4),
(r"发票|收据|凭证", 3),
(r"物流|快递|运单号", 3),
(r"保修|质保|售后", 3),
# Timeliness
(r"SLA|响应时间|处理时长", 4),
(r"超时|延迟|等待", 2),
],
low_priority_patterns=cls.BASE_LOW_PRIORITY + [
(r"客服|工程师|技术支持", 1),
(r"用户|客户|会员", 1),
],
filler_phrases=cls.BASE_FILLERS | {
# Fillers specific to online service
"您好", "请问", "请稍等", "稍等", "马上", "立即",
"正在查询", "正在处理", "正在为您", "帮您查一下",
"还有其他问题吗", "还需要什么帮助", "很高兴为您服务",
"感谢您的耐心等待", "抱歉让您久等了",
"已记录", "已反馈", "已转接", "已升级",
"祝您生活愉快", "再见", "欢迎下次咨询",
},
question_keywords=cls.BASE_QUESTION_KEYWORDS | {
"能否", "可否", "是否", "有没有", "能不能",
"怎么办", "如何处理", "怎么解决",
},
decision_keywords=cls.BASE_DECISION_KEYWORDS | {
"立即处理", "马上解决", "尽快", "优先",
"升级", "转接", "派单", "跟进",
"补偿", "赔偿", "退款", "换货",
}
)
@classmethod
def get_outbound_config(cls) -> ScenePatterns:
"""Build the pattern set for the outbound-call scene.

Every group extends the matching ``cls.BASE_*`` collection with the
outbound-specific entries listed below.
"""
return ScenePatterns(
high_priority_patterns=cls.BASE_HIGH_PRIORITY + [
# Purchase intent (highest weights)
(r"意向|意愿|兴趣|感兴趣", 6),
(r"A类|B类|C类|D类|高意向|低意向", 6),
(r"成交|签约|下单|购买|确认", 6),
# Contact arrangements (weighted higher in the outbound scene)
(r"预约|约定|安排|确定时间", 5),
(r"下次联系|回访|跟进", 5),
(r"方便|有空|可以|时间", 4),
# Call status
(r"接通|未接通|占线|关机|停机", 4),
(r"通话时长|通话时间", 3),
],
medium_priority_patterns=cls.BASE_MEDIUM_PRIORITY + [
# Customer information
(r"姓名|称呼|先生|女士", 3),
(r"公司|单位|职位|职务", 3),
(r"需求|要求|期望", 3),
# Follow-up status
(r"跟进状态|进展|进度", 3),
(r"已联系|待联系|联系中", 2),
(r"拒绝|不感兴趣|考虑|再说", 3),
],
low_priority_patterns=cls.BASE_LOW_PRIORITY + [
(r"销售|客户经理|业务员", 1),
(r"产品|服务|方案", 1),
],
filler_phrases=cls.BASE_FILLERS | {
# Fillers specific to outbound calls
# NOTE(review): the "" entries below look like characters lost in transit —
# confirm the intended values.
"您好", "", "hello", "打扰了", "不好意思",
"方便接电话吗", "现在方便吗", "占用您一点时间",
"我是", "我们是", "我们公司", "我们这边",
"了解一下", "介绍一下", "简单说一下",
"考虑考虑", "想一想", "再说", "再看看",
"不需要", "不感兴趣", "没兴趣", "不用了",
"好的", "", "可以", "没问题", "那就这样",
"再联系", "回头聊", "有需要再说",
},
question_keywords=cls.BASE_QUESTION_KEYWORDS | {
"有没有", "需不需要", "要不要", "考虑不考虑",
"了解吗", "知道吗", "听说过吗",
"方便吗", "有空吗", "在吗",
},
decision_keywords=cls.BASE_DECISION_KEYWORDS | {
"确定", "决定", "选择", "购买", "下单",
"预约", "安排", "约定", "确认",
"跟进", "回访", "联系", "沟通",
}
)
@classmethod
def get_config(cls, scene: str, fallback_to_generic: bool = True) -> ScenePatterns:
    """Look up the configuration for a scene name.

    Args:
        scene: Scene name; 'education', 'online_service' and 'outbound'
            have dedicated builders, anything else is an unknown scene.
        fallback_to_generic: What to do for an unknown scene. True returns
            the generic configuration, False raises.

    Returns:
        The configuration produced by the scene's builder, or the generic
        configuration when the scene is unknown and fallback is enabled.

    Raises:
        ValueError: The scene is unknown and ``fallback_to_generic`` is False.
    """
    builders = {
        'education': cls.get_education_config,
        'online_service': cls.get_online_service_config,
        'outbound': cls.get_outbound_config,
    }

    builder = builders.get(scene)
    if builder is not None:
        return builder()

    if not fallback_to_generic:
        raise ValueError(f"不支持的场景: {scene},支持的场景: {list(builders.keys())}")

    # Unknown scene: only the base rules apply, with no scene-specific additions.
    return cls.get_generic_config()
@classmethod
def get_generic_config(cls) -> ScenePatterns:
    """Build the generic configuration used for scenes without a dedicated one.

    Only the ``cls.BASE_*`` collections are used, with no scene-specific
    additions.
    """
    base_rules = {
        "high_priority_patterns": cls.BASE_HIGH_PRIORITY,
        "medium_priority_patterns": cls.BASE_MEDIUM_PRIORITY,
        "low_priority_patterns": cls.BASE_LOW_PRIORITY,
        "filler_phrases": cls.BASE_FILLERS,
        "question_keywords": cls.BASE_QUESTION_KEYWORDS,
        "decision_keywords": cls.BASE_DECISION_KEYWORDS,
    }
    return ScenePatterns(**base_rules)
@classmethod
def get_all_scenes(cls) -> List[str]:
    """Return the names of the predefined scenes as a new list."""
    predefined = ['education', 'online_service', 'outbound']
    return predefined
@classmethod
def is_scene_supported(cls, scene: str) -> bool:
    """Tell whether a scene is one of the predefined scenes.

    Args:
        scene: Scene name to look up.

    Returns:
        True when the name appears in ``cls.get_all_scenes()``, False otherwise.
    """
    predefined = cls.get_all_scenes()
    return scene in predefined
@classmethod
def get_config(cls, scene: str = "") -> ScenePatterns:
    """Return the unified filler-phrase configuration.

    Declared as a classmethod because it is called on the class itself;
    without the decorator the scene argument would be bound to ``cls``.

    Args:
        scene: Scene name. It is accepted so callers can pass it, but it is
            not read: every scene receives the same configuration.

    Returns:
        A ``ScenePatterns`` whose ``filler_phrases`` is ``cls.BASE_FILLERS``.
    """
    return ScenePatterns(filler_phrases=cls.BASE_FILLERS)

View File

@@ -1,6 +1,6 @@
{#
对话级抽取与相关性判定模板(用于剪枝加速)
输入:pruning_scene, is_builtin_scene, ontology_classes, dialog_text, language
输入:pruning_scene, ontology_classes, dialog_text, language
输出:严格 JSON(不要包含任何多余文本),字段:
- is_related: bool,是否与所选场景相关
- times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等)
@@ -17,44 +17,22 @@
- 仅输出上述键;避免多余解释或字段。
#}
{# ── 内置场景的固定说明 ── #}
{% set builtin_scene_instructions = {
'education': {
'zh': '教育场景:教学、课程、考试、作业、老师/学生互动、学习资源、学校管理等。',
'en': 'Education Scenario: Teaching, courses, exams, homework, teacher/student interaction, learning resources, school management, etc.'
},
'online_service': {
'zh': '在线客服场景:客户咨询、问题排查、服务工单、售后支持、订单/退款、工单升级等。',
'en': 'Online Service Scenario: Customer inquiries, troubleshooting, service tickets, after-sales support, orders/refunds, ticket escalation, etc.'
},
'outbound': {
'zh': '外呼场景:电话外呼、邀约、调研问卷、线索跟进、对话脚本、回访记录等。',
'en': 'Outbound Scenario: Outbound calls, invitations, survey questionnaires, lead follow-up, call scripts, follow-up records, etc.'
}
} %}
{# ── 确定最终使用的场景说明 ── #}
{% if is_builtin_scene %}
{% set scene_key = pruning_scene %}
{% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %}
{% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %}
{% set custom_types_str = '' %}
{% else %}
{% if ontology_classes and ontology_classes | length > 0 %}
{% if language == 'en' %}
{% set custom_types_str = ontology_classes | join(', ') %}
{% set instruction = 'Custom scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %}
{% else %}
{% set custom_types_str = ontology_classes | join('、') %}
{% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %}
{% endif %}
{# ── 确定场景说明 ── #}
{% if ontology_classes and ontology_classes | length > 0 %}
{% if language == 'en' %}
{% set custom_types_str = ontology_classes | join(', ') %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %}
{% else %}
{% if language == 'en' %}
{% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %}
{% set instruction = '自定义场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
{% endif %}
{% set custom_types_str = ontology_classes | join('、') %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %}
{% endif %}
{% else %}
{% if language == 'en' %}
{% set custom_types_str = '' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %}
{% set custom_types_str = '' %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
{% endif %}
{% endif %}
@@ -64,7 +42,7 @@
2. 从对话中抽取所有需要保留的重要信息片段。
场景说明:{{ instruction }}
{% if not is_builtin_scene and custom_types_str %}
{% if custom_types_str %}
重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。
{% endif %}
@@ -118,7 +96,7 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue
2. Extract all important information fragments that must be preserved.
Scenario Description: {{ instruction }}
{% if not is_builtin_scene and custom_types_str %}
{% if custom_types_str %}
Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true).
{% endif %}

View File

@@ -107,29 +107,19 @@ def _validate_config_id(config_id, db: Session = None):
)
# The built-in scene keys are derived from SceneConfigRegistry so the list is
# maintained in one place only.
# The import is done inside the function to avoid a module-level circular import.
def _get_builtin_pruning_scenes() -> set:
    """Return the predefined pruning scene names as a set."""
    from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import SceneConfigRegistry

    scene_names = SceneConfigRegistry.get_all_scenes()
    return set(scene_names)
def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]:
"""当 pruning_scene 不是内置场景时,从 ontology_class 表加载类型名称列表。
"""从 ontology_class 表加载场景类型名称列表,用于注入提示词
Args:
db: 数据库会话
scene_id: 本体场景 UUID
pruning_scene: 语义剪枝场景名称
pruning_scene: 语义剪枝场景名称(保留参数,暂未使用)
Returns:
class_name 字符串列表,或 None内置场景 / 无数据时)
class_name 字符串列表,或 None无数据时
"""
if not scene_id:
return None
# 内置场景走 SceneConfigRegistry不需要注入类型列表
if pruning_scene in _get_builtin_pruning_scenes():
return None
try:
from app.repositories.ontology_class_repository import OntologyClassRepository
repo = OntologyClassRepository(db)

View File

@@ -120,7 +120,8 @@ async def run_pilot_extraction(
"pruning_switch": memory_config.pruning_enabled,
"pruning_scene": memory_config.pruning_scene,
"pruning_threshold": memory_config.pruning_threshold,
"llm_model_id": str(memory_config.llm_model_id),
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
"ontology_classes": memory_config.ontology_classes,
}
config = PruningConfig(**pruning_config_dict)