Merge pull request #528 from SuanmoSuanyangTechnology/feature/pruning-optimize

Feature/pruning optimize
This commit is contained in:
Ke Sun
2026-03-10 17:37:43 +08:00
committed by GitHub
3 changed files with 150 additions and 62 deletions

View File

@@ -33,6 +33,7 @@ class DialogExtractionResponse(BaseModel):
- is_related:对话与场景的相关性判定。 - is_related:对话与场景的相关性判定。
- times / ids / amounts / contacts / addresses / keywords:重要信息片段,用来在不相关对话中保留关键消息。 - times / ids / amounts / contacts / addresses / keywords:重要信息片段,用来在不相关对话中保留关键消息。
- preserve_keywords:情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。
""" """
is_related: bool = Field(...) is_related: bool = Field(...)
times: List[str] = Field(default_factory=list) times: List[str] = Field(default_factory=list)
@@ -41,6 +42,7 @@ class DialogExtractionResponse(BaseModel):
contacts: List[str] = Field(default_factory=list) contacts: List[str] = Field(default_factory=list)
addresses: List[str] = Field(default_factory=list) addresses: List[str] = Field(default_factory=list)
keywords: List[str] = Field(default_factory=list) keywords: List[str] = Field(default_factory=list)
preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留")
class MessageImportanceResponse(BaseModel): class MessageImportanceResponse(BaseModel):
@@ -198,17 +200,16 @@ class SemanticPruner:
return min(score, 10) # 最高10分 return min(score, 10) # 最高10分
# 情绪/兴趣/爱好安全防线正则已移除,改由 extracat_Pruning.jinja2 提示词中的 preserve_keywords 机制处理
def _is_filler_message(self, message: ConversationMessage) -> bool: def _is_filler_message(self, message: ConversationMessage) -> bool:
"""检测典型寒暄/口头禅/确认类短消息。 """检测典型寒暄/口头禅/确认类短消息。
改进版:更严格的填充消息判断,避免误删场景相关内容 判断顺序:
满足以下之一视为填充消息 1. 空消息
- 纯标点或空白 2. 场景特定填充词库精确匹配
- 在场景特定填充词库中(精确匹配 3. 常见寒暄精确匹配
- 纯表情符号 4. 纯表情/标点
- 常见寒暄(精确匹配短语)
注意:不再使用长度判断,避免误删短但重要的消息
""" """
t = message.msg.strip() t = message.msg.strip()
if not t: if not t:
@@ -234,20 +235,6 @@ class SemanticPruner:
if re.fullmatch(r"(\[[^\]]+\])+", t): if re.fullmatch(r"(\[[^\]]+\])+", t):
return True return True
# 检查是否为纯emojiUnicode表情
emoji_pattern = re.compile(
"["
"\U0001F600-\U0001F64F" # 表情符号
"\U0001F300-\U0001F5FF" # 符号和象形文字
"\U0001F680-\U0001F6FF" # 交通和地图符号
"\U0001F1E0-\U0001F1FF" # 旗帜
"\U00002702-\U000027B0"
"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE
)
if emoji_pattern.fullmatch(t):
return True
# 纯标点符号 # 纯标点符号
if re.fullmatch(r"[。!?,.!?…·\s]+", t): if re.fullmatch(r"[。!?,.!?…·\s]+", t):
return True return True
@@ -595,43 +582,62 @@ class SemanticPruner:
total_original_msgs = 0 total_original_msgs = 0
total_deleted_msgs = 0 total_deleted_msgs = 0
for d_idx, dd in enumerate(dialogs): # 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息)
semaphore = asyncio.Semaphore(self.max_concurrent)
async def extract_with_semaphore(dd: DialogData) -> DialogExtractionResponse:
async with semaphore:
try:
return await self._extract_dialog_important(dd.content)
except Exception as e:
self._log(f"[剪枝-LLM] 对话抽取失败,使用降级策略: {str(e)[:100]}")
return DialogExtractionResponse(is_related=True)
extraction_tasks = [extract_with_semaphore(dd) for dd in dialogs]
extraction_results: List[DialogExtractionResponse] = await asyncio.gather(*extraction_tasks)
for d_idx, (dd, extraction) in enumerate(zip(dialogs, extraction_results)):
msgs = dd.context.msgs msgs = dd.context.msgs
original_count = len(msgs) original_count = len(msgs)
total_original_msgs += original_count total_original_msgs += original_count
# ========== 问答对保护(已注释,暂不启用,留作观察) ========== # 从 LLM 抽取结果中获取所有需要保留的 token
# qa_pairs = self._identify_qa_pairs(msgs) preserve_tokens = (
# protected_indices = self._get_protected_indices(msgs, qa_pairs, window_size=0) extraction.times + extraction.ids + extraction.amounts +
# ======================================================== extraction.contacts + extraction.addresses + extraction.keywords +
extraction.preserve_keywords # 情绪/兴趣/爱好关键词
)
# 消息级分类:每条消息独立判断 # 判断是否需要详细日志
important_msgs = [] # 重要消息(保留)
unimportant_msgs = [] # 不重要消息(可删除)
filler_msgs = [] # 填充消息(优先删除)
# 判断是否需要详细日志仅对前N条消息记录
should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog: if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog:
self._log(f" 对话[{d_idx}]消息数={original_count},仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志") self._log(f" 对话[{d_idx}]消息数={original_count},仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志")
if extraction.preserve_keywords:
self._log(f" 对话[{d_idx}] LLM抽取到情绪/兴趣保护词: {extraction.preserve_keywords}")
# 消息级分类:每条消息独立判断
llm_protected_msgs = [] # LLM 保护消息(情绪/兴趣/重要token绝对不可删除
rule_important_msgs = [] # 规则层重要消息(场景规则):配额不足时可少量删除
unimportant_msgs = [] # 不重要消息(可删除)
filler_msgs = [] # 填充消息(优先删除)
for idx, m in enumerate(msgs): for idx, m in enumerate(msgs):
msg_text = m.msg.strip() msg_text = m.msg.strip()
# ========== 问答对保护判断(已注释) ========== # LLM 保护:消息包含 preserve_keywords情绪/兴趣词)或其他重要 token → 绝对不可删除
# if idx in protected_indices: if self._msg_matches_tokens(m, preserve_tokens):
# important_msgs.append((idx, m)) llm_protected_msgs.append((idx, m))
# self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(问答对保护)") if should_log_details or idx < self._max_debug_msgs_per_dialog:
# ========================================== self._log(f" [{idx}] '{msg_text[:30]}...' → 重要LLM保护不可删")
# 填充消息(寒暄、表情等) # 填充消息(寒暄、表情等)
if self._is_filler_message(m): elif self._is_filler_message(m):
filler_msgs.append((idx, m)) filler_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog: if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...' → 填充") self._log(f" [{idx}] '{msg_text[:30]}...' → 填充")
# 重要信息(学号、成绩、时间、金额等) # 规则层重要信息(学号、成绩、时间、金额等)
elif self._is_important_message(m): elif self._is_important_message(m):
important_msgs.append((idx, m)) rule_important_msgs.append((idx, m))
if should_log_details or idx < self._max_debug_msgs_per_dialog: if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)") self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)")
# 其他消息 # 其他消息
@@ -640,6 +646,9 @@ class SemanticPruner:
if should_log_details or idx < self._max_debug_msgs_per_dialog: if should_log_details or idx < self._max_debug_msgs_per_dialog:
self._log(f" [{idx}] '{msg_text[:30]}...' → 不重要") self._log(f" [{idx}] '{msg_text[:30]}...' → 不重要")
# important_msgs 仅用于日志统计(兼容下方日志输出)
important_msgs = llm_protected_msgs + rule_important_msgs
# 计算删除配额 # 计算删除配额
delete_target = int(original_count * proportion) delete_target = int(original_count * proportion)
if proportion > 0 and original_count > 0 and delete_target == 0: if proportion > 0 and original_count > 0 and delete_target == 0:
@@ -669,17 +678,17 @@ class SemanticPruner:
to_delete_indices.add(idx) to_delete_indices.add(idx)
deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'") deleted_details.append(f"[{idx}] 不重要: '{msg.msg[:50]}'")
# 第三步:如果还需要删除,按重要性分数删除重要消息 # 第三步:如果还需要删除,按重要性分数删除规则层重要消息LLM保护消息绝对不删
remaining_quota = delete_target - len(to_delete_indices) remaining_quota = delete_target - len(to_delete_indices)
if remaining_quota > 0 and important_msgs: if remaining_quota > 0 and rule_important_msgs:
# 按重要性分数排序(分数低的优先删除) # 按重要性分数排序(分数低的优先删除)
imp_sorted = sorted(important_msgs, key=lambda x: self._importance_score(x[1])) imp_sorted = sorted(rule_important_msgs, key=lambda x: self._importance_score(x[1]))
imp_to_delete = min(len(imp_sorted), remaining_quota) imp_to_delete = min(len(imp_sorted), remaining_quota)
for i in range(imp_to_delete): for i in range(imp_to_delete):
idx, msg = imp_sorted[i] idx, msg = imp_sorted[i]
to_delete_indices.add(idx) to_delete_indices.add(idx)
score = self._importance_score(msg) score = self._importance_score(msg)
deleted_details.append(f"[{idx}] 重要(分数{score}): '{msg.msg[:50]}'") deleted_details.append(f"[{idx}] 规则重要(分数{score}): '{msg.msg[:50]}'")
# 执行删除 # 执行删除
kept_msgs = [] kept_msgs = []

View File

@@ -51,6 +51,22 @@ class SceneConfigRegistry:
(r"今天|明天|后天|昨天|前天", 3), # 相对时间(提高权重) (r"今天|明天|后天|昨天|前天", 3), # 相对时间(提高权重)
(r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3), (r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3),
(r"今年|去年|明年", 3), (r"今年|去年|明年", 3),
# ---- 情绪内容(所有场景通用,用于情绪提取) ----
(r"开心|高兴|快乐|兴奋|愉快|幸福|满足|喜悦|欣喜", 4),
(r"难过|悲伤|伤心|痛苦|委屈|失落|沮丧|郁闷|忧郁|绝望", 4),
(r"生气|愤怒|烦躁|焦虑|紧张|害怕|恐惧|担心|担忧|压力", 4),
(r"感动|温暖|感激|感谢|惊喜|期待|憧憬|向往", 3),
(r"无聊|无奈|尴尬|后悔|遗憾|羞愧|惭愧", 3),
(r"好[开高快]心|很[开高快]心|超[开高快]心|非常[开高快]心", 4),
(r"好难过|好伤心|好悲伤|好委屈|好痛苦", 4),
(r"好开心|好高兴|好快乐|好幸福|好感动", 4),
# ---- 兴趣/爱好内容(所有场景通用,用于兴趣提取) ----
(r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|痴迷", 4),
(r"不喜欢|讨厌|厌恶|反感|排斥", 3),
(r"羽毛球|篮球|足球|排球|乒乓球|网球|棒球|高尔夫", 4),
(r"游泳|跑步|健身|瑜伽|舞蹈|武术|骑行|登山|徒步", 4),
(r"音乐|唱歌|吉他|钢琴|绘画|摄影|书法|手工|烹饪", 4),
(r"游戏|电影|动漫|小说|阅读|旅游|美食|宠物", 3),
] ]
BASE_LOW_PRIORITY = [ BASE_LOW_PRIORITY = [
@@ -58,6 +74,8 @@ class SceneConfigRegistry:
(r"\d{1,2}点\d{0,2}分?", 2), # 时间点 X点Y分 或 X点 (r"\d{1,2}点\d{0,2}分?", 2), # 时间点 X点Y分 或 X点
(r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2), # 时段(提高权重并扩充) (r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2), # 时段(提高权重并扩充)
(r"AM|PM|am|pm", 1), (r"AM|PM|am|pm", 1),
# ---- 情绪程度副词(辅助情绪识别) ----
(r"特别|非常|超级|极其|十分|很|好[开高快]|太.*了", 1),
] ]
BASE_FILLERS = { BASE_FILLERS = {

View File

@@ -9,10 +9,11 @@
- contacts: [string],联系方式(电话/手机号/邮箱/微信/QQ等) - contacts: [string],联系方式(电话/手机号/邮箱/微信/QQ等)
- addresses: [string],地址/地点相关文本 - addresses: [string],地址/地点相关文本
- keywords: [string],其它有助于保留的重要关键词(与场景强相关的术语) - keywords: [string],其它有助于保留的重要关键词(与场景强相关的术语)
- preserve_keywords: [string],必须保留的情绪/兴趣/爱好/个人偏好相关词或短语片段
要求: 要求:
- 必须只输出上述 JSON且键名一致不得输出解释、前后缀不得包含注释。 - 必须只输出上述 JSON且键名一致不得输出解释、前后缀不得包含注释。
- times/ids/amounts/contacts/addresses/keywords 仅抽取原文片段或规范化后的简单字符串。 - times/ids/amounts/contacts/addresses/keywords/preserve_keywords 仅抽取原文片段或规范化后的简单字符串。
- 仅输出上述键;避免多余解释或字段。 - 仅输出上述键;避免多余解释或字段。
#} #}
@@ -34,13 +35,11 @@
{# ── 确定最终使用的场景说明 ── #} {# ── 确定最终使用的场景说明 ── #}
{% if is_builtin_scene %} {% if is_builtin_scene %}
{# 内置专门场景:使用固定说明 #}
{% set scene_key = pruning_scene %} {% set scene_key = pruning_scene %}
{% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %} {% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %}
{% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %} {% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %}
{% set custom_types_str = '' %} {% set custom_types_str = '' %}
{% else %} {% else %}
{# 自定义场景:使用场景名称 + 本体类型列表构建说明 #}
{% if ontology_classes and ontology_classes | length > 0 %} {% if ontology_classes and ontology_classes | length > 0 %}
{% if language == 'en' %} {% if language == 'en' %}
{% set custom_types_str = ontology_classes | join(', ') %} {% set custom_types_str = ontology_classes | join(', ') %}
@@ -50,7 +49,6 @@
{% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %}
{% endif %} {% endif %}
{% else %} {% else %}
{# 无本体类型时退化为通用说明 #}
{% if language == 'en' %} {% if language == 'en' %}
{% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %} {% else %}
@@ -61,12 +59,43 @@
{% endif %} {% endif %}
{% if language == "zh" %} {% if language == "zh" %}
请在下方对话全文基础上,按该场景进行一次性抽取并判定相关性 你是一个对话内容分析助手。请对下方对话全文进行一次性分析,完成两项任务
1. 判断对话是否与指定场景相关;
2. 从对话中抽取所有需要保留的重要信息片段。
场景说明:{{ instruction }} 场景说明:{{ instruction }}
{% if not is_builtin_scene and custom_types_str %} {% if not is_builtin_scene and custom_types_str %}
重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。 重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。
{% endif %} {% endif %}
---
【必须保留的内容(不可删除)】
以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段:
- 时间信息:日期、时间点、时间段、有效期 → times 字段
- 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段
- 金额信息:价格、费用、金额(含货币符号或单位) → amounts 字段
- 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段
- 地址信息:地点、地址、位置 → addresses 字段
- 场景关键词:与场景强相关的专业术语、事件名称 → keywords 字段
- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
- **个人观点与态度**:对某事物的明确看法、评价、立场 → preserve_keywords 字段
【可以删除的内容】
以下类型的内容属于低价值信息,可以在剪枝时删除:
- 纯寒暄问候:如"你好"、"在吗"、"拜拜"、"嗯"、"好的"、"哦"等无实质内容的短语
- 纯表情/符号:如"[微笑]"、"😊"、"哈哈"等
- 重复确认:如"对对对"、"是的是的"、"嗯嗯嗯"等无新增信息的重复
- 无意义填充:如"啊"、"呢"、"嘛"等语气词单独成句
**注意:即使消息很短,只要包含情绪、兴趣、爱好、个人观点等有价值信息,就必须保留,不得删除。**
例如:
- "我好开心呀" → 包含情绪(开心),必须保留,preserve_keywords 中加入"开心"
- "好喜欢打羽毛球呀" → 包含兴趣爱好(喜欢打羽毛球),必须保留,preserve_keywords 中加入"喜欢打羽毛球"
- "我好难过" → 包含情绪(难过),必须保留,preserve_keywords 中加入"难过"
- "太好啦!看到你开心,我也跟着心情亮起来" → 包含情绪,必须保留,preserve_keywords 中加入"开心"
---
对话全文: 对话全文:
""" """
{{ dialog_text }} {{ dialog_text }}
@@ -80,15 +109,46 @@
"amounts": [<string>...], "amounts": [<string>...],
"contacts": [<string>...], "contacts": [<string>...],
"addresses": [<string>...], "addresses": [<string>...],
"keywords": [<string>...] "keywords": [<string>...],
"preserve_keywords": [<string>...]
} }
{% else %} {% else %}
Based on the full dialogue below, perform one-time extraction and relevance determination according to this scenario: You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
1. Determine whether the dialogue is relevant to the specified scene;
2. Extract all important information fragments that must be preserved.
Scenario Description: {{ instruction }} Scenario Description: {{ instruction }}
{% if not is_builtin_scene and custom_types_str %} {% if not is_builtin_scene and custom_types_str %}
Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true).
{% endif %} {% endif %}
---
[MUST PRESERVE (cannot be deleted)]
The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
- Time information: dates, time points, durations, expiry dates → times field
- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field
- Contact information: phone numbers, emails, WeChat, QQ → contacts field
- Address information: locations, addresses, places → addresses field
- Scene keywords: professional terms and event names strongly related to the scene → keywords field
- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, feeling upset or wronged, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field
[CAN BE DELETED]
The following types of content are low-value and can be removed during pruning:
- Pure greetings: e.g., "hello", "are you there", "bye", "ok", "yeah" — short phrases with no substantive content
- Pure emojis/symbols: e.g., "[smile]", "😊", "haha"
- Repetitive confirmations: e.g., "yes yes yes", "right right", "uh huh" — repetitions with no new information
- Meaningless fillers: standalone interjections like "ah", "well", "hmm"
**Note: Even if a message is short, if it contains emotions, interests, hobbies, or personal opinions, it MUST be preserved.**
Examples:
- "I'm so happy!" → contains emotion (happy), must preserve; add "happy" to preserve_keywords
- "I love playing badminton!" → contains interest (love playing badminton), must preserve; add "love playing badminton" to preserve_keywords
- "I feel so sad" → contains emotion (sad), must preserve; add "sad" to preserve_keywords
---
Full Dialogue: Full Dialogue:
""" """
{{ dialog_text }} {{ dialog_text }}
@@ -102,6 +162,7 @@ Output strict JSON only (fixed keys, order doesn't matter):
"amounts": [<string>...], "amounts": [<string>...],
"contacts": [<string>...], "contacts": [<string>...],
"addresses": [<string>...], "addresses": [<string>...],
"keywords": [<string>...] "keywords": [<string>...],
"preserve_keywords": [<string>...]
} }
{% endif %} {% endif %}