[changes] Optimize the semantic pruning judgment rules
This commit is contained in:
@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
|
||||
pruning_scene=memory_config.pruning_scene or "education",
|
||||
pruning_threshold=memory_config.pruning_threshold,
|
||||
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
|
||||
ontology_classes=memory_config.ontology_classes,
|
||||
# MemoryConfig now carries the loaded ontology classes in `ontology_class_infos`
# (the former `ontology_classes` attribute no longer exists on the dataclass).
ontology_class_infos=memory_config.ontology_class_infos,
|
||||
)
|
||||
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ of the memory system including LLM, chunking, pruning, and search.
|
||||
Classes:
|
||||
LLMConfig: Configuration for LLM client
|
||||
ChunkerConfig: Configuration for dialogue chunking
|
||||
OntologyClassInfo: Single ontology class with name and description
|
||||
PruningConfig: Configuration for semantic pruning
|
||||
TemporalSearchParams: Parameters for temporal search queries
|
||||
"""
|
||||
@@ -50,30 +51,41 @@ class ChunkerConfig(BaseModel):
|
||||
min_characters_per_chunk: Optional[int] = Field(24, ge=0, description="The minimum number of characters in each chunk.")
|
||||
|
||||
|
||||
class OntologyClassInfo(BaseModel):
    """Name and semantic description of one ontology class, injected into the pruning prompt.

    Attributes:
        class_name: Ontology class name (for example "患者" or "课程").
        class_description: Semantic description that tells the LLM what this
            class means in the current scene. Defaults to an empty string.
    """
    class_name: str = Field(..., description="本体类型名称")
    class_description: str = Field(default="", description="本体类型语义描述")
|
||||
|
||||
|
||||
class PruningConfig(BaseModel):
|
||||
"""Configuration for semantic pruning of dialogue content.
|
||||
|
||||
Attributes:
|
||||
pruning_switch: Enable or disable semantic pruning
|
||||
pruning_scene: Scene name for pruning, either a built-in key
|
||||
('education', 'online_service', 'outbound') or a custom scene_name
|
||||
from ontology_scene table
|
||||
pruning_scene: Scene name for pruning from ontology_scene table
|
||||
pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal)
|
||||
scene_id: Optional ontology scene UUID, used to load custom ontology classes
|
||||
ontology_classes: List of class_name strings from ontology_class table,
|
||||
injected into the prompt when pruning_scene is not a built-in scene
|
||||
scene_id: Optional ontology scene UUID
|
||||
ontology_class_infos: Full ontology class info (name + description) from
|
||||
ontology_class table, injected into the pruning prompt to drive
|
||||
scene-aware preservation decisions
|
||||
"""
|
||||
pruning_switch: bool = Field(False, description="Enable semantic pruning when True.")
|
||||
pruning_scene: str = Field(
|
||||
"education",
|
||||
description="Scene for pruning: built-in key or custom scene_name from ontology_scene.",
|
||||
description="Scene name from ontology_scene table.",
|
||||
)
|
||||
pruning_threshold: float = Field(
|
||||
0.5, ge=0.0, le=0.9,
|
||||
description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).")
|
||||
scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).")
|
||||
ontology_classes: Optional[List[str]] = Field(
|
||||
None, description="Class names from ontology_class table for custom scenes."
|
||||
ontology_class_infos: List[OntologyClassInfo] = Field(
|
||||
default_factory=list,
|
||||
description="Full ontology class info (name + description) injected into pruning prompt."
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from app.core.memory.models.message_models import DialogData, ConversationMessage, ConversationContext
|
||||
from app.core.memory.models.config_models import PruningConfig
|
||||
from app.core.memory.utils.config.config_utils import get_pruning_config
|
||||
from app.core.memory.utils.prompt.prompt_utils import prompt_env, log_prompt_rendering, log_template_rendering
|
||||
from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import (
|
||||
SceneConfigRegistry,
|
||||
@@ -34,6 +33,8 @@ class DialogExtractionResponse(BaseModel):
|
||||
- is_related:对话与场景的相关性判定。
|
||||
- times / ids / amounts / contacts / addresses / keywords:重要信息片段,用来在不相关对话中保留关键消息。
|
||||
- preserve_keywords:情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。
|
||||
- scene_unrelated_snippets:与当前场景无关且无语义关联的消息片段(原文截取),
|
||||
用于高阈值阶段精准删除跨场景内容。
|
||||
"""
|
||||
is_related: bool = Field(...)
|
||||
times: List[str] = Field(default_factory=list)
|
||||
@@ -43,6 +44,7 @@ class DialogExtractionResponse(BaseModel):
|
||||
addresses: List[str] = Field(default_factory=list)
|
||||
keywords: List[str] = Field(default_factory=list)
|
||||
preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留")
|
||||
scene_unrelated_snippets: List[str] = Field(default_factory=list,description="与当前场景无关且无语义关联的消息原文片段,高阈值阶段用于精准删除跨场景内容")
|
||||
|
||||
|
||||
class MessageImportanceResponse(BaseModel):
|
||||
@@ -91,12 +93,14 @@ class SemanticPruner:
|
||||
# 加载统一填充词库
|
||||
self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene)
|
||||
|
||||
# 本体类型列表(用于注入提示词,所有场景均支持)
|
||||
self._ontology_classes = getattr(self.config, "ontology_classes", None) or []
|
||||
# 本体类型列表:直接使用 ontology_class_infos(name + description)
|
||||
self._ontology_class_infos = getattr(self.config, "ontology_class_infos", None) or []
|
||||
# _ontology_classes 仅用于日志统计
|
||||
self._ontology_classes = [info.class_name for info in self._ontology_class_infos]
|
||||
|
||||
self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}")
|
||||
if self._ontology_classes:
|
||||
self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}")
|
||||
if self._ontology_class_infos:
|
||||
self._log(f"[剪枝-初始化] 注入本体类型({len(self._ontology_class_infos)}个): {self._ontology_classes}")
|
||||
else:
|
||||
self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词")
|
||||
|
||||
@@ -121,7 +125,8 @@ class SemanticPruner:
|
||||
1. 空消息
|
||||
2. 场景特定填充词库精确匹配
|
||||
3. 常见寒暄精确匹配
|
||||
4. 纯表情/标点
|
||||
4. 组合寒暄模式(前缀+后缀组合,如"好的谢谢"、"同学你好"、"明白了")
|
||||
5. 纯表情/标点
|
||||
"""
|
||||
t = message.msg.strip()
|
||||
if not t:
|
||||
@@ -143,6 +148,55 @@ class SemanticPruner:
|
||||
if t in common_greetings:
|
||||
return True
|
||||
|
||||
# 组合寒暄模式:短消息(≤15字)且完全由寒暄成分构成
|
||||
# 策略:将消息拆分后,每个片段都能在填充词库或常见寒暄中找到,则整体为填充
|
||||
if len(t) <= 15:
|
||||
# 确认+称呼/感谢组合,如"好的谢谢"、"明白了"、"知道了谢谢"
|
||||
_confirm_prefixes = {"好的", "好", "嗯", "嗯嗯", "哦", "明白", "明白了", "知道了", "了解", "收到", "没问题"}
|
||||
_thanks_suffixes = {"谢谢", "谢谢你", "谢谢您", "多谢", "感谢", "谢了"}
|
||||
_greeting_suffixes = {"你好", "您好", "老师好", "同学好", "大家好"}
|
||||
_greeting_prefixes = {"同学", "老师", "您好", "你好"}
|
||||
_close_patterns = {
|
||||
"没有了", "没事了", "没问题了", "好了", "行了", "可以了",
|
||||
"不用了", "不需要了", "就这样", "就这样吧", "那就这样",
|
||||
}
|
||||
_polite_responses = {
|
||||
"不客气", "不用谢", "没关系", "没事", "应该的", "这是我应该做的",
|
||||
}
|
||||
|
||||
# 规则1:确认词 + 感谢词(如"好的谢谢"、"嗯谢谢")
|
||||
for cp in _confirm_prefixes:
|
||||
for ts in _thanks_suffixes:
|
||||
if t == cp + ts or t == cp + "," + ts or t == cp + "," + ts:
|
||||
return True
|
||||
|
||||
# 规则2:称呼前缀 + 问候(如"同学你好"、"老师好")
|
||||
for gp in _greeting_prefixes:
|
||||
for gs in _greeting_suffixes:
|
||||
if t == gp + gs or t.startswith(gp) and t.endswith("好"):
|
||||
return True
|
||||
|
||||
# 规则3:结束语 + 感谢(如"没有了,谢谢老师"、"没有了谢谢")
|
||||
for cp in _close_patterns:
|
||||
if t.startswith(cp):
|
||||
remainder = t[len(cp):].lstrip(",,、 ")
|
||||
if not remainder or any(remainder.startswith(ts) for ts in _thanks_suffixes):
|
||||
return True
|
||||
|
||||
# 规则4:礼貌回应(如"不客气,祝你考试顺利"——前缀是礼貌词,后半是祝福套话)
|
||||
for pr in _polite_responses:
|
||||
if t.startswith(pr):
|
||||
remainder = t[len(pr):].lstrip(",,、 ")
|
||||
# 后半是祝福/套话(不含实质信息)
|
||||
if not remainder or re.match(r"^(祝|希望|期待|加油|顺利|好好|保重)", remainder):
|
||||
return True
|
||||
|
||||
# 规则5:纯确认词加"了"后缀(如"明白了"、"知道了"、"好了")
|
||||
_confirm_base = {"明白", "知道", "了解", "收到", "好", "行", "可以", "没问题"}
|
||||
for cb in _confirm_base:
|
||||
if t == cb + "了" or t == cb + "了。" or t == cb + "了!":
|
||||
return True
|
||||
|
||||
# 检查是否为纯表情符号(方括号包裹)
|
||||
if re.fullmatch(r"(\[[^\]]+\])+", t):
|
||||
return True
|
||||
@@ -331,13 +385,13 @@ class SemanticPruner:
|
||||
|
||||
rendered = self.template.render(
|
||||
pruning_scene=self.config.pruning_scene,
|
||||
ontology_classes=self._ontology_classes,
|
||||
ontology_class_infos=self._ontology_class_infos,
|
||||
dialog_text=dialog_text,
|
||||
language=self.language
|
||||
)
|
||||
log_template_rendering("extracat_Pruning.jinja2", {
|
||||
"pruning_scene": self.config.pruning_scene,
|
||||
"ontology_classes_count": len(self._ontology_classes),
|
||||
"ontology_class_infos_count": len(self._ontology_class_infos),
|
||||
"language": self.language
|
||||
})
|
||||
log_prompt_rendering("pruning-extract", rendered)
|
||||
@@ -377,6 +431,189 @@ class SemanticPruner:
|
||||
)
|
||||
return fallback_response
|
||||
|
||||
def _get_pruning_mode(self) -> str:
|
||||
"""根据 pruning_threshold 返回当前剪枝阶段。
|
||||
|
||||
- 低阈值 [0.0, 0.3):conservative 只删填充,保留所有实质内容
|
||||
- 中阈值 [0.3, 0.6):semantic 保留场景相关 + 有语义关联的内容,删除无关联内容
|
||||
- 高阈值 [0.6, 0.9]:strict 只保留场景相关内容,跨场景内容可被删除
|
||||
"""
|
||||
t = float(self.config.pruning_threshold)
|
||||
if t < 0.3:
|
||||
return "conservative"
|
||||
elif t < 0.6:
|
||||
return "semantic"
|
||||
else:
|
||||
return "strict"
|
||||
|
||||
def _apply_related_dialog_pruning(
    self,
    msgs: List[ConversationMessage],
    extraction: "DialogExtractionResponse",
    dialog_label: str,
    pruning_mode: str,
) -> List[ConversationMessage]:
    """Dispatch pruning of a scene-related dialogue according to the pruning stage.

    Args:
        msgs: Messages of the dialogue being pruned.
        extraction: LLM extraction result for the dialogue.
        dialog_label: Label used in the log lines emitted by the delegates.
        pruning_mode: Stage name; ``"conservative"`` removes filler only,
            any other value goes through the scene-aware filter.

    Returns:
        The messages that remain after pruning.
    """
    if pruning_mode != "conservative":
        return self._prune_with_scene_filter(msgs, extraction, dialog_label, pruning_mode)

    tokens_to_keep = self._build_preserve_tokens(extraction)
    return self._prune_fillers_only(msgs, tokens_to_keep, dialog_label)
|
||||
|
||||
def _prune_fillers_only(
    self,
    msgs: List[ConversationMessage],
    preserve_tokens: List[str],
    dialog_label: str,
) -> List[ConversationMessage]:
    """Drop filler messages from a scene-related dialogue and keep everything else.

    The number of deletions is not capped by ``pruning_threshold``: every
    message that ``_is_filler_message`` accepts is removed. Filler detection
    runs before the token check, so a filler message is removed even when it
    matches ``preserve_tokens``; the token match only produces a log line.

    Args:
        msgs: Messages of the dialogue.
        preserve_tokens: Fragments extracted by the LLM as important.
        dialog_label: Label used in the summary log line.

    Returns:
        The surviving messages in their original order. If every message was
        filler, the first message is returned so the dialogue is never empty.
    """
    survivors: List[ConversationMessage] = []
    for message in msgs:
        if self._is_filler_message(message):
            self._log(f" [填充] '{message.msg[:40]}' → 删除")
            continue
        if self._msg_matches_tokens(message, preserve_tokens):
            self._log(f" [保护] '{message.msg[:40]}' → LLM保护,跳过")
        survivors.append(message)

    # Never hand back an empty dialogue: fall back to the first message.
    if msgs and not survivors:
        survivors = [msgs[0]]

    removed = len(msgs) - len(survivors)
    self._log(
        f"[剪枝-相关] {dialog_label} 总消息={len(msgs)} "
        f"填充删除={removed} 保留={len(survivors)}"
    )
    return survivors
|
||||
|
||||
def _prune_with_scene_filter(
    self,
    msgs: List[ConversationMessage],
    extraction: "DialogExtractionResponse",
    dialog_label: str,
    mode: str,
) -> List[ConversationMessage]:
    """Scene-aware pruning used by the ``semantic`` and ``strict`` stages.

    Per-message decision order:

    1. Filler messages are always deleted.
    2. A message is "scene-unrelated" when any non-empty entry of
       ``extraction.scene_unrelated_snippets`` contains it or is contained in
       it (two-way substring match, because the LLM snippet may be shorter or
       longer than the original message). Messages that are not flagged are
       kept.
    3. Flagged messages in ``semantic`` mode are kept if they match the full
       preserve-token set or contain one of the built-in emotion words;
       otherwise they are deleted.
    4. Flagged messages in ``strict`` mode are deleted unless they match
       ``extraction.preserve_keywords``.

    In strict mode the structured tokens (times / ids / amounts / contacts /
    addresses) are only consulted for the diagnostic log of scene-related
    messages; they do not rescue a message flagged as scene-unrelated.

    Args:
        msgs: Messages of the dialogue.
        extraction: LLM extraction result for the dialogue.
        dialog_label: Label used in the summary log line.
        mode: ``"semantic"`` or ``"strict"``; with any other value flagged
            messages are neither deleted nor logged.

    Returns:
        Surviving messages in original order; the first message is returned
        when everything would otherwise be deleted.
    """
    if mode == "strict":
        scene_preserve_tokens = (
            extraction.times + extraction.ids + extraction.amounts +
            extraction.contacts + extraction.addresses
        )
    else:
        scene_preserve_tokens = self._build_preserve_tokens(extraction)

    unrelated_snippets = extraction.scene_unrelated_snippets or []

    # Words whose presence exempts a scene-unrelated message in semantic mode.
    emotion_words = (
        "感觉", "觉得", "心情", "开心", "难过", "高兴", "沮丧",
        "喜欢", "讨厌", "爱", "恨", "担心", "害怕", "兴奋",
        "压力", "累", "疲惫", "烦", "焦虑", "委屈", "感动",
    )

    survivors: List[ConversationMessage] = []
    for message in msgs:
        text = message.msg.strip()

        # Highest priority: filler never reaches the scene checks.
        if self._is_filler_message(message):
            self._log(f" [填充] '{text[:40]}' → 删除")
            continue

        flagged_unrelated = False
        for snippet in unrelated_snippets:
            if snippet and (snippet in text or text in snippet):
                flagged_unrelated = True
                break

        if not flagged_unrelated:
            # Scene-related: always kept; log only when a token matched.
            if self._msg_matches_tokens(message, scene_preserve_tokens):
                self._log(f" [保护] '{text[:40]}' → LLM保护,跳过")
            survivors.append(message)
            continue

        if mode == "strict":
            # The LLM contradicting itself (unrelated AND preserve-worthy) is
            # resolved in favour of keeping the message.
            if extraction.preserve_keywords and self._msg_matches_tokens(
                message, extraction.preserve_keywords
            ):
                self._log(f" [保护-情感] '{text[:40]}' → preserve_keywords 兜底保护,保留")
                survivors.append(message)
            else:
                self._log(f" [场景无关-严格] '{text[:40]}' → 删除")
            continue

        if mode == "semantic":
            if self._msg_matches_tokens(message, scene_preserve_tokens):
                self._log(f" [保护] '{text[:40]}' → 场景关键词保护,保留")
                survivors.append(message)
                continue
            if any(word in text for word in emotion_words):
                self._log(f" [场景关联-保留] '{text[:40]}' → 有情感关联,保留")
                survivors.append(message)
            else:
                self._log(f" [场景无关-语义] '{text[:40]}' → 删除(无情感关联)")
            continue

        # Unknown mode: flagged messages are left untouched.
        survivors.append(message)

    # Never hand back an empty dialogue: fall back to the first message.
    if msgs and not survivors:
        survivors = [msgs[0]]

    removed = len(msgs) - len(survivors)
    self._log(
        f"[剪枝-{mode}] {dialog_label} 总消息={len(msgs)} "
        f"删除={removed} 保留={len(survivors)}"
    )
    return survivors
|
||||
|
||||
def _build_preserve_tokens(self, extraction: "DialogExtractionResponse") -> List[str]:
|
||||
"""统一构建 preserve_tokens,合并 LLM 抽取的所有重要片段。"""
|
||||
return (
|
||||
extraction.times + extraction.ids + extraction.amounts +
|
||||
extraction.contacts + extraction.addresses + extraction.keywords +
|
||||
extraction.preserve_keywords
|
||||
)
|
||||
|
||||
def _msg_matches_tokens(self, message: ConversationMessage, tokens: List[str]) -> bool:
|
||||
"""判断消息是否包含任意抽取到的重要片段。"""
|
||||
if not tokens:
|
||||
@@ -397,16 +634,18 @@ class SemanticPruner:
|
||||
|
||||
proportion = float(self.config.pruning_threshold)
|
||||
extraction = await self._extract_dialog_important(dialog.content)
|
||||
pruning_mode = self._get_pruning_mode()
|
||||
self._log(f"[剪枝-模式] 阈值={proportion} → 模式={pruning_mode}")
|
||||
|
||||
if extraction.is_related:
|
||||
# 相关对话不剪枝
|
||||
kept = self._apply_related_dialog_pruning(
|
||||
dialog.context.msgs, extraction, f"对话ID={dialog.id}", pruning_mode
|
||||
)
|
||||
dialog.context = ConversationContext(msgs=kept)
|
||||
return dialog
|
||||
|
||||
# 在不相关对话中,LLM 已通过 preserve_tokens 标记需要保护的内容
|
||||
preserve_tokens = (
|
||||
extraction.times + extraction.ids + extraction.amounts +
|
||||
extraction.contacts + extraction.addresses + extraction.keywords +
|
||||
extraction.preserve_keywords
|
||||
)
|
||||
preserve_tokens = self._build_preserve_tokens(extraction)
|
||||
msgs = dialog.context.msgs
|
||||
|
||||
# 分类:填充 / 其他可删(LLM保护消息通过不加入任何桶来隐式保护)
|
||||
@@ -482,6 +721,9 @@ class SemanticPruner:
|
||||
f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断"
|
||||
)
|
||||
|
||||
pruning_mode = self._get_pruning_mode()
|
||||
self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}")
|
||||
|
||||
result: List[DialogData] = []
|
||||
total_original_msgs = 0
|
||||
total_deleted_msgs = 0
|
||||
@@ -505,12 +747,19 @@ class SemanticPruner:
|
||||
original_count = len(msgs)
|
||||
total_original_msgs += original_count
|
||||
|
||||
# 相关对话:根据阶段决定处理力度
|
||||
if extraction.is_related:
|
||||
kept = self._apply_related_dialog_pruning(
|
||||
msgs, extraction, f"对话 {d_idx+1}", pruning_mode
|
||||
)
|
||||
deleted_count = original_count - len(kept)
|
||||
total_deleted_msgs += deleted_count
|
||||
dd.context.msgs = kept
|
||||
result.append(dd)
|
||||
continue
|
||||
|
||||
# 从 LLM 抽取结果中获取所有需要保留的 token
|
||||
preserve_tokens = (
|
||||
extraction.times + extraction.ids + extraction.amounts +
|
||||
extraction.contacts + extraction.addresses + extraction.keywords +
|
||||
extraction.preserve_keywords # 情绪/兴趣/爱好关键词
|
||||
)
|
||||
preserve_tokens = self._build_preserve_tokens(extraction)
|
||||
|
||||
# 判断是否需要详细日志
|
||||
should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
|
||||
@@ -605,6 +854,18 @@ class SemanticPruner:
|
||||
|
||||
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
|
||||
|
||||
# 补充统计日志(供 _parse_logs_to_structured 正则解析)
|
||||
related_count = sum(1 for ex in extraction_results if ex.is_related)
|
||||
unrelated_count = len(dialogs) - related_count
|
||||
related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
|
||||
unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
|
||||
self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
|
||||
self._log(
|
||||
f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;"
|
||||
f"不相关对话:第[{', '.join(unrelated_indices)}]段"
|
||||
)
|
||||
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条")
|
||||
|
||||
# 保存日志
|
||||
try:
|
||||
from app.core.config import settings
|
||||
@@ -686,7 +947,7 @@ class SemanticPruner:
|
||||
re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
|
||||
re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
|
||||
re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段")
|
||||
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+)\s+分配删除=(\d+)\s+实删=(\d+)\s+保留=(\d+)")
|
||||
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
|
||||
re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
|
||||
re_remaining = re.compile(r"剩余对话数=(\d+)")
|
||||
|
||||
@@ -720,9 +981,8 @@ class SemanticPruner:
|
||||
dialogs.append({
|
||||
"index": parse_int(m.group(1)),
|
||||
"total_messages": parse_int(m.group(2)),
|
||||
"quota_delete": parse_int(m.group(3)),
|
||||
"actual_deleted": parse_int(m.group(4)),
|
||||
"kept": parse_int(m.group(5)),
|
||||
"deleted": parse_int(m.group(3)),
|
||||
"kept": parse_int(m.group(4)),
|
||||
})
|
||||
continue
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{#
|
||||
对话级抽取与相关性判定模板(用于剪枝加速)
|
||||
输入:pruning_scene, ontology_classes, dialog_text, language
|
||||
输入:pruning_scene, ontology_class_infos, dialog_text, language
|
||||
- ontology_class_infos: List[{class_name: str, class_description: str}]
|
||||
输出:严格 JSON(不要包含任何多余文本),字段:
|
||||
- is_related: bool,是否与所选场景相关
|
||||
- times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等)
|
||||
@@ -18,20 +19,16 @@
|
||||
#}
|
||||
|
||||
{# ── 确定场景说明 ── #}
|
||||
{% if ontology_classes and ontology_classes | length > 0 %}
|
||||
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
|
||||
{% if language == 'en' %}
|
||||
{% set custom_types_str = ontology_classes | join(', ') %}
|
||||
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %}
|
||||
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %}
|
||||
{% else %}
|
||||
{% set custom_types_str = ontology_classes | join('、') %}
|
||||
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %}
|
||||
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %}
|
||||
{% endif %}
|
||||
{% else %}
|
||||
{% if language == 'en' %}
|
||||
{% set custom_types_str = '' %}
|
||||
{% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
|
||||
{% else %}
|
||||
{% set custom_types_str = '' %}
|
||||
{% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
@@ -42,8 +39,17 @@
|
||||
2. 从对话中抽取所有需要保留的重要信息片段。
|
||||
|
||||
场景说明:{{ instruction }}
|
||||
{% if custom_types_str %}
|
||||
重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。
|
||||
|
||||
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
|
||||
【本场景实体类型定义】
|
||||
以下实体类型定义了本场景中哪些内容是重要的。
|
||||
凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段:
|
||||
|
||||
{% for info in ontology_class_infos %}
|
||||
- {{ info.class_name }}:{{ info.class_description }}
|
||||
{% endfor %}
|
||||
|
||||
重要提示:只要对话中出现与上述任意实体类型相关的内容,即判定为相关(is_related=true)。
|
||||
{% endif %}
|
||||
|
||||
---
|
||||
@@ -51,13 +57,40 @@
|
||||
以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段:
|
||||
- 时间信息:日期、时间点、时间段、有效期 → times 字段
|
||||
- 编号信息:学号、工号、订单号、申请号、账号、ID → ids 字段
|
||||
- 金额信息:价格、费用、金额(含货币符号或单位) → amounts 字段
|
||||
- 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段)
|
||||
- 联系方式:电话、手机号、邮箱、微信、QQ → contacts 字段
|
||||
- 地址信息:地点、地址、位置 → addresses 字段
|
||||
- 场景关键词:与场景强相关的专业术语、事件名称 → keywords 字段
|
||||
- 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段)
|
||||
- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
|
||||
- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
|
||||
- **个人观点与态度**:对某事物的明确看法、评价、立场 → preserve_keywords 字段
|
||||
- **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了")→ preserve_keywords 字段
|
||||
- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段
|
||||
|
||||
【场景无关内容标记】
|
||||
请从对话中识别出与当前场景({{ pruning_scene }})**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。
|
||||
判断标准:
|
||||
- 与场景实体类型完全无关
|
||||
- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联)
|
||||
- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等)
|
||||
注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。
|
||||
|
||||
**重要:scene_unrelated_snippets 必须认真填写;仅当对话中确实不存在与场景无关的内容时,才返回空数组 []。**
|
||||
如果对话中存在与场景无关的内容,必须将其原文片段提取出来。
|
||||
|
||||
示例(场景=在线教育):
|
||||
- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets
|
||||
- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets
|
||||
- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets
|
||||
- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets
|
||||
- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets
|
||||
- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets
|
||||
|
||||
示例(场景=情感陪伴):
|
||||
- "我最近心情很差,跟室友闹矛盾了" → 与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets
|
||||
- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets
|
||||
- "期末考试12月25日,3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets
|
||||
- "我上次高数作业87分,这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets
|
||||
- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets
|
||||
|
||||
【可以删除的内容】
|
||||
以下类型的内容属于低价值信息,可以在剪枝时删除:
|
||||
@@ -88,7 +121,8 @@
|
||||
"contacts": [<string>...],
|
||||
"addresses": [<string>...],
|
||||
"keywords": [<string>...],
|
||||
"preserve_keywords": [<string>...]
|
||||
"preserve_keywords": [<string>...],
|
||||
"scene_unrelated_snippets": [<string>...]
|
||||
}
|
||||
{% else %}
|
||||
You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
|
||||
@@ -96,8 +130,17 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue
|
||||
2. Extract all important information fragments that must be preserved.
|
||||
|
||||
Scenario Description: {{ instruction }}
|
||||
{% if custom_types_str %}
|
||||
Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true).
|
||||
|
||||
{% if ontology_class_infos and ontology_class_infos | length > 0 %}
|
||||
[Scene Entity Type Definitions]
|
||||
The following entity types define what content is important in this scene.
|
||||
Content related to ANY of these types must be preserved and extracted into the keywords field:
|
||||
|
||||
{% for info in ontology_class_infos %}
|
||||
- {{ info.class_name }}: {{ info.class_description }}
|
||||
{% endfor %}
|
||||
|
||||
Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true).
|
||||
{% endif %}
|
||||
|
||||
---
|
||||
@@ -105,13 +148,22 @@ Important: If the dialogue contains content related to any of the entity types a
|
||||
The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
|
||||
- Time information: dates, time points, durations, expiry dates → times field
|
||||
- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
|
||||
- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field
|
||||
- Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here)
|
||||
- Contact information: phone numbers, emails, WeChat, QQ → contacts field
|
||||
- Address information: locations, addresses, places → addresses field
|
||||
- Scene keywords: professional terms and event names strongly related to the scene → keywords field
|
||||
- Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here)
|
||||
- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
|
||||
- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
|
||||
- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field
|
||||
- **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field
|
||||
- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords
|
||||
|
||||
[Scene-Unrelated Content Marking]
|
||||
Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field.
|
||||
Criteria:
|
||||
- Completely unrelated to the scene's entity types
|
||||
- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated)
|
||||
- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene)
|
||||
Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully.
|
||||
|
||||
[CAN BE DELETED]
|
||||
The following types of content are low-value and can be removed during pruning:
|
||||
@@ -141,6 +193,7 @@ Output strict JSON only (fixed keys, order doesn't matter):
|
||||
"contacts": [<string>...],
|
||||
"addresses": [<string>...],
|
||||
"keywords": [<string>...],
|
||||
"preserve_keywords": [<string>...]
|
||||
"preserve_keywords": [<string>...],
|
||||
"scene_unrelated_snippets": [<string>...]
|
||||
}
|
||||
{% endif %}
|
||||
|
||||
@@ -417,7 +417,7 @@ class MemoryConfig:
|
||||
|
||||
# Ontology scene association
|
||||
scene_id: Optional[UUID] = None
|
||||
ontology_classes: Optional[list] = field(default=None)
|
||||
ontology_class_infos: list[dict] = field(default_factory=list)
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate configuration after initialization."""
|
||||
|
||||
@@ -107,28 +107,29 @@ def _validate_config_id(config_id, db: Session = None):
|
||||
)
|
||||
|
||||
|
||||
def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]:
|
||||
"""从 ontology_class 表加载场景类型名称列表,用于注入提示词。
|
||||
def _load_ontology_class_infos(db: Session, scene_id) -> list:
|
||||
"""从 ontology_class 表加载完整本体类型信息(name + description),用于注入剪枝提示词。
|
||||
|
||||
Args:
|
||||
db: 数据库会话
|
||||
scene_id: 本体场景 UUID
|
||||
pruning_scene: 语义剪枝场景名称(保留参数,暂未使用)
|
||||
|
||||
Returns:
|
||||
class_name 字符串列表,或 None(无数据时)
|
||||
[{"class_name": ..., "class_description": ...}, ...] 或空列表
|
||||
"""
|
||||
if not scene_id:
|
||||
return None
|
||||
return []
|
||||
try:
|
||||
from app.repositories.ontology_class_repository import OntologyClassRepository
|
||||
repo = OntologyClassRepository(db)
|
||||
classes = repo.get_classes_by_scene(scene_id)
|
||||
names = [c.class_name for c in classes if c.class_name]
|
||||
return names if names else None
|
||||
return [
|
||||
{"class_name": c.class_name, "class_description": c.class_description or ""}
|
||||
for c in classes if c.class_name
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load ontology classes for scene_id={scene_id}: {e}")
|
||||
return None
|
||||
logger.warning(f"Failed to load ontology class infos for scene_id={scene_id}: {e}")
|
||||
return []
|
||||
|
||||
|
||||
class MemoryConfigService:
|
||||
@@ -383,7 +384,7 @@ class MemoryConfigService:
|
||||
pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5,
|
||||
# Ontology scene association
|
||||
scene_id=memory_config.scene_id,
|
||||
ontology_classes=_load_ontology_classes(self.db, memory_config.scene_id, memory_config.pruning_scene),
|
||||
ontology_class_infos=_load_ontology_class_infos(self.db, memory_config.scene_id),
|
||||
)
|
||||
|
||||
elapsed_ms = (time.time() - start_time) * 1000
|
||||
@@ -550,11 +551,13 @@ class MemoryConfigService:
|
||||
- pruning_switch: bool
|
||||
- pruning_scene: str
|
||||
- pruning_threshold: float
|
||||
- ontology_class_infos: list of {class_name, class_description} dicts
|
||||
"""
|
||||
return {
|
||||
"pruning_switch": memory_config.pruning_enabled,
|
||||
"pruning_scene": memory_config.pruning_scene,
|
||||
"pruning_threshold": memory_config.pruning_threshold,
|
||||
"ontology_class_infos": memory_config.ontology_class_infos or [],
|
||||
}
|
||||
|
||||
def get_ontology_types(self, memory_config: MemoryConfig):
|
||||
|
||||
@@ -121,7 +121,7 @@ async def run_pilot_extraction(
|
||||
"pruning_scene": memory_config.pruning_scene,
|
||||
"pruning_threshold": memory_config.pruning_threshold,
|
||||
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
|
||||
"ontology_classes": memory_config.ontology_classes,
|
||||
"ontology_class_infos": memory_config.ontology_classes,
|
||||
}
|
||||
config = PruningConfig(**pruning_config_dict)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user