[changes] Optimize the semantic pruning judgment rules

This commit is contained in:
lanceyq
2026-03-19 11:49:59 +08:00
parent a07727c047
commit d12ad213e0
7 changed files with 393 additions and 65 deletions

View File

@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
pruning_scene=memory_config.pruning_scene or "education", pruning_scene=memory_config.pruning_scene or "education",
pruning_threshold=memory_config.pruning_threshold, pruning_threshold=memory_config.pruning_threshold,
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
ontology_classes=memory_config.ontology_classes, ontology_class_infos=memory_config.ontology_class_infos,  # MemoryConfig field was renamed; .ontology_classes no longer exists
) )
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}") logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")

View File

@@ -6,6 +6,7 @@ of the memory system including LLM, chunking, pruning, and search.
Classes: Classes:
LLMConfig: Configuration for LLM client LLMConfig: Configuration for LLM client
ChunkerConfig: Configuration for dialogue chunking ChunkerConfig: Configuration for dialogue chunking
OntologyClassInfo: Single ontology class with name and description
PruningConfig: Configuration for semantic pruning PruningConfig: Configuration for semantic pruning
TemporalSearchParams: Parameters for temporal search queries TemporalSearchParams: Parameters for temporal search queries
""" """
@@ -50,30 +51,41 @@ class ChunkerConfig(BaseModel):
min_characters_per_chunk: Optional[int] = Field(24, ge=0, description="The minimum number of characters in each chunk.") min_characters_per_chunk: Optional[int] = Field(24, ge=0, description="The minimum number of characters in each chunk.")
class OntologyClassInfo(BaseModel):
    """Name and semantic description of a single ontology class.

    Instances are injected into the pruning prompt.

    Attributes:
        class_name: Ontology class name (for example "患者" or "课程").
        class_description: Semantic description telling the LLM what the
            class means in the current scene; defaults to an empty string.
    """

    class_name: str = Field(..., description="本体类型名称")
    class_description: str = Field("", description="本体类型语义描述")
class PruningConfig(BaseModel): class PruningConfig(BaseModel):
"""Configuration for semantic pruning of dialogue content. """Configuration for semantic pruning of dialogue content.
Attributes: Attributes:
pruning_switch: Enable or disable semantic pruning pruning_switch: Enable or disable semantic pruning
pruning_scene: Scene name for pruning, either a built-in key pruning_scene: Scene name for pruning from ontology_scene table
('education', 'online_service', 'outbound') or a custom scene_name
from ontology_scene table
pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal) pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal)
scene_id: Optional ontology scene UUID, used to load custom ontology classes scene_id: Optional ontology scene UUID
ontology_classes: List of class_name strings from ontology_class table, ontology_class_infos: Full ontology class info (name + description) from
injected into the prompt when pruning_scene is not a built-in scene ontology_class table, injected into the pruning prompt to drive
scene-aware preservation decisions
""" """
pruning_switch: bool = Field(False, description="Enable semantic pruning when True.") pruning_switch: bool = Field(False, description="Enable semantic pruning when True.")
pruning_scene: str = Field( pruning_scene: str = Field(
"education", "education",
description="Scene for pruning: built-in key or custom scene_name from ontology_scene.", description="Scene name from ontology_scene table.",
) )
pruning_threshold: float = Field( pruning_threshold: float = Field(
0.5, ge=0.0, le=0.9, 0.5, ge=0.0, le=0.9,
description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).") description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).")
scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).") scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).")
ontology_classes: Optional[List[str]] = Field( ontology_class_infos: List[OntologyClassInfo] = Field(
None, description="Class names from ontology_class table for custom scenes." default_factory=list,
description="Full ontology class info (name + description) injected into pruning prompt."
) )

View File

@@ -20,7 +20,6 @@ from pydantic import BaseModel, Field
from app.core.memory.models.message_models import DialogData, ConversationMessage, ConversationContext from app.core.memory.models.message_models import DialogData, ConversationMessage, ConversationContext
from app.core.memory.models.config_models import PruningConfig from app.core.memory.models.config_models import PruningConfig
from app.core.memory.utils.config.config_utils import get_pruning_config
from app.core.memory.utils.prompt.prompt_utils import prompt_env, log_prompt_rendering, log_template_rendering from app.core.memory.utils.prompt.prompt_utils import prompt_env, log_prompt_rendering, log_template_rendering
from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import ( from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import (
SceneConfigRegistry, SceneConfigRegistry,
@@ -34,6 +33,8 @@ class DialogExtractionResponse(BaseModel):
- is_related对话与场景的相关性判定。 - is_related对话与场景的相关性判定。
- times / ids / amounts / contacts / addresses / keywords重要信息片段用来在不相关对话中保留关键消息。 - times / ids / amounts / contacts / addresses / keywords重要信息片段用来在不相关对话中保留关键消息。
- preserve_keywords情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。 - preserve_keywords情绪/兴趣/爱好/个人观点相关词,包含这些词的消息必须强制保留。
- scene_unrelated_snippets与当前场景无关且无语义关联的消息片段原文截取
用于高阈值阶段精准删除跨场景内容。
""" """
is_related: bool = Field(...) is_related: bool = Field(...)
times: List[str] = Field(default_factory=list) times: List[str] = Field(default_factory=list)
@@ -43,6 +44,7 @@ class DialogExtractionResponse(BaseModel):
addresses: List[str] = Field(default_factory=list) addresses: List[str] = Field(default_factory=list)
keywords: List[str] = Field(default_factory=list) keywords: List[str] = Field(default_factory=list)
preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留") preserve_keywords: List[str] = Field(default_factory=list, description="情绪/兴趣/爱好/个人观点相关词,包含这些词的消息强制保留")
scene_unrelated_snippets: List[str] = Field(default_factory=list,description="与当前场景无关且无语义关联的消息原文片段,高阈值阶段用于精准删除跨场景内容")
class MessageImportanceResponse(BaseModel): class MessageImportanceResponse(BaseModel):
@@ -91,12 +93,14 @@ class SemanticPruner:
# 加载统一填充词库 # 加载统一填充词库
self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene) self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(self.config.pruning_scene)
# 本体类型列表(用于注入提示词,所有场景均支持 # 本体类型列表:直接使用 ontology_class_infosname + description
self._ontology_classes = getattr(self.config, "ontology_classes", None) or [] self._ontology_class_infos = getattr(self.config, "ontology_class_infos", None) or []
# _ontology_classes 仅用于日志统计
self._ontology_classes = [info.class_name for info in self._ontology_class_infos]
self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}") self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene}")
if self._ontology_classes: if self._ontology_class_infos:
self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}") self._log(f"[剪枝-初始化] 注入本体类型({len(self._ontology_class_infos)}个): {self._ontology_classes}")
else: else:
self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词") self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词")
@@ -121,7 +125,8 @@ class SemanticPruner:
1. 空消息 1. 空消息
2. 场景特定填充词库精确匹配 2. 场景特定填充词库精确匹配
3. 常见寒暄精确匹配 3. 常见寒暄精确匹配
4. 纯表情/标点 4. 组合寒暄模式(前缀+后缀组合,如"好的谢谢""同学你好""明白了"
5. 纯表情/标点
""" """
t = message.msg.strip() t = message.msg.strip()
if not t: if not t:
@@ -143,6 +148,55 @@ class SemanticPruner:
if t in common_greetings: if t in common_greetings:
return True return True
# 组合寒暄模式短消息≤15字且完全由寒暄成分构成
# 策略:将消息拆分后,每个片段都能在填充词库或常见寒暄中找到,则整体为填充
if len(t) <= 15:
# 确认+称呼/感谢组合,如"好的谢谢"、"明白了"、"知道了谢谢"
_confirm_prefixes = {"好的", "", "", "嗯嗯", "", "明白", "明白了", "知道了", "了解", "收到", "没问题"}
_thanks_suffixes = {"谢谢", "谢谢你", "谢谢您", "多谢", "感谢", "谢了"}
_greeting_suffixes = {"你好", "您好", "老师好", "同学好", "大家好"}
_greeting_prefixes = {"同学", "老师", "您好", "你好"}
_close_patterns = {
"没有了", "没事了", "没问题了", "好了", "行了", "可以了",
"不用了", "不需要了", "就这样", "就这样吧", "那就这样",
}
_polite_responses = {
"不客气", "不用谢", "没关系", "没事", "应该的", "这是我应该做的",
}
# 规则1确认词 + 感谢词(如"好的谢谢"、"嗯谢谢"
for cp in _confirm_prefixes:
for ts in _thanks_suffixes:
if t == cp + ts or t == cp + "" + ts or t == cp + "" + ts:
return True
# 规则2称呼前缀 + 问候(如"同学你好"、"老师好"
for gp in _greeting_prefixes:
for gs in _greeting_suffixes:
if t == gp + gs or t.startswith(gp) and t.endswith(""):
return True
# 规则3结束语 + 感谢(如"没有了,谢谢老师"、"没有了谢谢"
for cp in _close_patterns:
if t.startswith(cp):
remainder = t[len(cp):].lstrip(",、 ")
if not remainder or any(remainder.startswith(ts) for ts in _thanks_suffixes):
return True
# 规则4礼貌回应如"不客气,祝你考试顺利"——前缀是礼貌词,后半是祝福套话)
for pr in _polite_responses:
if t.startswith(pr):
remainder = t[len(pr):].lstrip(",、 ")
# 后半是祝福/套话(不含实质信息)
if not remainder or re.match(r"^(祝|希望|期待|加油|顺利|好好|保重)", remainder):
return True
# 规则5纯确认词加"了"后缀(如"明白了"、"知道了"、"好了"
_confirm_base = {"明白", "知道", "了解", "收到", "", "", "可以", "没问题"}
for cb in _confirm_base:
if t == cb + "" or t == cb + "了。" or t == cb + "了!":
return True
# 检查是否为纯表情符号(方括号包裹) # 检查是否为纯表情符号(方括号包裹)
if re.fullmatch(r"(\[[^\]]+\])+", t): if re.fullmatch(r"(\[[^\]]+\])+", t):
return True return True
@@ -331,13 +385,13 @@ class SemanticPruner:
rendered = self.template.render( rendered = self.template.render(
pruning_scene=self.config.pruning_scene, pruning_scene=self.config.pruning_scene,
ontology_classes=self._ontology_classes, ontology_class_infos=self._ontology_class_infos,
dialog_text=dialog_text, dialog_text=dialog_text,
language=self.language language=self.language
) )
log_template_rendering("extracat_Pruning.jinja2", { log_template_rendering("extracat_Pruning.jinja2", {
"pruning_scene": self.config.pruning_scene, "pruning_scene": self.config.pruning_scene,
"ontology_classes_count": len(self._ontology_classes), "ontology_class_infos_count": len(self._ontology_class_infos),
"language": self.language "language": self.language
}) })
log_prompt_rendering("pruning-extract", rendered) log_prompt_rendering("pruning-extract", rendered)
@@ -377,6 +431,189 @@ class SemanticPruner:
) )
return fallback_response return fallback_response
def _get_pruning_mode(self) -> str:
    """Translate ``self.config.pruning_threshold`` into a pruning stage name.

    Returns:
        ``"conservative"`` when the threshold is below 0.3,
        ``"semantic"`` when it is below 0.6, and ``"strict"`` otherwise.
    """
    threshold = float(self.config.pruning_threshold)
    if threshold < 0.3:
        return "conservative"
    # Everything from 0.3 upwards is split at 0.6.
    return "semantic" if threshold < 0.6 else "strict"
def _apply_related_dialog_pruning(
    self,
    msgs: List[ConversationMessage],
    extraction: "DialogExtractionResponse",
    dialog_label: str,
    pruning_mode: str,
) -> List[ConversationMessage]:
    """Single entry point for pruning a dialog judged related to the scene.

    Args:
        msgs: Messages of the dialog.
        extraction: Dialog-level extraction result from the LLM.
        dialog_label: Label used in log lines to identify the dialog.
        pruning_mode: Stage name; ``"conservative"`` removes fillers only,
            any other value is forwarded to the scene-aware filter.

    Returns:
        The messages that survive pruning.
    """
    if pruning_mode != "conservative":
        return self._prune_with_scene_filter(msgs, extraction, dialog_label, pruning_mode)

    protected_tokens = self._build_preserve_tokens(extraction)
    return self._prune_fillers_only(msgs, protected_tokens, dialog_label)
def _prune_fillers_only(
    self,
    msgs: List[ConversationMessage],
    preserve_tokens: List[str],
    dialog_label: str,
) -> List[ConversationMessage]:
    """Drop filler messages from a related dialog and keep everything else.

    The pruning threshold is not consulted here: every message that
    ``_is_filler_message`` flags is removed. The filler check runs before
    the token check, so a filler is removed even when it also matches
    ``preserve_tokens``; the token match only produces a log line.

    Args:
        msgs: Messages of the dialog.
        preserve_tokens: Important fragments extracted by the LLM.
        dialog_label: Label used in log lines to identify the dialog.

    Returns:
        The non-filler messages in their original order; if that would be
        empty while ``msgs`` is not, the first message is returned instead.
    """
    survivors = []
    for message in msgs:
        if self._is_filler_message(message):
            self._log(f" [填充] '{message.msg[:40]}' → 删除")
        else:
            if self._msg_matches_tokens(message, preserve_tokens):
                self._log(f" [保护] '{message.msg[:40]}' → LLM保护跳过")
            survivors.append(message)

    # Never hand back an empty dialog: fall back to the first message.
    if msgs and not survivors:
        survivors = [msgs[0]]

    removed = len(msgs) - len(survivors)
    self._log(
        f"[剪枝-相关] {dialog_label} 总消息={len(msgs)} "
        f"填充删除={removed} 保留={len(survivors)}"
    )
    return survivors
def _prune_with_scene_filter(
    self,
    msgs: List[ConversationMessage],
    extraction: "DialogExtractionResponse",
    dialog_label: str,
    mode: str,
) -> List[ConversationMessage]:
    """Scene-aware pruning used by the ``semantic`` and ``strict`` stages.

    Decision order for every message:

    1. Filler messages (``_is_filler_message``) are always deleted.
    2. A message counts as scene-unrelated when any non-empty entry of
       ``extraction.scene_unrelated_snippets`` contains the stripped message
       text or is contained in it.
    3. Scene-unrelated messages:
       - ``strict``: deleted, unless the message matches
         ``extraction.preserve_keywords``.
       - ``semantic``: kept when the message matches the full preserve-token
         list or contains one of the emotion words below; deleted otherwise.
       - any other mode value: kept (no rule applies).
    4. All remaining messages are kept.

    Args:
        msgs: Messages of the dialog.
        extraction: Dialog-level extraction result from the LLM.
        dialog_label: Label used in log lines to identify the dialog.
        mode: Pruning stage name, expected to be ``"semantic"`` or ``"strict"``.

    Returns:
        The surviving messages in their original order. If everything would
        be deleted while ``msgs`` is not empty, the first message is kept.
    """
    # Words treated as emotional context in semantic mode.
    # NOTE(review): the previous list also held several empty-string entries
    # (possibly single-character words lost in transit). An empty string is a
    # substring of every message, which made the emotion check always succeed
    # and disabled deletion in semantic mode. Those entries are dropped here;
    # restore the intended words if they are known.
    emotion_words = (
        "感觉", "觉得", "心情", "开心", "难过", "高兴", "沮丧",
        "喜欢", "讨厌", "担心", "害怕", "兴奋",
        "压力", "疲惫", "焦虑", "委屈", "感动",
    )

    # strict protects only structured facts (time / id / amount / contact /
    # address); semantic protects every fragment the LLM extracted.
    if mode == "strict":
        scene_preserve_tokens = (
            extraction.times + extraction.ids + extraction.amounts +
            extraction.contacts + extraction.addresses
        )
    else:
        scene_preserve_tokens = self._build_preserve_tokens(extraction)

    unrelated_snippets = extraction.scene_unrelated_snippets or []

    to_delete_ids: set = set()
    for m in msgs:
        msg_text = m.msg.strip()

        # Highest priority: fillers are removed regardless of mode.
        if self._is_filler_message(m):
            to_delete_ids.add(id(m))
            self._log(f" [填充] '{msg_text[:40]}' → 删除")
            continue

        # Containment is checked in both directions because the snippet the
        # LLM returns may be shorter or longer than the stored message text.
        is_scene_unrelated = any(
            snip and (snip in msg_text or msg_text in snip)
            for snip in unrelated_snippets
        )

        if not is_scene_unrelated:
            # Scene-related message: always kept; log only when protected.
            if self._msg_matches_tokens(m, scene_preserve_tokens):
                self._log(f" [保护] '{msg_text[:40]}' → LLM保护跳过")
            continue

        if mode == "strict":
            # Only exception: the LLM also listed the message under
            # preserve_keywords, so the contradictory signal favours keeping.
            if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords):
                self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留")
            else:
                to_delete_ids.add(id(m))
                self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
        elif mode == "semantic":
            # First exemption: protected by an extracted token.
            if self._msg_matches_tokens(m, scene_preserve_tokens):
                self._log(f" [保护] '{msg_text[:40]}' → 场景关键词保护,保留")
            # Second exemption: the message carries an emotion word.
            elif any(word in msg_text for word in emotion_words):
                self._log(f" [场景关联-保留] '{msg_text[:40]}' → 有情感关联,保留")
            else:
                to_delete_ids.add(id(m))
                self._log(f" [场景无关-语义] '{msg_text[:40]}' → 删除(无情感关联)")

    kept = [m for m in msgs if id(m) not in to_delete_ids]
    # Never return an empty dialog: fall back to the first message.
    if not kept and msgs:
        kept = [msgs[0]]

    deleted = len(msgs) - len(kept)
    self._log(
        f"[剪枝-{mode}] {dialog_label} 总消息={len(msgs)} "
        f"删除={deleted} 保留={len(kept)}"
    )
    return kept
def _build_preserve_tokens(self, extraction: "DialogExtractionResponse") -> List[str]:
    """Merge every important fragment extracted by the LLM into one list.

    The result is a new list holding, in this order: times, ids, amounts,
    contacts, addresses, keywords and preserve_keywords.
    """
    merged = extraction.times + extraction.ids
    merged += extraction.amounts
    merged += extraction.contacts
    merged += extraction.addresses
    merged += extraction.keywords
    merged += extraction.preserve_keywords
    return merged
def _msg_matches_tokens(self, message: ConversationMessage, tokens: List[str]) -> bool: def _msg_matches_tokens(self, message: ConversationMessage, tokens: List[str]) -> bool:
"""判断消息是否包含任意抽取到的重要片段。""" """判断消息是否包含任意抽取到的重要片段。"""
if not tokens: if not tokens:
@@ -397,16 +634,18 @@ class SemanticPruner:
proportion = float(self.config.pruning_threshold) proportion = float(self.config.pruning_threshold)
extraction = await self._extract_dialog_important(dialog.content) extraction = await self._extract_dialog_important(dialog.content)
pruning_mode = self._get_pruning_mode()
self._log(f"[剪枝-模式] 阈值={proportion} → 模式={pruning_mode}")
if extraction.is_related: if extraction.is_related:
# 相关对话不剪枝 kept = self._apply_related_dialog_pruning(
dialog.context.msgs, extraction, f"对话ID={dialog.id}", pruning_mode
)
dialog.context = ConversationContext(msgs=kept)
return dialog return dialog
# 在不相关对话中LLM 已通过 preserve_tokens 标记需要保护的内容 # 在不相关对话中LLM 已通过 preserve_tokens 标记需要保护的内容
preserve_tokens = ( preserve_tokens = self._build_preserve_tokens(extraction)
extraction.times + extraction.ids + extraction.amounts +
extraction.contacts + extraction.addresses + extraction.keywords +
extraction.preserve_keywords
)
msgs = dialog.context.msgs msgs = dialog.context.msgs
# 分类:填充 / 其他可删LLM保护消息通过不加入任何桶来隐式保护 # 分类:填充 / 其他可删LLM保护消息通过不加入任何桶来隐式保护
@@ -482,6 +721,9 @@ class SemanticPruner:
f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断" f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断"
) )
pruning_mode = self._get_pruning_mode()
self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}")
result: List[DialogData] = [] result: List[DialogData] = []
total_original_msgs = 0 total_original_msgs = 0
total_deleted_msgs = 0 total_deleted_msgs = 0
@@ -505,12 +747,19 @@ class SemanticPruner:
original_count = len(msgs) original_count = len(msgs)
total_original_msgs += original_count total_original_msgs += original_count
# 相关对话:根据阶段决定处理力度
if extraction.is_related:
kept = self._apply_related_dialog_pruning(
msgs, extraction, f"对话 {d_idx+1}", pruning_mode
)
deleted_count = original_count - len(kept)
total_deleted_msgs += deleted_count
dd.context.msgs = kept
result.append(dd)
continue
# 从 LLM 抽取结果中获取所有需要保留的 token # 从 LLM 抽取结果中获取所有需要保留的 token
preserve_tokens = ( preserve_tokens = self._build_preserve_tokens(extraction)
extraction.times + extraction.ids + extraction.amounts +
extraction.contacts + extraction.addresses + extraction.keywords +
extraction.preserve_keywords # 情绪/兴趣/爱好关键词
)
# 判断是否需要详细日志 # 判断是否需要详细日志
should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
@@ -605,6 +854,18 @@ class SemanticPruner:
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}") self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
# 补充统计日志(供 _parse_logs_to_structured 正则解析)
related_count = sum(1 for ex in extraction_results if ex.is_related)
unrelated_count = len(dialogs) - related_count
related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
self._log(
f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;"
f"不相关对话:第[{', '.join(unrelated_indices)}]段"
)
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs}")
# 保存日志 # 保存日志
try: try:
from app.core.config import settings from app.core.config import settings
@@ -686,7 +947,7 @@ class SemanticPruner:
re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)") re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)") re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段") re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段")
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+)\s+分配删除=(\d+)\s+实删=(\d+)\s+保留=(\d+)") re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
re_total_del = re.compile(r"总删除\s+(\d+)\s+条") re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
re_remaining = re.compile(r"剩余对话数=(\d+)") re_remaining = re.compile(r"剩余对话数=(\d+)")
@@ -720,9 +981,8 @@ class SemanticPruner:
dialogs.append({ dialogs.append({
"index": parse_int(m.group(1)), "index": parse_int(m.group(1)),
"total_messages": parse_int(m.group(2)), "total_messages": parse_int(m.group(2)),
"quota_delete": parse_int(m.group(3)), "deleted": parse_int(m.group(3)),
"actual_deleted": parse_int(m.group(4)), "kept": parse_int(m.group(4)),
"kept": parse_int(m.group(5)),
}) })
continue continue

View File

@@ -1,6 +1,7 @@
{# {#
对话级抽取与相关性判定模板(用于剪枝加速) 对话级抽取与相关性判定模板(用于剪枝加速)
输入pruning_scene, ontology_classes, dialog_text, language 输入pruning_scene, ontology_class_infos, dialog_text, language
- ontology_class_infos: List[{class_name: str, class_description: str}]
输出:严格 JSON不要包含任何多余文本字段 输出:严格 JSON不要包含任何多余文本字段
- is_related: bool是否与所选场景相关 - is_related: bool是否与所选场景相关
- times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等) - times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等)
@@ -18,20 +19,16 @@
#} #}
{# ── 确定场景说明 ── #} {# ── 确定场景说明 ── #}
{% if ontology_classes and ontology_classes | length > 0 %} {% if ontology_class_infos and ontology_class_infos | length > 0 %}
{% if language == 'en' %} {% if language == 'en' %}
{% set custom_types_str = ontology_classes | join(', ') %} {% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is relevant if it involves any of the following entity types.' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %}
{% else %} {% else %}
{% set custom_types_str = ontology_classes | join('、') %} {% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关。' %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %}
{% endif %} {% endif %}
{% else %} {% else %}
{% if language == 'en' %} {% if language == 'en' %}
{% set custom_types_str = '' %}
{% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} {% set instruction = 'Scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %}
{% else %} {% else %}
{% set custom_types_str = '' %}
{% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} {% set instruction = '场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %}
{% endif %} {% endif %}
{% endif %} {% endif %}
@@ -42,8 +39,17 @@
2. 从对话中抽取所有需要保留的重要信息片段。 2. 从对话中抽取所有需要保留的重要信息片段。
场景说明:{{ instruction }} 场景说明:{{ instruction }}
{% if custom_types_str %}
重要提示:只要对话中出现与上述实体类型({{ custom_types_str }}相关的内容即判定为相关is_related=true {% if ontology_class_infos and ontology_class_infos | length > 0 %}
【本场景实体类型定义】
以下实体类型定义了本场景中哪些内容是重要的。
凡是与以下任意类型相关的内容,都必须保留,并将关键词/短语提取到 keywords 字段:
{% for info in ontology_class_infos %}
- {{ info.class_name }}{{ info.class_description }}
{% endfor %}
重要提示只要对话中出现与上述任意实体类型相关的内容即判定为相关is_related=true
{% endif %} {% endif %}
--- ---
@@ -51,13 +57,40 @@
以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段: 以下类型的内容无论是否与场景直接相关,都必须保留,请将其关键词/短语抽取到对应字段:
- 时间信息:日期、时间点、时间段、有效期 → times 字段 - 时间信息:日期、时间点、时间段、有效期 → times 字段
- 编号信息学号、工号、订单号、申请号、账号、ID → ids 字段 - 编号信息学号、工号、订单号、申请号、账号、ID → ids 字段
- 金额信息:价格、费用、金额(含货币符号或单位 → amounts 字段 - 金额信息:价格、费用、金额(含货币符号或单位,如"100元"、"¥200")→ amounts 字段(注意:考试分数、成绩分数不属于金额,不要放入此字段)
- 联系方式电话、手机号、邮箱、微信、QQ → contacts 字段 - 联系方式电话、手机号、邮箱、微信、QQ → contacts 字段
- 地址信息:地点、地址、位置 → addresses 字段 - 地址信息:地点、地址、位置 → addresses 字段
- 场景关键词:与场景强相关的专业术语、事件名称 → keywords 字段 - 场景关键词:与**当前场景**强相关的专业术语、事件名称 → keywords 字段(注意:只放与当前场景直接相关的词,跨场景的内容不要放入此字段)
- **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段 - **情绪与情感**:喜悦、悲伤、愤怒、焦虑、开心、难过、委屈、兴奋、害怕、担心、压力、感动等情绪表达 → preserve_keywords 字段
- **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段 - **兴趣与爱好**:喜欢、热爱、爱好、擅长、享受、沉迷、着迷、讨厌某事物等个人偏好表达 → preserve_keywords 字段
- **个人观点与态度**:对某事物的明确看法、评价、立场 → preserve_keywords 字段 - **个人情感态度**:对人际关系、情感状态的明确表达(如"我跟室友闹矛盾了"、"我都快抑郁了"→ preserve_keywords 字段
- 注意:学业目标(如"我想考研")、成绩(如"87分")、学科偏好(如"喜欢数学")属于学业信息,不属于情绪/情感,不要放入 preserve_keywords 字段
【场景无关内容标记】
请从对话中识别出与当前场景({{ pruning_scene }}**既不相关、也无语义关联**的消息片段,将其原文(或关键片段)提取到 scene_unrelated_snippets 字段。
判断标准:
- 与场景实体类型完全无关
- 与场景话题没有因果/时间/情境上的关联(例如:不是"因为上课所以累"这种关联)
- 纯粹是另一个话题的内容(如在教育场景中讨论购物、娱乐等)
注意:有情绪/感受表达的消息即使话题不同,也可能有语义关联,请谨慎标记。
**重要:scene_unrelated_snippets 必须认真填写;仅当对话中确实不存在与场景无关的内容时,才可返回空数组。**
如果对话中存在与场景无关的内容,必须将其原文片段提取出来。
示例(场景=在线教育):
- "我最近心情很差,跟室友闹矛盾了" → 与教育场景无关,加入 scene_unrelated_snippets
- "她总是很晚回来吵到我睡觉" → 与教育场景无关,加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与教育场景无关,加入 scene_unrelated_snippets
- "期末考试12月25日" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我上次高数作业87分" → 与教育场景相关,不加入 scene_unrelated_snippets
- "我的目标是考研" → 与教育场景相关,不加入 scene_unrelated_snippets
示例(场景=情感陪伴):
- "我最近心情很差,跟室友闹矛盾了" → 与情感陪伴场景相关(情绪+关系),不加入 scene_unrelated_snippets
- "对,我都快抑郁了" → 与情感陪伴场景相关(情绪),不加入 scene_unrelated_snippets
- "期末考试12月25日3号教学楼201室" → 与情感陪伴场景无关(教育信息),加入 scene_unrelated_snippets
- "我上次高数作业87分这次能考好吗" → 与情感陪伴场景无关(学业信息),加入 scene_unrelated_snippets
- "我的目标是考研,想读应用数学" → 与情感陪伴场景无关(学业目标),加入 scene_unrelated_snippets
【可以删除的内容】 【可以删除的内容】
以下类型的内容属于低价值信息,可以在剪枝时删除: 以下类型的内容属于低价值信息,可以在剪枝时删除:
@@ -88,7 +121,8 @@
"contacts": [<string>...], "contacts": [<string>...],
"addresses": [<string>...], "addresses": [<string>...],
"keywords": [<string>...], "keywords": [<string>...],
"preserve_keywords": [<string>...] "preserve_keywords": [<string>...],
"scene_unrelated_snippets": [<string>...]
} }
{% else %} {% else %}
You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks: You are a dialogue content analysis assistant. Please analyze the full dialogue below in one pass and complete two tasks:
@@ -96,8 +130,17 @@ You are a dialogue content analysis assistant. Please analyze the full dialogue
2. Extract all important information fragments that must be preserved. 2. Extract all important information fragments that must be preserved.
Scenario Description: {{ instruction }} Scenario Description: {{ instruction }}
{% if custom_types_str %}
Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). {% if ontology_class_infos and ontology_class_infos | length > 0 %}
[Scene Entity Type Definitions]
The following entity types define what content is important in this scene.
Content related to ANY of these types must be preserved and extracted into the keywords field:
{% for info in ontology_class_infos %}
- {{ info.class_name }}: {{ info.class_description }}
{% endfor %}
Important: If the dialogue contains content related to any of the entity types above, mark it as relevant (is_related=true).
{% endif %} {% endif %}
--- ---
@@ -105,13 +148,22 @@ Important: If the dialogue contains content related to any of the entity types a
The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields: The following types of content must always be preserved regardless of scene relevance. Extract their keywords/phrases into the corresponding fields:
- Time information: dates, time points, durations, expiry dates → times field - Time information: dates, time points, durations, expiry dates → times field
- ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field - ID information: student IDs, employee IDs, order numbers, application numbers, account IDs → ids field
- Amount information: prices, fees, amounts (with currency symbols or units) → amounts field - Amount information: prices, fees, amounts (with currency symbols or units, e.g., "$100", "¥200") → amounts field (Note: exam scores and grades are NOT amounts, do not put them here)
- Contact information: phone numbers, emails, WeChat, QQ → contacts field - Contact information: phone numbers, emails, WeChat, QQ → contacts field
- Address information: locations, addresses, places → addresses field - Address information: locations, addresses, places → addresses field
- Scene keywords: professional terms and event names strongly related to the scene → keywords field - Scene keywords: professional terms and event names strongly related to **the current scene** → keywords field (Note: only put terms directly related to the current scene; cross-scene content should not be placed here)
- **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field - **Emotions and feelings**: joy, sadness, anger, anxiety, happiness, sadness, excitement, fear, worry, stress, being moved, etc. → preserve_keywords field
- **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field - **Interests and hobbies**: likes, loves, hobbies, good at, enjoys, obsessed with, hates something, personal preferences → preserve_keywords field
- **Personal opinions and attitudes**: clear views, evaluations, or stances on something → preserve_keywords field - **Personal emotional attitudes**: clear expressions about interpersonal relationships or emotional states (e.g., "I had a fight with my roommate", "I'm almost depressed") → preserve_keywords field
- Note: Academic goals (e.g., "I want to pursue a master's degree"), grades (e.g., "87 points"), and subject preferences (e.g., "I like math") are academic information, NOT emotions/feelings — do not put them in preserve_keywords
[Scene-Unrelated Content Marking]
Please identify message snippets in the dialogue that are **neither relevant to nor semantically associated with** the current scene ({{ pruning_scene }}), and extract their original text (or key fragments) into the scene_unrelated_snippets field.
Criteria:
- Completely unrelated to the scene's entity types
- No causal/temporal/contextual association with the scene topic (e.g., "feeling tired because of class" IS associated)
- Purely belongs to a different topic (e.g., discussing shopping or entertainment in an education scene)
Note: Messages with emotional/feeling expressions may still have semantic association even if the topic differs — mark carefully.
[CAN BE DELETED] [CAN BE DELETED]
The following types of content are low-value and can be removed during pruning: The following types of content are low-value and can be removed during pruning:
@@ -141,6 +193,7 @@ Output strict JSON only (fixed keys, order doesn't matter):
"contacts": [<string>...], "contacts": [<string>...],
"addresses": [<string>...], "addresses": [<string>...],
"keywords": [<string>...], "keywords": [<string>...],
"preserve_keywords": [<string>...] "preserve_keywords": [<string>...],
"scene_unrelated_snippets": [<string>...]
} }
{% endif %} {% endif %}

View File

@@ -417,7 +417,7 @@ class MemoryConfig:
# Ontology scene association # Ontology scene association
scene_id: Optional[UUID] = None scene_id: Optional[UUID] = None
ontology_classes: Optional[list] = field(default=None) ontology_class_infos: list[dict] = field(default_factory=list)
def __post_init__(self): def __post_init__(self):
"""Validate configuration after initialization.""" """Validate configuration after initialization."""

View File

@@ -107,28 +107,29 @@ def _validate_config_id(config_id, db: Session = None):
) )
def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]: def _load_ontology_class_infos(db: Session, scene_id) -> list:
"""从 ontology_class 表加载场景类型名称列表,用于注入提示词。 """从 ontology_class 表加载完整本体类型信息name + description,用于注入剪枝提示词。
Args: Args:
db: 数据库会话 db: 数据库会话
scene_id: 本体场景 UUID scene_id: 本体场景 UUID
pruning_scene: 语义剪枝场景名称(保留参数,暂未使用)
Returns: Returns:
class_name 字符串列表,或 None无数据时 [{"class_name": ..., "class_description": ...}, ...] 或空列表
""" """
if not scene_id: if not scene_id:
return None return []
try: try:
from app.repositories.ontology_class_repository import OntologyClassRepository from app.repositories.ontology_class_repository import OntologyClassRepository
repo = OntologyClassRepository(db) repo = OntologyClassRepository(db)
classes = repo.get_classes_by_scene(scene_id) classes = repo.get_classes_by_scene(scene_id)
names = [c.class_name for c in classes if c.class_name] return [
return names if names else None {"class_name": c.class_name, "class_description": c.class_description or ""}
for c in classes if c.class_name
]
except Exception as e: except Exception as e:
logger.warning(f"Failed to load ontology classes for scene_id={scene_id}: {e}") logger.warning(f"Failed to load ontology class infos for scene_id={scene_id}: {e}")
return None return []
class MemoryConfigService: class MemoryConfigService:
@@ -383,7 +384,7 @@ class MemoryConfigService:
pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5, pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5,
# Ontology scene association # Ontology scene association
scene_id=memory_config.scene_id, scene_id=memory_config.scene_id,
ontology_classes=_load_ontology_classes(self.db, memory_config.scene_id, memory_config.pruning_scene), ontology_class_infos=_load_ontology_class_infos(self.db, memory_config.scene_id),
) )
elapsed_ms = (time.time() - start_time) * 1000 elapsed_ms = (time.time() - start_time) * 1000
@@ -550,11 +551,13 @@ class MemoryConfigService:
- pruning_switch: bool - pruning_switch: bool
- pruning_scene: str - pruning_scene: str
- pruning_threshold: float - pruning_threshold: float
- ontology_class_infos: list of {class_name, class_description} dicts
""" """
return { return {
"pruning_switch": memory_config.pruning_enabled, "pruning_switch": memory_config.pruning_enabled,
"pruning_scene": memory_config.pruning_scene, "pruning_scene": memory_config.pruning_scene,
"pruning_threshold": memory_config.pruning_threshold, "pruning_threshold": memory_config.pruning_threshold,
"ontology_class_infos": memory_config.ontology_class_infos or [],
} }
def get_ontology_types(self, memory_config: MemoryConfig): def get_ontology_types(self, memory_config: MemoryConfig):

View File

@@ -121,7 +121,7 @@ async def run_pilot_extraction(
"pruning_scene": memory_config.pruning_scene, "pruning_scene": memory_config.pruning_scene,
"pruning_threshold": memory_config.pruning_threshold, "pruning_threshold": memory_config.pruning_threshold,
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
"ontology_classes": memory_config.ontology_classes, "ontology_class_infos": memory_config.ontology_classes,
} }
config = PruningConfig(**pruning_config_dict) config = PruningConfig(**pruning_config_dict)