[changes] Remove regex-based log parsing and apply exemption-free strict pruning rules.

This commit is contained in:
lanceyq
2026-03-19 12:26:16 +08:00
parent d12ad213e0
commit c2fc4ab4ff
3 changed files with 74 additions and 156 deletions

View File

@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
pruning_scene=memory_config.pruning_scene or "education",
pruning_threshold=memory_config.pruning_threshold,
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
ontology_class_infos=memory_config.ontology_classes,
ontology_class_infos=memory_config.ontology_class_infos,
)
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")

View File

@@ -523,12 +523,10 @@ class SemanticPruner:
3. 两次豁免均未命中 → 删除
第三层strict阈值 [0.6, 0.9]
保留标准:场景相关性优先,豁免权极度收窄
保留标准:场景相关性优先,无任何豁免。
- 填充消息 → 删除(最高优先级)
- 场景相关消息 → 保留
- 场景无关消息 → 直接删除,仅保留一个例外:
LLM 同时将该消息放入 preserve_keywords自相矛盾时以情感标记为准→ 保留
注意strict 模式下情感词兜底不再生效,场景相关性是最终裁决标准。
- 场景无关消息 → 直接删除,preserve_keywords 和情感词在此模式下均不生效
至少保留 1 条消息(兜底取第一条)。
"""
@@ -563,14 +561,10 @@ class SemanticPruner:
if is_scene_unrelated:
if mode == "strict":
# strict场景无关 → 删除
# 唯一例外LLM 同时将该消息标记为 preserve_keywords
# 说明 LLM 自相矛盾(既认为场景无关又认为值得保留),以 preserve_keywords 为准
if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords):
self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留")
else:
to_delete_ids.add(id(m))
self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
# strict场景无关直接删除,不做任何豁免
# 场景相关性是唯一裁决标准,preserve_keywords 在此模式下不生效
to_delete_ids.add(id(m))
self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
elif mode == "semantic":
# semantic场景无关但有内容价值 → 保留
# 豁免第一层:命中 scene_preserve_tokens关键词/结构化信息保护)
@@ -720,14 +714,30 @@ class SemanticPruner:
self._log(
f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断"
)
pruning_mode = self._get_pruning_mode()
self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}")
result: List[DialogData] = []
total_original_msgs = 0
total_deleted_msgs = 0
# 统计对象:直接收集结构化数据,无需事后正则解析
stats = {
"scene": self.config.pruning_scene,
"dialog_total": len(dialogs),
"deletion_ratio": proportion,
"enabled": self.config.pruning_switch,
"pruning_mode": pruning_mode,
"related_count": 0,
"unrelated_count": 0,
"related_indices": [],
"unrelated_indices": [],
"total_deleted_messages": 0,
"remaining_dialogs": 0,
"dialogs": [],
}
# 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息)
semaphore = asyncio.Semaphore(self.max_concurrent)
@@ -749,6 +759,8 @@ class SemanticPruner:
# 相关对话:根据阶段决定处理力度
if extraction.is_related:
stats["related_count"] += 1
stats["related_indices"].append(d_idx)
kept = self._apply_related_dialog_pruning(
msgs, extraction, f"对话 {d_idx+1}", pruning_mode
)
@@ -756,8 +768,18 @@ class SemanticPruner:
total_deleted_msgs += deleted_count
dd.context.msgs = kept
result.append(dd)
stats["dialogs"].append({
"index": d_idx + 1,
"is_related": True,
"total_messages": original_count,
"deleted": deleted_count,
"kept": len(kept),
})
continue
stats["unrelated_count"] += 1
stats["unrelated_indices"].append(d_idx)
# 从 LLM 抽取结果中获取所有需要保留的 token
preserve_tokens = self._build_preserve_tokens(extraction)
@@ -792,16 +814,16 @@ class SemanticPruner:
# important_msgs 仅用于日志统计
important_msgs = llm_protected_msgs
# 计算删除配额
delete_target = int(original_count * proportion)
if proportion > 0 and original_count > 0 and delete_target == 0:
delete_target = 1
# 确保至少保留1条消息
max_deletable = max(0, original_count - 1)
delete_target = min(delete_target, max_deletable)
# 删除策略:优先删填充消息,再按出现顺序删其余可删消息
to_delete_indices = set()
deleted_details = []
@@ -819,62 +841,65 @@ class SemanticPruner:
break
to_delete_indices.add(idx)
deleted_details.append(f"[{idx}] 可删: '{msg.msg[:50]}'")
# 执行删除
kept_msgs = []
for idx, m in enumerate(msgs):
if idx not in to_delete_indices:
kept_msgs.append(m)
# 确保至少保留1条
if not kept_msgs and msgs:
kept_msgs = [msgs[0]]
dd.context.msgs = kept_msgs
deleted_count = original_count - len(kept_msgs)
total_deleted_msgs += deleted_count
# 输出删除详情
if deleted_details:
self._log(f"[剪枝-删除详情] 对话 {d_idx+1} 删除了以下消息:")
for detail in deleted_details:
self._log(f" {detail}")
# ========== 问答对统计(已注释) ==========
# qa_info = f",问答对={len(qa_pairs)}" if qa_pairs else ""
# ========================================
self._log(
f"[剪枝-对话] 对话 {d_idx+1} 总消息={original_count} "
f"(保护={len(important_msgs)} 填充={len(filler_msgs)} 可删={len(deletable_msgs)}) "
f"删除={deleted_count} 保留={len(kept_msgs)}"
)
result.append(dd)
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
# 补充统计日志(供 _parse_logs_to_structured 正则解析)
related_count = sum(1 for ex in extraction_results if ex.is_related)
unrelated_count = len(dialogs) - related_count
related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
self._log(
f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;"
f"不相关对话:第[{', '.join(unrelated_indices)}]段"
)
stats["dialogs"].append({
"index": d_idx + 1,
"is_related": False,
"total_messages": original_count,
"protected": len(important_msgs),
"fillers": len(filler_msgs),
"deletable": len(deletable_msgs),
"deleted": deleted_count,
"kept": len(kept_msgs),
})
result.append(dd)
# 补全统计对象
stats["total_deleted_messages"] = total_deleted_msgs
stats["remaining_dialogs"] = len(result)
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
self._log(f"[剪枝-数据集] 相关对话数={stats['related_count']} 不相关对话数={stats['unrelated_count']}")
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs}")
# 保存日志
# 直接序列化统计对象,无需正则解析
try:
from app.core.config import settings
settings.ensure_memory_output_dir()
log_output_path = settings.get_memory_output_path("pruned_terminal.json")
sanitized_logs = [self._sanitize_log_line(l) for l in self.run_logs]
payload = self._parse_logs_to_structured(sanitized_logs)
with open(log_output_path, "w", encoding="utf-8") as f:
json.dump(payload, f, ensure_ascii=False, indent=2)
json.dump(stats, f, ensure_ascii=False, indent=2)
except Exception as e:
self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}")
@@ -882,7 +907,7 @@ class SemanticPruner:
if not result:
print("警告: 语义剪枝后数据集为空,已回退为未剪枝数据以避免流程中断")
return dialogs
return result
def _log(self, msg: str) -> None:
@@ -894,113 +919,4 @@ class SemanticPruner:
pass
print(msg)
def _sanitize_log_line(self, line: str) -> str:
"""移除行首的方括号标签前缀,例如 [剪枝-数据集] 或 [剪枝-对话]。"""
try:
return re.sub(r"^\[[^\]]+\]\s*", "", line)
except Exception:
return line
def _parse_logs_to_structured(self, logs: List[str]) -> dict:
"""将已去前缀的日志列表解析为结构化 JSON便于数据对接。"""
summary = {
"scene": self.config.pruning_scene,
"dialog_total": None,
"deletion_ratio": None,
"enabled": None,
"related_count": None,
"unrelated_count": None,
"related_indices": [],
"unrelated_indices": [],
"total_deleted_messages": None,
"remaining_dialogs": None,
}
dialogs = []
# 解析函数
def parse_int(value: str) -> Optional[int]:
try:
return int(value)
except Exception:
return None
def parse_float(value: str) -> Optional[float]:
try:
return float(value)
except Exception:
return None
def parse_indices(s: str) -> List[int]:
s = s.strip()
if not s:
return []
parts = [p.strip() for p in s.split(",") if p.strip()]
out: List[int] = []
for p in parts:
try:
out.append(int(p))
except Exception:
pass
return out
# 正则
re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段")
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
re_remaining = re.compile(r"剩余对话数=(\d+)")
for line in logs:
# 第一行:总览
m = re_header.search(line)
if m:
summary["dialog_total"] = parse_int(m.group(1))
# 顶层 scene 依配置,这里不覆盖,但也可校验 m.group(2)
summary["deletion_ratio"] = parse_float(m.group(3))
summary["enabled"] = True if m.group(4) == "True" else False
continue
# 第二行:相关/不相关数量
m = re_counts.search(line)
if m:
summary["related_count"] = parse_int(m.group(1))
summary["unrelated_count"] = parse_int(m.group(2))
continue
# 第三行:相关/不相关索引
m = re_indices.search(line)
if m:
summary["related_indices"] = parse_indices(m.group(1))
summary["unrelated_indices"] = parse_indices(m.group(2))
continue
# 对话级统计
m = re_dialog.search(line)
if m:
dialogs.append({
"index": parse_int(m.group(1)),
"total_messages": parse_int(m.group(2)),
"deleted": parse_int(m.group(3)),
"kept": parse_int(m.group(4)),
})
continue
# 全局删除总数
m = re_total_del.search(line)
if m:
summary["total_deleted_messages"] = parse_int(m.group(1))
continue
# 剩余对话数
m = re_remaining.search(line)
if m:
summary["remaining_dialogs"] = parse_int(m.group(1))
continue
return {
"scene": summary["scene"],
"timestamp": datetime.now().isoformat(),
"summary": {k: v for k, v in summary.items() if k != "scene"},
"dialogs": dialogs,
}

View File

@@ -121,7 +121,7 @@ async def run_pilot_extraction(
"pruning_scene": memory_config.pruning_scene,
"pruning_threshold": memory_config.pruning_threshold,
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
"ontology_class_infos": memory_config.ontology_classes,
"ontology_class_infos": memory_config.ontology_class_infos,
}
config = PruningConfig(**pruning_config_dict)
@@ -232,9 +232,11 @@ async def run_pilot_extraction(
"chunker_strategy": memory_config.chunker_strategy,
}
# 添加剪枝统计信息
if pruning_stats:
preprocessing_summary["pruning"] = pruning_stats
# 添加剪枝统计信息(始终包含 pruning 字段,确保前端不会因字段缺失报错)
preprocessing_summary["pruning"] = pruning_stats if pruning_stats else {
"enabled": memory_config.pruning_enabled,
"deleted_count": 0,
}
await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)