[changes] Remove regex-based log parsing and apply strict pruning rules.

This commit is contained in:
lanceyq
2026-03-19 12:26:16 +08:00
parent d12ad213e0
commit c2fc4ab4ff
3 changed files with 74 additions and 156 deletions

View File

@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
pruning_scene=memory_config.pruning_scene or "education", pruning_scene=memory_config.pruning_scene or "education",
pruning_threshold=memory_config.pruning_threshold, pruning_threshold=memory_config.pruning_threshold,
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
ontology_class_infos=memory_config.ontology_classes, ontology_class_infos=memory_config.ontology_class_infos,
) )
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}") logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")

View File

@@ -523,12 +523,10 @@ class SemanticPruner:
3. 两次豁免均未命中 → 删除 3. 两次豁免均未命中 → 删除
第三层strict阈值 [0.6, 0.9] 第三层strict阈值 [0.6, 0.9]
保留标准:场景相关性优先,豁免权极度收窄 保留标准:场景相关性优先,无任何豁免。
- 填充消息 → 删除(最高优先级) - 填充消息 → 删除(最高优先级)
- 场景相关消息 → 保留 - 场景相关消息 → 保留
- 场景无关消息 → 直接删除,仅保留一个例外: - 场景无关消息 → 直接删除,preserve_keywords 和情感词在此模式下均不生效
LLM 同时将该消息放入 preserve_keywords自相矛盾时以情感标记为准→ 保留
注意strict 模式下情感词兜底不再生效,场景相关性是最终裁决标准。
至少保留 1 条消息(兜底取第一条)。 至少保留 1 条消息(兜底取第一条)。
""" """
@@ -563,14 +561,10 @@ class SemanticPruner:
if is_scene_unrelated: if is_scene_unrelated:
if mode == "strict": if mode == "strict":
# strict场景无关 → 删除 # strict场景无关直接删除,不做任何豁免
# 唯一例外LLM 同时将该消息标记为 preserve_keywords # 场景相关性是唯一裁决标准,preserve_keywords 在此模式下不生效
# 说明 LLM 自相矛盾(既认为场景无关又认为值得保留),以 preserve_keywords 为准 to_delete_ids.add(id(m))
if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords): self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留")
else:
to_delete_ids.add(id(m))
self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
elif mode == "semantic": elif mode == "semantic":
# semantic场景无关但有内容价值 → 保留 # semantic场景无关但有内容价值 → 保留
# 豁免第一层:命中 scene_preserve_tokens关键词/结构化信息保护) # 豁免第一层:命中 scene_preserve_tokens关键词/结构化信息保护)
@@ -728,6 +722,22 @@ class SemanticPruner:
total_original_msgs = 0 total_original_msgs = 0
total_deleted_msgs = 0 total_deleted_msgs = 0
# 统计对象:直接收集结构化数据,无需事后正则解析
stats = {
"scene": self.config.pruning_scene,
"dialog_total": len(dialogs),
"deletion_ratio": proportion,
"enabled": self.config.pruning_switch,
"pruning_mode": pruning_mode,
"related_count": 0,
"unrelated_count": 0,
"related_indices": [],
"unrelated_indices": [],
"total_deleted_messages": 0,
"remaining_dialogs": 0,
"dialogs": [],
}
# 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息) # 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息)
semaphore = asyncio.Semaphore(self.max_concurrent) semaphore = asyncio.Semaphore(self.max_concurrent)
@@ -749,6 +759,8 @@ class SemanticPruner:
# 相关对话:根据阶段决定处理力度 # 相关对话:根据阶段决定处理力度
if extraction.is_related: if extraction.is_related:
stats["related_count"] += 1
stats["related_indices"].append(d_idx)
kept = self._apply_related_dialog_pruning( kept = self._apply_related_dialog_pruning(
msgs, extraction, f"对话 {d_idx+1}", pruning_mode msgs, extraction, f"对话 {d_idx+1}", pruning_mode
) )
@@ -756,8 +768,18 @@ class SemanticPruner:
total_deleted_msgs += deleted_count total_deleted_msgs += deleted_count
dd.context.msgs = kept dd.context.msgs = kept
result.append(dd) result.append(dd)
stats["dialogs"].append({
"index": d_idx + 1,
"is_related": True,
"total_messages": original_count,
"deleted": deleted_count,
"kept": len(kept),
})
continue continue
stats["unrelated_count"] += 1
stats["unrelated_indices"].append(d_idx)
# 从 LLM 抽取结果中获取所有需要保留的 token # 从 LLM 抽取结果中获取所有需要保留的 token
preserve_tokens = self._build_preserve_tokens(extraction) preserve_tokens = self._build_preserve_tokens(extraction)
@@ -850,31 +872,34 @@ class SemanticPruner:
f"删除={deleted_count} 保留={len(kept_msgs)}" f"删除={deleted_count} 保留={len(kept_msgs)}"
) )
stats["dialogs"].append({
"index": d_idx + 1,
"is_related": False,
"total_messages": original_count,
"protected": len(important_msgs),
"fillers": len(filler_msgs),
"deletable": len(deletable_msgs),
"deleted": deleted_count,
"kept": len(kept_msgs),
})
result.append(dd) result.append(dd)
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}") # 补全统计对象
stats["total_deleted_messages"] = total_deleted_msgs
stats["remaining_dialogs"] = len(result)
# 补充统计日志(供 _parse_logs_to_structured 正则解析) self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
related_count = sum(1 for ex in extraction_results if ex.is_related) self._log(f"[剪枝-数据集] 相关对话数={stats['related_count']} 不相关对话数={stats['unrelated_count']}")
unrelated_count = len(dialogs) - related_count
related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
self._log(
f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;"
f"不相关对话:第[{', '.join(unrelated_indices)}]段"
)
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs}") self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs}")
# 保存日志 # 直接序列化统计对象,无需正则解析
try: try:
from app.core.config import settings from app.core.config import settings
settings.ensure_memory_output_dir() settings.ensure_memory_output_dir()
log_output_path = settings.get_memory_output_path("pruned_terminal.json") log_output_path = settings.get_memory_output_path("pruned_terminal.json")
sanitized_logs = [self._sanitize_log_line(l) for l in self.run_logs]
payload = self._parse_logs_to_structured(sanitized_logs)
with open(log_output_path, "w", encoding="utf-8") as f: with open(log_output_path, "w", encoding="utf-8") as f:
json.dump(payload, f, ensure_ascii=False, indent=2) json.dump(stats, f, ensure_ascii=False, indent=2)
except Exception as e: except Exception as e:
self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}") self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}")
@@ -894,113 +919,4 @@ class SemanticPruner:
pass pass
print(msg) print(msg)
def _sanitize_log_line(self, line: str) -> str:
"""移除行首的方括号标签前缀,例如 [剪枝-数据集] 或 [剪枝-对话]。"""
try:
return re.sub(r"^\[[^\]]+\]\s*", "", line)
except Exception:
return line
def _parse_logs_to_structured(self, logs: List[str]) -> dict:
"""将已去前缀的日志列表解析为结构化 JSON便于数据对接。"""
summary = {
"scene": self.config.pruning_scene,
"dialog_total": None,
"deletion_ratio": None,
"enabled": None,
"related_count": None,
"unrelated_count": None,
"related_indices": [],
"unrelated_indices": [],
"total_deleted_messages": None,
"remaining_dialogs": None,
}
dialogs = []
# 解析函数
def parse_int(value: str) -> Optional[int]:
try:
return int(value)
except Exception:
return None
def parse_float(value: str) -> Optional[float]:
try:
return float(value)
except Exception:
return None
def parse_indices(s: str) -> List[int]:
s = s.strip()
if not s:
return []
parts = [p.strip() for p in s.split(",") if p.strip()]
out: List[int] = []
for p in parts:
try:
out.append(int(p))
except Exception:
pass
return out
# 正则
re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段")
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
re_remaining = re.compile(r"剩余对话数=(\d+)")
for line in logs:
# 第一行:总览
m = re_header.search(line)
if m:
summary["dialog_total"] = parse_int(m.group(1))
# 顶层 scene 依配置,这里不覆盖,但也可校验 m.group(2)
summary["deletion_ratio"] = parse_float(m.group(3))
summary["enabled"] = True if m.group(4) == "True" else False
continue
# 第二行:相关/不相关数量
m = re_counts.search(line)
if m:
summary["related_count"] = parse_int(m.group(1))
summary["unrelated_count"] = parse_int(m.group(2))
continue
# 第三行:相关/不相关索引
m = re_indices.search(line)
if m:
summary["related_indices"] = parse_indices(m.group(1))
summary["unrelated_indices"] = parse_indices(m.group(2))
continue
# 对话级统计
m = re_dialog.search(line)
if m:
dialogs.append({
"index": parse_int(m.group(1)),
"total_messages": parse_int(m.group(2)),
"deleted": parse_int(m.group(3)),
"kept": parse_int(m.group(4)),
})
continue
# 全局删除总数
m = re_total_del.search(line)
if m:
summary["total_deleted_messages"] = parse_int(m.group(1))
continue
# 剩余对话数
m = re_remaining.search(line)
if m:
summary["remaining_dialogs"] = parse_int(m.group(1))
continue
return {
"scene": summary["scene"],
"timestamp": datetime.now().isoformat(),
"summary": {k: v for k, v in summary.items() if k != "scene"},
"dialogs": dialogs,
}

View File

@@ -121,7 +121,7 @@ async def run_pilot_extraction(
"pruning_scene": memory_config.pruning_scene, "pruning_scene": memory_config.pruning_scene,
"pruning_threshold": memory_config.pruning_threshold, "pruning_threshold": memory_config.pruning_threshold,
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
"ontology_class_infos": memory_config.ontology_classes, "ontology_class_infos": memory_config.ontology_class_infos,
} }
config = PruningConfig(**pruning_config_dict) config = PruningConfig(**pruning_config_dict)
@@ -232,9 +232,11 @@ async def run_pilot_extraction(
"chunker_strategy": memory_config.chunker_strategy, "chunker_strategy": memory_config.chunker_strategy,
} }
# 添加剪枝统计信息 # 添加剪枝统计信息(始终包含 pruning 字段,确保前端不会因字段缺失报错)
if pruning_stats: preprocessing_summary["pruning"] = pruning_stats if pruning_stats else {
preprocessing_summary["pruning"] = pruning_stats "enabled": memory_config.pruning_enabled,
"deleted_count": 0,
}
await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary) await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)