[changes] Remove regex-based log parsing (collect pruning stats directly) and apply strict pruning rules without exemptions.
This commit is contained in:
@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
|
|||||||
pruning_scene=memory_config.pruning_scene or "education",
|
pruning_scene=memory_config.pruning_scene or "education",
|
||||||
pruning_threshold=memory_config.pruning_threshold,
|
pruning_threshold=memory_config.pruning_threshold,
|
||||||
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
|
scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
|
||||||
ontology_class_infos=memory_config.ontology_classes,
|
ontology_class_infos=memory_config.ontology_class_infos,
|
||||||
)
|
)
|
||||||
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")
|
logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")
|
||||||
|
|
||||||
|
|||||||
@@ -523,12 +523,10 @@ class SemanticPruner:
|
|||||||
3. 两次豁免均未命中 → 删除
|
3. 两次豁免均未命中 → 删除
|
||||||
|
|
||||||
第三层(strict,阈值 [0.6, 0.9]):
|
第三层(strict,阈值 [0.6, 0.9]):
|
||||||
保留标准:场景相关性优先,豁免权极度收窄。
|
保留标准:场景相关性优先,无任何豁免。
|
||||||
- 填充消息 → 删除(最高优先级)
|
- 填充消息 → 删除(最高优先级)
|
||||||
- 场景相关消息 → 保留
|
- 场景相关消息 → 保留
|
||||||
- 场景无关消息 → 直接删除,仅保留一个例外:
|
- 场景无关消息 → 直接删除,preserve_keywords 和情感词在此模式下均不生效
|
||||||
LLM 同时将该消息放入 preserve_keywords(自相矛盾时以情感标记为准)→ 保留
|
|
||||||
注意:strict 模式下情感词兜底不再生效,场景相关性是最终裁决标准。
|
|
||||||
|
|
||||||
至少保留 1 条消息(兜底取第一条)。
|
至少保留 1 条消息(兜底取第一条)。
|
||||||
"""
|
"""
|
||||||
@@ -563,14 +561,10 @@ class SemanticPruner:
|
|||||||
|
|
||||||
if is_scene_unrelated:
|
if is_scene_unrelated:
|
||||||
if mode == "strict":
|
if mode == "strict":
|
||||||
# strict:场景无关 → 删除
|
# strict:场景无关直接删除,不做任何豁免
|
||||||
# 唯一例外:LLM 同时将该消息标记为 preserve_keywords,
|
# 场景相关性是唯一裁决标准,preserve_keywords 在此模式下不生效
|
||||||
# 说明 LLM 自相矛盾(既认为场景无关又认为值得保留),以 preserve_keywords 为准
|
to_delete_ids.add(id(m))
|
||||||
if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords):
|
self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
|
||||||
self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留")
|
|
||||||
else:
|
|
||||||
to_delete_ids.add(id(m))
|
|
||||||
self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除")
|
|
||||||
elif mode == "semantic":
|
elif mode == "semantic":
|
||||||
# semantic:场景无关但有内容价值 → 保留
|
# semantic:场景无关但有内容价值 → 保留
|
||||||
# 豁免第一层:命中 scene_preserve_tokens(关键词/结构化信息保护)
|
# 豁免第一层:命中 scene_preserve_tokens(关键词/结构化信息保护)
|
||||||
@@ -728,6 +722,22 @@ class SemanticPruner:
|
|||||||
total_original_msgs = 0
|
total_original_msgs = 0
|
||||||
total_deleted_msgs = 0
|
total_deleted_msgs = 0
|
||||||
|
|
||||||
|
# 统计对象:直接收集结构化数据,无需事后正则解析
|
||||||
|
stats = {
|
||||||
|
"scene": self.config.pruning_scene,
|
||||||
|
"dialog_total": len(dialogs),
|
||||||
|
"deletion_ratio": proportion,
|
||||||
|
"enabled": self.config.pruning_switch,
|
||||||
|
"pruning_mode": pruning_mode,
|
||||||
|
"related_count": 0,
|
||||||
|
"unrelated_count": 0,
|
||||||
|
"related_indices": [],
|
||||||
|
"unrelated_indices": [],
|
||||||
|
"total_deleted_messages": 0,
|
||||||
|
"remaining_dialogs": 0,
|
||||||
|
"dialogs": [],
|
||||||
|
}
|
||||||
|
|
||||||
# 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息)
|
# 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息)
|
||||||
semaphore = asyncio.Semaphore(self.max_concurrent)
|
semaphore = asyncio.Semaphore(self.max_concurrent)
|
||||||
|
|
||||||
@@ -749,6 +759,8 @@ class SemanticPruner:
|
|||||||
|
|
||||||
# 相关对话:根据阶段决定处理力度
|
# 相关对话:根据阶段决定处理力度
|
||||||
if extraction.is_related:
|
if extraction.is_related:
|
||||||
|
stats["related_count"] += 1
|
||||||
|
stats["related_indices"].append(d_idx)
|
||||||
kept = self._apply_related_dialog_pruning(
|
kept = self._apply_related_dialog_pruning(
|
||||||
msgs, extraction, f"对话 {d_idx+1}", pruning_mode
|
msgs, extraction, f"对话 {d_idx+1}", pruning_mode
|
||||||
)
|
)
|
||||||
@@ -756,8 +768,18 @@ class SemanticPruner:
|
|||||||
total_deleted_msgs += deleted_count
|
total_deleted_msgs += deleted_count
|
||||||
dd.context.msgs = kept
|
dd.context.msgs = kept
|
||||||
result.append(dd)
|
result.append(dd)
|
||||||
|
stats["dialogs"].append({
|
||||||
|
"index": d_idx + 1,
|
||||||
|
"is_related": True,
|
||||||
|
"total_messages": original_count,
|
||||||
|
"deleted": deleted_count,
|
||||||
|
"kept": len(kept),
|
||||||
|
})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
stats["unrelated_count"] += 1
|
||||||
|
stats["unrelated_indices"].append(d_idx)
|
||||||
|
|
||||||
# 从 LLM 抽取结果中获取所有需要保留的 token
|
# 从 LLM 抽取结果中获取所有需要保留的 token
|
||||||
preserve_tokens = self._build_preserve_tokens(extraction)
|
preserve_tokens = self._build_preserve_tokens(extraction)
|
||||||
|
|
||||||
@@ -850,31 +872,34 @@ class SemanticPruner:
|
|||||||
f"删除={deleted_count} 保留={len(kept_msgs)}"
|
f"删除={deleted_count} 保留={len(kept_msgs)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
stats["dialogs"].append({
|
||||||
|
"index": d_idx + 1,
|
||||||
|
"is_related": False,
|
||||||
|
"total_messages": original_count,
|
||||||
|
"protected": len(important_msgs),
|
||||||
|
"fillers": len(filler_msgs),
|
||||||
|
"deletable": len(deletable_msgs),
|
||||||
|
"deleted": deleted_count,
|
||||||
|
"kept": len(kept_msgs),
|
||||||
|
})
|
||||||
|
|
||||||
result.append(dd)
|
result.append(dd)
|
||||||
|
|
||||||
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
|
# 补全统计对象
|
||||||
|
stats["total_deleted_messages"] = total_deleted_msgs
|
||||||
|
stats["remaining_dialogs"] = len(result)
|
||||||
|
|
||||||
# 补充统计日志(供 _parse_logs_to_structured 正则解析)
|
self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
|
||||||
related_count = sum(1 for ex in extraction_results if ex.is_related)
|
self._log(f"[剪枝-数据集] 相关对话数={stats['related_count']} 不相关对话数={stats['unrelated_count']}")
|
||||||
unrelated_count = len(dialogs) - related_count
|
|
||||||
related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
|
|
||||||
unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
|
|
||||||
self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
|
|
||||||
self._log(
|
|
||||||
f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;"
|
|
||||||
f"不相关对话:第[{', '.join(unrelated_indices)}]段"
|
|
||||||
)
|
|
||||||
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条")
|
self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条")
|
||||||
|
|
||||||
# 保存日志
|
# 直接序列化统计对象,无需正则解析
|
||||||
try:
|
try:
|
||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
settings.ensure_memory_output_dir()
|
settings.ensure_memory_output_dir()
|
||||||
log_output_path = settings.get_memory_output_path("pruned_terminal.json")
|
log_output_path = settings.get_memory_output_path("pruned_terminal.json")
|
||||||
sanitized_logs = [self._sanitize_log_line(l) for l in self.run_logs]
|
|
||||||
payload = self._parse_logs_to_structured(sanitized_logs)
|
|
||||||
with open(log_output_path, "w", encoding="utf-8") as f:
|
with open(log_output_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(payload, f, ensure_ascii=False, indent=2)
|
json.dump(stats, f, ensure_ascii=False, indent=2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}")
|
self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}")
|
||||||
|
|
||||||
@@ -894,113 +919,4 @@ class SemanticPruner:
|
|||||||
pass
|
pass
|
||||||
print(msg)
|
print(msg)
|
||||||
|
|
||||||
def _sanitize_log_line(self, line: str) -> str:
|
|
||||||
"""移除行首的方括号标签前缀,例如 [剪枝-数据集] 或 [剪枝-对话]。"""
|
|
||||||
try:
|
|
||||||
return re.sub(r"^\[[^\]]+\]\s*", "", line)
|
|
||||||
except Exception:
|
|
||||||
return line
|
|
||||||
|
|
||||||
def _parse_logs_to_structured(self, logs: List[str]) -> dict:
|
|
||||||
"""将已去前缀的日志列表解析为结构化 JSON,便于数据对接。"""
|
|
||||||
summary = {
|
|
||||||
"scene": self.config.pruning_scene,
|
|
||||||
"dialog_total": None,
|
|
||||||
"deletion_ratio": None,
|
|
||||||
"enabled": None,
|
|
||||||
"related_count": None,
|
|
||||||
"unrelated_count": None,
|
|
||||||
"related_indices": [],
|
|
||||||
"unrelated_indices": [],
|
|
||||||
"total_deleted_messages": None,
|
|
||||||
"remaining_dialogs": None,
|
|
||||||
}
|
|
||||||
dialogs = []
|
|
||||||
|
|
||||||
# 解析函数
|
|
||||||
def parse_int(value: str) -> Optional[int]:
|
|
||||||
try:
|
|
||||||
return int(value)
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def parse_float(value: str) -> Optional[float]:
|
|
||||||
try:
|
|
||||||
return float(value)
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def parse_indices(s: str) -> List[int]:
|
|
||||||
s = s.strip()
|
|
||||||
if not s:
|
|
||||||
return []
|
|
||||||
parts = [p.strip() for p in s.split(",") if p.strip()]
|
|
||||||
out: List[int] = []
|
|
||||||
for p in parts:
|
|
||||||
try:
|
|
||||||
out.append(int(p))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return out
|
|
||||||
|
|
||||||
# 正则
|
|
||||||
re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
|
|
||||||
re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
|
|
||||||
re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段")
|
|
||||||
re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
|
|
||||||
re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
|
|
||||||
re_remaining = re.compile(r"剩余对话数=(\d+)")
|
|
||||||
|
|
||||||
for line in logs:
|
|
||||||
# 第一行:总览
|
|
||||||
m = re_header.search(line)
|
|
||||||
if m:
|
|
||||||
summary["dialog_total"] = parse_int(m.group(1))
|
|
||||||
# 顶层 scene 依配置,这里不覆盖,但也可校验 m.group(2)
|
|
||||||
summary["deletion_ratio"] = parse_float(m.group(3))
|
|
||||||
summary["enabled"] = True if m.group(4) == "True" else False
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 第二行:相关/不相关数量
|
|
||||||
m = re_counts.search(line)
|
|
||||||
if m:
|
|
||||||
summary["related_count"] = parse_int(m.group(1))
|
|
||||||
summary["unrelated_count"] = parse_int(m.group(2))
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 第三行:相关/不相关索引
|
|
||||||
m = re_indices.search(line)
|
|
||||||
if m:
|
|
||||||
summary["related_indices"] = parse_indices(m.group(1))
|
|
||||||
summary["unrelated_indices"] = parse_indices(m.group(2))
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 对话级统计
|
|
||||||
m = re_dialog.search(line)
|
|
||||||
if m:
|
|
||||||
dialogs.append({
|
|
||||||
"index": parse_int(m.group(1)),
|
|
||||||
"total_messages": parse_int(m.group(2)),
|
|
||||||
"deleted": parse_int(m.group(3)),
|
|
||||||
"kept": parse_int(m.group(4)),
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 全局删除总数
|
|
||||||
m = re_total_del.search(line)
|
|
||||||
if m:
|
|
||||||
summary["total_deleted_messages"] = parse_int(m.group(1))
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 剩余对话数
|
|
||||||
m = re_remaining.search(line)
|
|
||||||
if m:
|
|
||||||
summary["remaining_dialogs"] = parse_int(m.group(1))
|
|
||||||
continue
|
|
||||||
|
|
||||||
return {
|
|
||||||
"scene": summary["scene"],
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"summary": {k: v for k, v in summary.items() if k != "scene"},
|
|
||||||
"dialogs": dialogs,
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ async def run_pilot_extraction(
|
|||||||
"pruning_scene": memory_config.pruning_scene,
|
"pruning_scene": memory_config.pruning_scene,
|
||||||
"pruning_threshold": memory_config.pruning_threshold,
|
"pruning_threshold": memory_config.pruning_threshold,
|
||||||
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
|
"scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
|
||||||
"ontology_class_infos": memory_config.ontology_classes,
|
"ontology_class_infos": memory_config.ontology_class_infos,
|
||||||
}
|
}
|
||||||
config = PruningConfig(**pruning_config_dict)
|
config = PruningConfig(**pruning_config_dict)
|
||||||
|
|
||||||
@@ -232,9 +232,11 @@ async def run_pilot_extraction(
|
|||||||
"chunker_strategy": memory_config.chunker_strategy,
|
"chunker_strategy": memory_config.chunker_strategy,
|
||||||
}
|
}
|
||||||
|
|
||||||
# 添加剪枝统计信息
|
# 添加剪枝统计信息(始终包含 pruning 字段,确保前端不会因字段缺失报错)
|
||||||
if pruning_stats:
|
preprocessing_summary["pruning"] = pruning_stats if pruning_stats else {
|
||||||
preprocessing_summary["pruning"] = pruning_stats
|
"enabled": memory_config.pruning_enabled,
|
||||||
|
"deleted_count": 0,
|
||||||
|
}
|
||||||
|
|
||||||
await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)
|
await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user