From c2fc4ab4ffad3d051617886c5565fea705f586d8 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Thu, 19 Mar 2026 12:26:16 +0800 Subject: [PATCH] [changes] Remove regular logs and apply strict rules. --- .../core/memory/agent/utils/get_dialogs.py | 2 +- .../data_preprocessing/data_pruning.py | 218 ++++++------------ api/app/services/pilot_run_service.py | 10 +- 3 files changed, 74 insertions(+), 156 deletions(-) diff --git a/api/app/core/memory/agent/utils/get_dialogs.py b/api/app/core/memory/agent/utils/get_dialogs.py index a301a5ef..3b06defe 100644 --- a/api/app/core/memory/agent/utils/get_dialogs.py +++ b/api/app/core/memory/agent/utils/get_dialogs.py @@ -84,7 +84,7 @@ async def get_chunked_dialogs( pruning_scene=memory_config.pruning_scene or "education", pruning_threshold=memory_config.pruning_threshold, scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, - ontology_class_infos=memory_config.ontology_classes, + ontology_class_infos=memory_config.ontology_class_infos, ) logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}") diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 28e2f96b..c9bdfcf4 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -523,12 +523,10 @@ class SemanticPruner: 3. 两次豁免均未命中 → 删除 第三层(strict,阈值 [0.6, 0.9]): - 保留标准:场景相关性优先,豁免权极度收窄。 + 保留标准:场景相关性优先,无任何豁免。 - 填充消息 → 删除(最高优先级) - 场景相关消息 → 保留 - - 场景无关消息 → 直接删除,仅保留一个例外: - LLM 同时将该消息放入 preserve_keywords(自相矛盾时以情感标记为准)→ 保留 - 注意:strict 模式下情感词兜底不再生效,场景相关性是最终裁决标准。 + - 场景无关消息 → 直接删除,preserve_keywords 和情感词在此模式下均不生效 至少保留 1 条消息(兜底取第一条)。 """ @@ -563,14 +561,10 @@ class SemanticPruner: if is_scene_unrelated: if mode == "strict": - # strict:场景无关 → 删除 - # 唯一例外:LLM 同时将该消息标记为 preserve_keywords, - # 说明 LLM 自相矛盾(既认为场景无关又认为值得保留),以 preserve_keywords 为准 - if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords): - self._log(f" [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护,保留") - else: - to_delete_ids.add(id(m)) - self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除") + # strict:场景无关直接删除,不做任何豁免 + # 场景相关性是唯一裁决标准,preserve_keywords 在此模式下不生效 + to_delete_ids.add(id(m)) + self._log(f" [场景无关-严格] '{msg_text[:40]}' → 删除") elif mode == "semantic": # semantic:场景无关但有内容价值 → 保留 # 豁免第一层:命中 scene_preserve_tokens(关键词/结构化信息保护) @@ -720,14 +714,30 @@ class SemanticPruner: self._log( f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断" ) - + pruning_mode = self._get_pruning_mode() self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}") - + result: List[DialogData] = [] total_original_msgs = 0 total_deleted_msgs = 0 + # 统计对象:直接收集结构化数据,无需事后正则解析 + stats = { + "scene": self.config.pruning_scene, + "dialog_total": len(dialogs), + "deletion_ratio": proportion, + "enabled": self.config.pruning_switch, + "pruning_mode": pruning_mode, + "related_count": 0, + "unrelated_count": 0, + "related_indices": [], + "unrelated_indices": [], + "total_deleted_messages": 0, + "remaining_dialogs": 0, + "dialogs": [], + } + # 并发执行所有对话的 LLM 抽取(获取 preserve_keywords 等保护信息) semaphore = asyncio.Semaphore(self.max_concurrent) @@ -749,6 +759,8 @@ class SemanticPruner: # 相关对话:根据阶段决定处理力度 if extraction.is_related: + stats["related_count"] += 1 + stats["related_indices"].append(d_idx) kept = self._apply_related_dialog_pruning( msgs, extraction, f"对话 {d_idx+1}", pruning_mode ) @@ -756,8 +768,18 @@ class SemanticPruner: total_deleted_msgs += deleted_count dd.context.msgs = kept result.append(dd) + stats["dialogs"].append({ + "index": d_idx + 1, + "is_related": True, + "total_messages": original_count, + "deleted": deleted_count, + "kept": len(kept), + }) continue + stats["unrelated_count"] += 1 + stats["unrelated_indices"].append(d_idx) + # 从 LLM 抽取结果中获取所有需要保留的 token preserve_tokens = self._build_preserve_tokens(extraction) @@ -792,16 +814,16 @@ class SemanticPruner: # important_msgs 仅用于日志统计 important_msgs = llm_protected_msgs - + # 计算删除配额 delete_target = int(original_count * proportion) if proportion > 0 and original_count > 0 and delete_target == 0: delete_target = 1 - + # 确保至少保留1条消息 max_deletable = max(0, original_count - 1) delete_target = min(delete_target, max_deletable) - + # 删除策略:优先删填充消息,再按出现顺序删其余可删消息 to_delete_indices = set() deleted_details = [] @@ -819,62 +841,65 @@ class SemanticPruner: break to_delete_indices.add(idx) deleted_details.append(f"[{idx}] 可删: '{msg.msg[:50]}'") - + # 执行删除 kept_msgs = [] for idx, m in enumerate(msgs): if idx not in to_delete_indices: kept_msgs.append(m) - + # 确保至少保留1条 if not kept_msgs and msgs: kept_msgs = [msgs[0]] - + dd.context.msgs = kept_msgs deleted_count = original_count - len(kept_msgs) total_deleted_msgs += deleted_count - + # 输出删除详情 if deleted_details: self._log(f"[剪枝-删除详情] 对话 {d_idx+1} 删除了以下消息:") for detail in deleted_details: self._log(f" {detail}") - + # ========== 问答对统计(已注释) ========== # qa_info = f",问答对={len(qa_pairs)}" if qa_pairs else "" # ======================================== - + self._log( f"[剪枝-对话] 对话 {d_idx+1} 总消息={original_count} " f"(保护={len(important_msgs)} 填充={len(filler_msgs)} 可删={len(deletable_msgs)}) " f"删除={deleted_count} 保留={len(kept_msgs)}" ) - - result.append(dd) - - self._log(f"[剪枝-数据集] 剩余对话数={len(result)}") - # 补充统计日志(供 _parse_logs_to_structured 正则解析) - related_count = sum(1 for ex in extraction_results if ex.is_related) - unrelated_count = len(dialogs) - related_count - related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related] - unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related] - self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}") - self._log( - f"[剪枝-数据集] 相关对话:第[{', '.join(related_indices)}]段;" - f"不相关对话:第[{', '.join(unrelated_indices)}]段" - ) + stats["dialogs"].append({ + "index": d_idx + 1, + "is_related": False, + "total_messages": original_count, + "protected": len(important_msgs), + "fillers": len(filler_msgs), + "deletable": len(deletable_msgs), + "deleted": deleted_count, + "kept": len(kept_msgs), + }) + + result.append(dd) + + # 补全统计对象 + stats["total_deleted_messages"] = total_deleted_msgs + stats["remaining_dialogs"] = len(result) + + self._log(f"[剪枝-数据集] 剩余对话数={len(result)}") + self._log(f"[剪枝-数据集] 相关对话数={stats['related_count']} 不相关对话数={stats['unrelated_count']}") self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条") - # 保存日志 + # 直接序列化统计对象,无需正则解析 try: from app.core.config import settings settings.ensure_memory_output_dir() log_output_path = settings.get_memory_output_path("pruned_terminal.json") - sanitized_logs = [self._sanitize_log_line(l) for l in self.run_logs] - payload = self._parse_logs_to_structured(sanitized_logs) with open(log_output_path, "w", encoding="utf-8") as f: - json.dump(payload, f, ensure_ascii=False, indent=2) + json.dump(stats, f, ensure_ascii=False, indent=2) except Exception as e: self._log(f"[剪枝-数据集] 保存终端输出日志失败:{e}") @@ -882,7 +907,7 @@ class SemanticPruner: if not result: print("警告: 语义剪枝后数据集为空,已回退为未剪枝数据以避免流程中断") return dialogs - + return result def _log(self, msg: str) -> None: @@ -894,113 +919,4 @@ class SemanticPruner: pass print(msg) - def _sanitize_log_line(self, line: str) -> str: - """移除行首的方括号标签前缀,例如 [剪枝-数据集] 或 [剪枝-对话]。""" - try: - return re.sub(r"^\[[^\]]+\]\s*", "", line) - except Exception: - return line - def _parse_logs_to_structured(self, logs: List[str]) -> dict: - """将已去前缀的日志列表解析为结构化 JSON,便于数据对接。""" - summary = { - "scene": self.config.pruning_scene, - "dialog_total": None, - "deletion_ratio": None, - "enabled": None, - "related_count": None, - "unrelated_count": None, - "related_indices": [], - "unrelated_indices": [], - "total_deleted_messages": None, - "remaining_dialogs": None, - } - dialogs = [] - - # 解析函数 - def parse_int(value: str) -> Optional[int]: - try: - return int(value) - except Exception: - return None - - def parse_float(value: str) -> Optional[float]: - try: - return float(value) - except Exception: - return None - - def parse_indices(s: str) -> List[int]: - s = s.strip() - if not s: - return [] - parts = [p.strip() for p in s.split(",") if p.strip()] - out: List[int] = [] - for p in parts: - try: - out.append(int(p)) - except Exception: - pass - return out - - # 正则 - re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)") - re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)") - re_indices = re.compile(r"相关对话:第\[(.*?)\]段;不相关对话:第\[(.*?)\]段") - re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b") - re_total_del = re.compile(r"总删除\s+(\d+)\s+条") - re_remaining = re.compile(r"剩余对话数=(\d+)") - - for line in logs: - # 第一行:总览 - m = re_header.search(line) - if m: - summary["dialog_total"] = parse_int(m.group(1)) - # 顶层 scene 依配置,这里不覆盖,但也可校验 m.group(2) - summary["deletion_ratio"] = parse_float(m.group(3)) - summary["enabled"] = True if m.group(4) == "True" else False - continue - - # 第二行:相关/不相关数量 - m = re_counts.search(line) - if m: - summary["related_count"] = parse_int(m.group(1)) - summary["unrelated_count"] = parse_int(m.group(2)) - continue - - # 第三行:相关/不相关索引 - m = re_indices.search(line) - if m: - summary["related_indices"] = parse_indices(m.group(1)) - summary["unrelated_indices"] = parse_indices(m.group(2)) - continue - - # 对话级统计 - m = re_dialog.search(line) - if m: - dialogs.append({ - "index": parse_int(m.group(1)), - "total_messages": parse_int(m.group(2)), - "deleted": parse_int(m.group(3)), - "kept": parse_int(m.group(4)), - }) - continue - - # 全局删除总数 - m = re_total_del.search(line) - if m: - summary["total_deleted_messages"] = parse_int(m.group(1)) - continue - - # 剩余对话数 - m = re_remaining.search(line) - if m: - summary["remaining_dialogs"] = parse_int(m.group(1)) - continue - - return { - "scene": summary["scene"], - "timestamp": datetime.now().isoformat(), - "summary": {k: v for k, v in summary.items() if k != "scene"}, - "dialogs": dialogs, - } diff --git a/api/app/services/pilot_run_service.py b/api/app/services/pilot_run_service.py index b473140d..fc749157 100644 --- a/api/app/services/pilot_run_service.py +++ b/api/app/services/pilot_run_service.py @@ -121,7 +121,7 @@ async def run_pilot_extraction( "pruning_scene": memory_config.pruning_scene, "pruning_threshold": memory_config.pruning_threshold, "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None, - "ontology_class_infos": memory_config.ontology_classes, + "ontology_class_infos": memory_config.ontology_class_infos, } config = PruningConfig(**pruning_config_dict) @@ -232,9 +232,11 @@ async def run_pilot_extraction( "chunker_strategy": memory_config.chunker_strategy, } - # 添加剪枝统计信息 - if pruning_stats: - preprocessing_summary["pruning"] = pruning_stats + # 添加剪枝统计信息(始终包含 pruning 字段,确保前端不会因字段缺失报错) + preprocessing_summary["pruning"] = pruning_stats if pruning_stats else { + "enabled": memory_config.pruning_enabled, + "deleted_count": 0, + } await progress_callback("text_preprocessing_complete", "预处理文本完成(剪枝 + 分块)", preprocessing_summary)