[changes] Remove regex-based log parsing and make strict pruning mode exemption-free.

2026-03-19 12:26:16 +08:00
parent d12ad213e0
commit c2fc4ab4ff
3 changed files with 74 additions and 156 deletions
--- a/api/app/core/memory/agent/utils/get_dialogs.py
+++ b/api/app/core/memory/agent/utils/get_dialogs.py
@@ -84,7 +84,7 @@ async def get_chunked_dialogs(
                            pruning_scene=memory_config.pruning_scene or "education",
                            pruning_threshold=memory_config.pruning_threshold,
                            scene_id=str(memory_config.scene_id) if memory_config.scene_id else None,
-                            ontology_class_infos=memory_config.ontology_classes,
+                            ontology_class_infos=memory_config.ontology_class_infos,
                        )
                        logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}")
                        
--- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
+++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
@@ -523,12 +523,10 @@ class SemanticPruner:
                3. 两次豁免均未命中 → 删除

        第三层（strict，阈值 [0.6, 0.9]）：
-            保留标准：场景相关性优先，豁免权极度收窄。
+            保留标准：场景相关性优先，无任何豁免。
            - 填充消息 → 删除（最高优先级）
            - 场景相关消息 → 保留
-            - 场景无关消息 → 直接删除，仅保留一个例外：
-                LLM 同时将该消息放入 preserve_keywords（自相矛盾时以情感标记为准）→ 保留
-            注意：strict 模式下情感词兜底不再生效，场景相关性是最终裁决标准。
+            - 场景无关消息 → 直接删除，preserve_keywords 和情感词在此模式下均不生效

        至少保留 1 条消息（兜底取第一条）。
        """
@@ -563,14 +561,10 @@ class SemanticPruner:

            if is_scene_unrelated:
                if mode == "strict":
-                    # strict：场景无关 → 删除
-                    # 唯一例外：LLM 同时将该消息标记为 preserve_keywords，
-                    # 说明 LLM 自相矛盾（既认为场景无关又认为值得保留），以 preserve_keywords 为准
-                    if extraction.preserve_keywords and self._msg_matches_tokens(m, extraction.preserve_keywords):
-                        self._log(f"  [保护-情感] '{msg_text[:40]}' → preserve_keywords 兜底保护，保留")
-                    else:
-                        to_delete_ids.add(id(m))
-                        self._log(f"  [场景无关-严格] '{msg_text[:40]}' → 删除")
+                    # strict：场景无关直接删除，不做任何豁免
+                    # 场景相关性是唯一裁决标准，preserve_keywords 在此模式下不生效
+                    to_delete_ids.add(id(m))
+                    self._log(f"  [场景无关-严格] '{msg_text[:40]}' → 删除")
                elif mode == "semantic":
                    # semantic：场景无关但有内容价值 → 保留
                    # 豁免第一层：命中 scene_preserve_tokens（关键词/结构化信息保护）
@@ -720,14 +714,30 @@ class SemanticPruner:
        self._log(
            f"[剪枝-数据集] 对话总数={len(dialogs)} 场景={self.config.pruning_scene} 删除比例={proportion} 开关={self.config.pruning_switch} 模式=消息级独立判断"
        )
-        
+
        pruning_mode = self._get_pruning_mode()
        self._log(f"[剪枝-数据集] 阈值={proportion} → 剪枝阶段={pruning_mode}")
-        
+
        result: List[DialogData] = []
        total_original_msgs = 0
        total_deleted_msgs = 0

+        # 统计对象：直接收集结构化数据，无需事后正则解析
+        stats = {
+            "scene": self.config.pruning_scene,
+            "dialog_total": len(dialogs),
+            "deletion_ratio": proportion,
+            "enabled": self.config.pruning_switch,
+            "pruning_mode": pruning_mode,
+            "related_count": 0,
+            "unrelated_count": 0,
+            "related_indices": [],
+            "unrelated_indices": [],
+            "total_deleted_messages": 0,
+            "remaining_dialogs": 0,
+            "dialogs": [],
+        }
+
        # 并发执行所有对话的 LLM 抽取（获取 preserve_keywords 等保护信息）
        semaphore = asyncio.Semaphore(self.max_concurrent)

@@ -749,6 +759,8 @@ class SemanticPruner:

            # 相关对话：根据阶段决定处理力度
            if extraction.is_related:
+                stats["related_count"] += 1
+                stats["related_indices"].append(d_idx)
                kept = self._apply_related_dialog_pruning(
                    msgs, extraction, f"对话 {d_idx+1}", pruning_mode
                )
@@ -756,8 +768,18 @@ class SemanticPruner:
                total_deleted_msgs += deleted_count
                dd.context.msgs = kept
                result.append(dd)
+                stats["dialogs"].append({
+                    "index": d_idx + 1,
+                    "is_related": True,
+                    "total_messages": original_count,
+                    "deleted": deleted_count,
+                    "kept": len(kept),
+                })
                continue

+            stats["unrelated_count"] += 1
+            stats["unrelated_indices"].append(d_idx)
+
            # 从 LLM 抽取结果中获取所有需要保留的 token
            preserve_tokens = self._build_preserve_tokens(extraction)

@@ -792,16 +814,16 @@ class SemanticPruner:

            # important_msgs 仅用于日志统计
            important_msgs = llm_protected_msgs
-            
+
            # 计算删除配额
            delete_target = int(original_count * proportion)
            if proportion > 0 and original_count > 0 and delete_target == 0:
                delete_target = 1
-            
+
            # 确保至少保留1条消息
            max_deletable = max(0, original_count - 1)
            delete_target = min(delete_target, max_deletable)
-            
+
            # 删除策略：优先删填充消息，再按出现顺序删其余可删消息
            to_delete_indices = set()
            deleted_details = []
@@ -819,62 +841,65 @@ class SemanticPruner:
                    break
                to_delete_indices.add(idx)
                deleted_details.append(f"[{idx}] 可删: '{msg.msg[:50]}'")
-            
+
            # 执行删除
            kept_msgs = []
            for idx, m in enumerate(msgs):
                if idx not in to_delete_indices:
                    kept_msgs.append(m)
-            
+
            # 确保至少保留1条
            if not kept_msgs and msgs:
                kept_msgs = [msgs[0]]
-            
+
            dd.context.msgs = kept_msgs
            deleted_count = original_count - len(kept_msgs)
            total_deleted_msgs += deleted_count
-            
+
            # 输出删除详情
            if deleted_details:
                self._log(f"[剪枝-删除详情] 对话 {d_idx+1} 删除了以下消息:")
                for detail in deleted_details:
                    self._log(f"  {detail}")
-            
+
            # ========== 问答对统计（已注释） ==========
            # qa_info = f"，问答对={len(qa_pairs)}" if qa_pairs else ""
            # ========================================
-            
+
            self._log(
                f"[剪枝-对话] 对话 {d_idx+1} 总消息={original_count} "
                f"(保护={len(important_msgs)} 填充={len(filler_msgs)} 可删={len(deletable_msgs)}) "
                f"删除={deleted_count} 保留={len(kept_msgs)}"
            )
-            
-            result.append(dd)
-        
-        self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")

-        # 补充统计日志（供 _parse_logs_to_structured 正则解析）
-        related_count = sum(1 for ex in extraction_results if ex.is_related)
-        unrelated_count = len(dialogs) - related_count
-        related_indices = [str(i) for i, ex in enumerate(extraction_results) if ex.is_related]
-        unrelated_indices = [str(i) for i, ex in enumerate(extraction_results) if not ex.is_related]
-        self._log(f"[剪枝-数据集] 相关对话数={related_count} 不相关对话数={unrelated_count}")
-        self._log(
-            f"[剪枝-数据集] 相关对话：第[{', '.join(related_indices)}]段；"
-            f"不相关对话：第[{', '.join(unrelated_indices)}]段"
-        )
+            stats["dialogs"].append({
+                "index": d_idx + 1,
+                "is_related": False,
+                "total_messages": original_count,
+                "protected": len(important_msgs),
+                "fillers": len(filler_msgs),
+                "deletable": len(deletable_msgs),
+                "deleted": deleted_count,
+                "kept": len(kept_msgs),
+            })
+
+            result.append(dd)
+
+        # 补全统计对象
+        stats["total_deleted_messages"] = total_deleted_msgs
+        stats["remaining_dialogs"] = len(result)
+
+        self._log(f"[剪枝-数据集] 剩余对话数={len(result)}")
+        self._log(f"[剪枝-数据集] 相关对话数={stats['related_count']} 不相关对话数={stats['unrelated_count']}")
        self._log(f"[剪枝-数据集] 总删除 {total_deleted_msgs} 条")

-        # 保存日志
+        # 直接序列化统计对象，无需正则解析
        try:
            from app.core.config import settings
            settings.ensure_memory_output_dir()
            log_output_path = settings.get_memory_output_path("pruned_terminal.json")
-            sanitized_logs = [self._sanitize_log_line(l) for l in self.run_logs]
-            payload = self._parse_logs_to_structured(sanitized_logs)
            with open(log_output_path, "w", encoding="utf-8") as f:
-                json.dump(payload, f, ensure_ascii=False, indent=2)
+                json.dump(stats, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self._log(f"[剪枝-数据集] 保存终端输出日志失败：{e}")

@@ -882,7 +907,7 @@ class SemanticPruner:
        if not result:
            print("警告: 语义剪枝后数据集为空，已回退为未剪枝数据以避免流程中断")
            return dialogs
-        
+
        return result

    def _log(self, msg: str) -> None:
@@ -894,113 +919,4 @@ class SemanticPruner:
            pass
        print(msg)

-    def _sanitize_log_line(self, line: str) -> str:
-        """移除行首的方括号标签前缀，例如 [剪枝-数据集] 或 [剪枝-对话]。"""
-        try:
-            return re.sub(r"^\[[^\]]+\]\s*", "", line)
-        except Exception:
-            return line

-    def _parse_logs_to_structured(self, logs: List[str]) -> dict:
-        """将已去前缀的日志列表解析为结构化 JSON，便于数据对接。"""
-        summary = {
-            "scene": self.config.pruning_scene,
-            "dialog_total": None,
-            "deletion_ratio": None,
-            "enabled": None,
-            "related_count": None,
-            "unrelated_count": None,
-            "related_indices": [],
-            "unrelated_indices": [],
-            "total_deleted_messages": None,
-            "remaining_dialogs": None,
-        }
-        dialogs = []
-
-        # 解析函数
-        def parse_int(value: str) -> Optional[int]:
-            try:
-                return int(value)
-            except Exception:
-                return None
-
-        def parse_float(value: str) -> Optional[float]:
-            try:
-                return float(value)
-            except Exception:
-                return None
-
-        def parse_indices(s: str) -> List[int]:
-            s = s.strip()
-            if not s:
-                return []
-            parts = [p.strip() for p in s.split(",") if p.strip()]
-            out: List[int] = []
-            for p in parts:
-                try:
-                    out.append(int(p))
-                except Exception:
-                    pass
-            return out
-
-        # 正则
-        re_header = re.compile(r"对话总数=(\d+)\s+场景=([^\s]+)\s+删除比例=([0-9.]+)\s+开关=(True|False)")
-        re_counts = re.compile(r"相关对话数=(\d+)\s+不相关对话数=(\d+)")
-        re_indices = re.compile(r"相关对话：第\[(.*?)\]段；不相关对话：第\[(.*?)\]段")
-        re_dialog = re.compile(r"对话\s+(\d+)\s+总消息=(\d+).*?删除=(\d+)\s+保留=(\d+)\b")
-        re_total_del = re.compile(r"总删除\s+(\d+)\s+条")
-        re_remaining = re.compile(r"剩余对话数=(\d+)")
-
-        for line in logs:
-            # 第一行：总览
-            m = re_header.search(line)
-            if m:
-                summary["dialog_total"] = parse_int(m.group(1))
-                # 顶层 scene 依配置，这里不覆盖，但也可校验 m.group(2)
-                summary["deletion_ratio"] = parse_float(m.group(3))
-                summary["enabled"] = True if m.group(4) == "True" else False
-                continue
-
-            # 第二行：相关/不相关数量
-            m = re_counts.search(line)
-            if m:
-                summary["related_count"] = parse_int(m.group(1))
-                summary["unrelated_count"] = parse_int(m.group(2))
-                continue
-
-            # 第三行：相关/不相关索引
-            m = re_indices.search(line)
-            if m:
-                summary["related_indices"] = parse_indices(m.group(1))
-                summary["unrelated_indices"] = parse_indices(m.group(2))
-                continue
-
-            # 对话级统计
-            m = re_dialog.search(line)
-            if m:
-                dialogs.append({
-                    "index": parse_int(m.group(1)),
-                    "total_messages": parse_int(m.group(2)),
-                    "deleted": parse_int(m.group(3)),
-                    "kept": parse_int(m.group(4)),
-                })
-                continue
-
-            # 全局删除总数
-            m = re_total_del.search(line)
-            if m:
-                summary["total_deleted_messages"] = parse_int(m.group(1))
-                continue
-
-            # 剩余对话数
-            m = re_remaining.search(line)
-            if m:
-                summary["remaining_dialogs"] = parse_int(m.group(1))
-                continue
-
-        return {
-            "scene": summary["scene"],
-            "timestamp": datetime.now().isoformat(),
-            "summary": {k: v for k, v in summary.items() if k != "scene"},
-            "dialogs": dialogs,
-        }
--- a/api/app/services/pilot_run_service.py
+++ b/api/app/services/pilot_run_service.py
@@ -121,7 +121,7 @@ async def run_pilot_extraction(
                    "pruning_scene": memory_config.pruning_scene,
                    "pruning_threshold": memory_config.pruning_threshold,
                    "scene_id": str(memory_config.scene_id) if memory_config.scene_id else None,
-                    "ontology_class_infos": memory_config.ontology_classes,
+                    "ontology_class_infos": memory_config.ontology_class_infos,
                }
                config = PruningConfig(**pruning_config_dict)
                
@@ -232,9 +232,11 @@ async def run_pilot_extraction(
                "chunker_strategy": memory_config.chunker_strategy,
            }
            
-            # 添加剪枝统计信息
-            if pruning_stats:
-                preprocessing_summary["pruning"] = pruning_stats
+            # 添加剪枝统计信息（始终包含 pruning 字段，确保前端不会因字段缺失报错）
+            preprocessing_summary["pruning"] = pruning_stats if pruning_stats else {
+                "enabled": memory_config.pruning_enabled,
+                "deleted_count": 0,
+            }
            
            await progress_callback("text_preprocessing_complete", "预处理文本完成（剪枝 + 分块）", preprocessing_summary)