[changes] Correct log output, log level, and pruning conditions

2026-03-02 12:09:10 +08:00
parent 7c02fe8148
commit 8e15a340f6
3 changed files with 39 additions and 17 deletions
--- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
+++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py
@@ -82,6 +82,10 @@ class SemanticPruner:
        self.language = language  # 保存语言配置
        self.max_concurrent = max_concurrent  # 新增：最大并发数
        
+        # 详细日志配置：限制逐条消息日志的数量
+        self._detailed_prune_logging = True  # 是否启用详细日志
+        self._max_debug_msgs_per_dialog = 20  # 每个对话最多记录前N条消息的详细日志
+        
        # 加载场景特定配置
        self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(
            self.config.pruning_scene, 
@@ -595,6 +599,11 @@ class SemanticPruner:
            unimportant_msgs = []  # 不重要消息（可删除）
            filler_msgs = []  # 填充消息（优先删除）
            
+            # 判断是否需要详细日志（仅对前N条消息记录）
+            should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
+            if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog:
+                self._log(f"  对话[{d_idx}]消息数={original_count}，仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志")
+            
            for idx, m in enumerate(msgs):
                msg_text = m.msg.strip()
                
@@ -607,15 +616,18 @@ class SemanticPruner:
                # 填充消息（寒暄、表情等）
                if self._is_filler_message(m):
                    filler_msgs.append((idx, m))
-                    self._log(f"  [{idx}] '{msg_text[:30]}...' → 填充")
+                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
+                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 填充")
                # 重要信息（学号、成绩、时间、金额等）
                elif self._is_important_message(m):
                    important_msgs.append((idx, m))
-                    self._log(f"  [{idx}] '{msg_text[:30]}...' → 重要（场景规则）")
+                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
+                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 重要（场景规则）")
                # 其他消息
                else:
                    unimportant_msgs.append((idx, m))
-                    self._log(f"  [{idx}] '{msg_text[:30]}...' → 不重要")
+                    if should_log_details or idx < self._max_debug_msgs_per_dialog:
+                        self._log(f"  [{idx}] '{msg_text[:30]}...' → 不重要")
            
            # 计算删除配额
            delete_target = int(original_count * proportion)
--- a/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py
+++ b/api/app/core/memory/storage_services/extraction_engine/extraction_orchestrator.py
@@ -1932,17 +1932,17 @@ def preprocess_data(
    Returns:
        经过清洗转换后的 DialogData 列表
    """
-    print("\n=== 数据预处理 ===")
+    logger.debug("=== 数据预处理 ===")
    from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_preprocessor import (
        DataPreprocessor,
    )
    preprocessor = DataPreprocessor()
    try:
        cleaned_data = preprocessor.preprocess(input_path=input_path, output_path=output_path, skip_cleaning=skip_cleaning, indices=indices)
-        print(f"数据预处理完成！共处理了 {len(cleaned_data)} 条对话数据")
+        logger.debug(f"数据预处理完成！共处理了 {len(cleaned_data)} 条对话数据")
        return cleaned_data
    except Exception as e:
-        print(f"数据预处理过程中出现错误: {e}")
+        logger.error(f"数据预处理过程中出现错误: {e}")
        raise


@@ -1961,7 +1961,7 @@ async def get_chunked_dialogs_from_preprocessed(
    Returns:
        带 chunks 的 DialogData 列表
    """
-    print(f"\n=== 批量对话分块处理 (使用 {chunker_strategy}) ===")
+    logger.debug(f"=== 批量对话分块处理 (使用 {chunker_strategy}) ===")
    if not data:
        raise ValueError("预处理数据为空，无法进行分块")
        
@@ -2006,7 +2006,7 @@ async def get_chunked_dialogs_with_preprocessing(
    Returns:
        带 chunks 的 DialogData 列表
    """
-    print("\n=== 完整数据处理流程（包含预处理）===")
+    logger.debug("=== 完整数据处理流程（包含预处理）===")

    if input_data_path is None:
        input_data_path = os.path.join(
@@ -2038,11 +2038,11 @@ async def get_chunked_dialogs_with_preprocessing(
        if pruning_config:
            # 使用传入的配置
            config = PruningConfig(**pruning_config)
-            print(f"[剪枝] 使用传入配置: switch={config.pruning_switch}, scene={config.pruning_scene}, threshold={config.pruning_threshold}")
+            logger.debug(f"[剪枝] 使用传入配置: switch={config.pruning_switch}, scene={config.pruning_scene}, threshold={config.pruning_threshold}")
        else:
            # 使用默认配置（关闭剪枝）
            config = None
-            print("[剪枝] 未提供配置，使用默认配置（剪枝关闭）")
+            logger.debug("[剪枝] 未提供配置，使用默认配置（剪枝关闭）")
        
        pruner = SemanticPruner(config=config, llm_client=llm_client)
        
@@ -2057,12 +2057,12 @@ async def get_chunked_dialogs_with_preprocessing(
        if len(preprocessed_data) == 1 and single_dialog_original_msgs is not None:
            remaining_msgs = len(preprocessed_data[0].context.msgs) if preprocessed_data[0].context else 0
            deleted_msgs = max(0, single_dialog_original_msgs - remaining_msgs)
-            print(
+            logger.debug(
                f"语义剪枝完成！剩余 1 条对话！原始消息数：{single_dialog_original_msgs}，"
                f"保留消息数：{remaining_msgs}，删除 {deleted_msgs} 条。"
            )
        else:
-            print(f"语义剪枝完成！剩余 {len(preprocessed_data)} 条对话")
+            logger.debug(f"语义剪枝完成！剩余 {len(preprocessed_data)} 条对话")
            
        # 保存剪枝后的数据
        try:
@@ -2073,9 +2073,9 @@ async def get_chunked_dialogs_with_preprocessing(
            dp = DataPreprocessor(output_file_path=pruned_output_path)
            dp.save_data(preprocessed_data, output_path=pruned_output_path)
        except Exception as se:
-            print(f"保存剪枝结果失败：{se}")
+            logger.error(f"保存剪枝结果失败：{se}")
    except Exception as e:
-        print(f"语义剪枝过程中出现错误，跳过剪枝: {e}")
+        logger.error(f"语义剪枝过程中出现错误，跳过剪枝: {e}")
        
    # 步骤3: 对话分块
    return await get_chunked_dialogs_from_preprocessed(
--- a/api/app/services/pilot_run_service.py
+++ b/api/app/services/pilot_run_service.py
@@ -140,12 +140,22 @@ async def run_pilot_extraction(
                    remaining_msg_count = len(remaining_messages)
                    deleted_msg_count = original_msg_count - remaining_msg_count
                    
-                    # 找出被删除的消息（通过内容对比）
-                    remaining_contents = {msg["content"] for msg in remaining_messages}
+                    # 找出被删除的消息（基于索引精确匹配）
+                    # 为剩余消息创建带索引的列表，用于精确追踪
+                    remaining_with_index = []
+                    remaining_idx = 0
+                    for orig_idx, orig_msg in enumerate(original_messages):
+                        if remaining_idx < len(remaining_messages) and \
+                           orig_msg["role"] == remaining_messages[remaining_idx]["role"] and \
+                           orig_msg["content"] == remaining_messages[remaining_idx]["content"]:
+                            remaining_with_index.append(orig_idx)
+                            remaining_idx += 1
+                    
+                    # 找出未在保留列表中的消息索引
                    deleted_messages = [
                        {"index": idx, "role": msg["role"], "content": msg["content"]}
                        for idx, msg in enumerate(original_messages)
-                        if msg["content"] not in remaining_contents
+                        if idx not in remaining_with_index
                    ]
                    
                    # 保存剪枝统计信息（用于最终汇总，只保留deleted_count）