[changes]Correct log output, log level, and pruning conditions
This commit is contained in:
@@ -82,6 +82,10 @@ class SemanticPruner:
|
||||
self.language = language # 保存语言配置
|
||||
self.max_concurrent = max_concurrent # 新增:最大并发数
|
||||
|
||||
# 详细日志配置:限制逐条消息日志的数量
|
||||
self._detailed_prune_logging = True # 是否启用详细日志
|
||||
self._max_debug_msgs_per_dialog = 20 # 每个对话最多记录前N条消息的详细日志
|
||||
|
||||
# 加载场景特定配置
|
||||
self.scene_config: ScenePatterns = SceneConfigRegistry.get_config(
|
||||
self.config.pruning_scene,
|
||||
@@ -595,6 +599,11 @@ class SemanticPruner:
|
||||
unimportant_msgs = [] # 不重要消息(可删除)
|
||||
filler_msgs = [] # 填充消息(优先删除)
|
||||
|
||||
# 判断是否需要详细日志(仅对前N条消息记录)
|
||||
should_log_details = self._detailed_prune_logging and original_count <= self._max_debug_msgs_per_dialog
|
||||
if self._detailed_prune_logging and original_count > self._max_debug_msgs_per_dialog:
|
||||
self._log(f" 对话[{d_idx}]消息数={original_count},仅采样前{self._max_debug_msgs_per_dialog}条进行详细日志")
|
||||
|
||||
for idx, m in enumerate(msgs):
|
||||
msg_text = m.msg.strip()
|
||||
|
||||
@@ -607,15 +616,18 @@ class SemanticPruner:
|
||||
# 填充消息(寒暄、表情等)
|
||||
if self._is_filler_message(m):
|
||||
filler_msgs.append((idx, m))
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 填充")
|
||||
if should_log_details or idx < self._max_debug_msgs_per_dialog:
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 填充")
|
||||
# 重要信息(学号、成绩、时间、金额等)
|
||||
elif self._is_important_message(m):
|
||||
important_msgs.append((idx, m))
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)")
|
||||
if should_log_details or idx < self._max_debug_msgs_per_dialog:
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 重要(场景规则)")
|
||||
# 其他消息
|
||||
else:
|
||||
unimportant_msgs.append((idx, m))
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 不重要")
|
||||
if should_log_details or idx < self._max_debug_msgs_per_dialog:
|
||||
self._log(f" [{idx}] '{msg_text[:30]}...' → 不重要")
|
||||
|
||||
# 计算删除配额
|
||||
delete_target = int(original_count * proportion)
|
||||
|
||||
@@ -1932,17 +1932,17 @@ def preprocess_data(
|
||||
Returns:
|
||||
经过清洗转换后的 DialogData 列表
|
||||
"""
|
||||
print("\n=== 数据预处理 ===")
|
||||
logger.debug("=== 数据预处理 ===")
|
||||
from app.core.memory.storage_services.extraction_engine.data_preprocessing.data_preprocessor import (
|
||||
DataPreprocessor,
|
||||
)
|
||||
preprocessor = DataPreprocessor()
|
||||
try:
|
||||
cleaned_data = preprocessor.preprocess(input_path=input_path, output_path=output_path, skip_cleaning=skip_cleaning, indices=indices)
|
||||
print(f"数据预处理完成!共处理了 {len(cleaned_data)} 条对话数据")
|
||||
logger.debug(f"数据预处理完成!共处理了 {len(cleaned_data)} 条对话数据")
|
||||
return cleaned_data
|
||||
except Exception as e:
|
||||
print(f"数据预处理过程中出现错误: {e}")
|
||||
logger.error(f"数据预处理过程中出现错误: {e}")
|
||||
raise
|
||||
|
||||
|
||||
@@ -1961,7 +1961,7 @@ async def get_chunked_dialogs_from_preprocessed(
|
||||
Returns:
|
||||
带 chunks 的 DialogData 列表
|
||||
"""
|
||||
print(f"\n=== 批量对话分块处理 (使用 {chunker_strategy}) ===")
|
||||
logger.debug(f"=== 批量对话分块处理 (使用 {chunker_strategy}) ===")
|
||||
if not data:
|
||||
raise ValueError("预处理数据为空,无法进行分块")
|
||||
|
||||
@@ -2006,7 +2006,7 @@ async def get_chunked_dialogs_with_preprocessing(
|
||||
Returns:
|
||||
带 chunks 的 DialogData 列表
|
||||
"""
|
||||
print("\n=== 完整数据处理流程(包含预处理)===")
|
||||
logger.debug("=== 完整数据处理流程(包含预处理)===")
|
||||
|
||||
if input_data_path is None:
|
||||
input_data_path = os.path.join(
|
||||
@@ -2038,11 +2038,11 @@ async def get_chunked_dialogs_with_preprocessing(
|
||||
if pruning_config:
|
||||
# 使用传入的配置
|
||||
config = PruningConfig(**pruning_config)
|
||||
print(f"[剪枝] 使用传入配置: switch={config.pruning_switch}, scene={config.pruning_scene}, threshold={config.pruning_threshold}")
|
||||
logger.debug(f"[剪枝] 使用传入配置: switch={config.pruning_switch}, scene={config.pruning_scene}, threshold={config.pruning_threshold}")
|
||||
else:
|
||||
# 使用默认配置(关闭剪枝)
|
||||
config = None
|
||||
print("[剪枝] 未提供配置,使用默认配置(剪枝关闭)")
|
||||
logger.debug("[剪枝] 未提供配置,使用默认配置(剪枝关闭)")
|
||||
|
||||
pruner = SemanticPruner(config=config, llm_client=llm_client)
|
||||
|
||||
@@ -2057,12 +2057,12 @@ async def get_chunked_dialogs_with_preprocessing(
|
||||
if len(preprocessed_data) == 1 and single_dialog_original_msgs is not None:
|
||||
remaining_msgs = len(preprocessed_data[0].context.msgs) if preprocessed_data[0].context else 0
|
||||
deleted_msgs = max(0, single_dialog_original_msgs - remaining_msgs)
|
||||
print(
|
||||
logger.debug(
|
||||
f"语义剪枝完成!剩余 1 条对话!原始消息数:{single_dialog_original_msgs},"
|
||||
f"保留消息数:{remaining_msgs},删除 {deleted_msgs} 条。"
|
||||
)
|
||||
else:
|
||||
print(f"语义剪枝完成!剩余 {len(preprocessed_data)} 条对话")
|
||||
logger.debug(f"语义剪枝完成!剩余 {len(preprocessed_data)} 条对话")
|
||||
|
||||
# 保存剪枝后的数据
|
||||
try:
|
||||
@@ -2073,9 +2073,9 @@ async def get_chunked_dialogs_with_preprocessing(
|
||||
dp = DataPreprocessor(output_file_path=pruned_output_path)
|
||||
dp.save_data(preprocessed_data, output_path=pruned_output_path)
|
||||
except Exception as se:
|
||||
print(f"保存剪枝结果失败:{se}")
|
||||
logger.error(f"保存剪枝结果失败:{se}")
|
||||
except Exception as e:
|
||||
print(f"语义剪枝过程中出现错误,跳过剪枝: {e}")
|
||||
logger.error(f"语义剪枝过程中出现错误,跳过剪枝: {e}")
|
||||
|
||||
# 步骤3: 对话分块
|
||||
return await get_chunked_dialogs_from_preprocessed(
|
||||
|
||||
@@ -140,12 +140,22 @@ async def run_pilot_extraction(
|
||||
remaining_msg_count = len(remaining_messages)
|
||||
deleted_msg_count = original_msg_count - remaining_msg_count
|
||||
|
||||
# 找出被删除的消息(通过内容对比)
|
||||
remaining_contents = {msg["content"] for msg in remaining_messages}
|
||||
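# Find the deleted messages: walk the original messages in order, match each against the next remaining message by role and content, and record the original indices that were kept (assumes pruning preserves message order)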
|
||||
# 为剩余消息创建带索引的列表,用于精确追踪
|
||||
remaining_with_index = []
|
||||
remaining_idx = 0
|
||||
for orig_idx, orig_msg in enumerate(original_messages):
|
||||
if remaining_idx < len(remaining_messages) and \
|
||||
orig_msg["role"] == remaining_messages[remaining_idx]["role"] and \
|
||||
orig_msg["content"] == remaining_messages[remaining_idx]["content"]:
|
||||
remaining_with_index.append(orig_idx)
|
||||
remaining_idx += 1
|
||||
|
||||
# 找出未在保留列表中的消息索引
|
||||
deleted_messages = [
|
||||
{"index": idx, "role": msg["role"], "content": msg["content"]}
|
||||
for idx, msg in enumerate(original_messages)
|
||||
if msg["content"] not in remaining_contents
|
||||
if idx not in remaining_with_index
|
||||
]
|
||||
|
||||
# 保存剪枝统计信息(用于最终汇总,只保留deleted_count)
|
||||
|
||||
Reference in New Issue
Block a user