From 349d46e043eb1698f547d564829a5fc2cd444bf8 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Mon, 9 Mar 2026 11:26:54 +0800 Subject: [PATCH] [changes] Add restriction words to prevent "implicit" and "emotional" content from being mistakenly pruned. --- .../data_preprocessing/data_pruning.py | 10 ++++++++++ .../data_preprocessing/scene_config.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 904b238f..ecbe0411 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -252,6 +252,16 @@ class SemanticPruner: if re.fullmatch(r"[。!?,.!?…·\s]+", t): return True + # 安全防线:包含情绪词或兴趣词的消息,无论多短都不视为填充 + # 避免"我好开心呀"、"好喜欢打羽毛球呀"等被误删 + _emotion_interest_guard = re.compile( + r"开心|高兴|快乐|幸福|感动|难过|悲伤|伤心|委屈|失落|沮丧|郁闷|" + r"生气|愤怒|烦躁|焦虑|害怕|担心|压力|兴奋|期待|" + r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|讨厌|厌恶" + ) + if _emotion_interest_guard.search(t): + return False + return False async def _batch_evaluate_importance_with_llm( diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py index ed9592af..a79ebea5 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/scene_config.py @@ -51,6 +51,22 @@ class SceneConfigRegistry: (r"今天|明天|后天|昨天|前天", 3), # 相对时间(提高权重) (r"下周|下月|下年|上周|上月|上年|本周|本月|本年", 3), (r"今年|去年|明年", 3), + # ---- 情绪内容(所有场景通用,用于情绪提取) ---- + (r"开心|高兴|快乐|兴奋|愉快|幸福|满足|喜悦|欣喜", 4), + (r"难过|悲伤|伤心|痛苦|委屈|失落|沮丧|郁闷|忧郁|绝望", 4), + (r"生气|愤怒|烦躁|焦虑|紧张|害怕|恐惧|担心|担忧|压力", 4), + 
(r"感动|温暖|感激|感谢|惊喜|期待|憧憬|向往", 3), + (r"无聊|无奈|尴尬|后悔|遗憾|羞愧|惭愧", 3), + (r"好[开高快]心|很[开高快]心|超[开高快]心|非常[开高快]心", 4), + (r"好难过|好伤心|好悲伤|好委屈|好痛苦", 4), + (r"好开心|好高兴|好快乐|好幸福|好感动", 4), + # ---- 兴趣/爱好内容(所有场景通用,用于兴趣提取) ---- + (r"喜欢|热爱|爱好|兴趣|擅长|享受|沉迷|着迷|痴迷", 4), + (r"不喜欢|讨厌|厌恶|反感|排斥", 3), + (r"羽毛球|篮球|足球|排球|乒乓球|网球|棒球|高尔夫", 4), + (r"游泳|跑步|健身|瑜伽|舞蹈|武术|骑行|登山|徒步", 4), + (r"音乐|唱歌|吉他|钢琴|绘画|摄影|书法|手工|烹饪", 4), + (r"游戏|电影|动漫|小说|阅读|旅游|美食|宠物", 3), ] BASE_LOW_PRIORITY = [ @@ -58,6 +74,8 @@ class SceneConfigRegistry: (r"\d{1,2}点\d{0,2}分?", 2), # 时间点 X点Y分 或 X点 (r"上午|下午|中午|晚上|早上|傍晚|凌晨", 2), # 时段(提高权重并扩充) (r"AM|PM|am|pm", 1), + # ---- 情绪程度副词(辅助情绪识别) ---- + (r"特别|非常|超级|极其|十分|很|好[开高快]|太.*了", 1), ] BASE_FILLERS = {