Merge branch 'refs/heads/develop' into fix/memory_bug_fix

# Conflicts: # api/app/services/memory_agent_service.py
2026-01-23 14:57:25 +08:00
parent ebe018347b 313f19eba4
commit 21c0383efb
38 changed files with 685 additions and 353 deletions
--- a/api/app/services/memory_storage_service.py
+++ b/api/app/services/memory_storage_service.py
@@ -12,7 +12,11 @@ from datetime import datetime
 from typing import Any, AsyncGenerator, Dict, List, Optional

 from app.core.logging_config import get_config_logger, get_logger
-from app.core.memory.analytics.hot_memory_tags import get_hot_memory_tags
+from app.core.memory.analytics.hot_memory_tags import (
+    get_hot_memory_tags,
+    get_raw_tags_from_db,
+    filter_tags_with_llm,
+)
 from app.core.memory.analytics.recent_activity_stats import get_recent_activity_stats
 from app.models.user_model import User
 from app.repositories.memory_config_repository import MemoryConfigRepository
@@ -237,7 +241,8 @@ class DataConfigService: # 数据配置服务类（PostgreSQL）
            ValueError: 当配置无效或参数缺失时
            RuntimeError: 当管线执行失败时
        """
-        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+        from pathlib import Path
+        project_root = str(Path(__file__).resolve().parents[2])
        
        try:
            # 发出初始进度事件
@@ -512,27 +517,79 @@ async def analytics_hot_memory_tags(
 ) -> List[Dict[str, Any]]:
    """
    获取热门记忆标签，按数量排序并返回前N个
+    
+    优化策略：
+    1. 先从所有用户收集原始标签（不调用LLM）
+    2. 聚合并合并相同标签的频率
+    3. 排序后取前N个
+    4. 只调用一次LLM进行筛选
    """
    workspace_id = current_user.current_workspace_id
    # 获取更多标签供LLM筛选（获取limit*4个标签）
    raw_limit = limit * 4
    from app.services.memory_dashboard_service import get_workspace_end_users
-    end_users = get_workspace_end_users(db, workspace_id, current_user)
+    # 使用 asyncio.to_thread 避免阻塞事件循环
+    end_users = await asyncio.to_thread(get_workspace_end_users, db, workspace_id, current_user)
    
-    tags = []
-    for end_user in end_users:
-        tag = await get_hot_memory_tags(str(end_user.id), limit=raw_limit)
-        if tag:
-            # 将每个用户的标签列表展平到总列表中
-            tags.extend(tag)
-
-    # 按频率降序排序（虽然数据库已经排序，但为了确保正确性再次排序）
-    sorted_tags = sorted(tags, key=lambda x: x[1], reverse=True)
+    if not end_users:
+        return []
    
-    # 只返回前limit个
-    top_tags = sorted_tags[:limit]
-    
-    return [{"name": t, "frequency": f} for t, f in top_tags]
+    # 步骤1: 收集所有用户的原始标签（不调用LLM）
+    connector = Neo4jConnector()
+    try:
+        all_raw_tags = []
+        for end_user in end_users:
+            raw_tags = await get_raw_tags_from_db(
+                connector, 
+                str(end_user.id), 
+                limit=raw_limit, 
+                by_user=False
+            )
+            if raw_tags:
+                all_raw_tags.extend(raw_tags)
+        
+        if not all_raw_tags:
+            return []
+        
+        # 步骤2: 聚合相同标签的频率
+        tag_frequency_map = {}
+        for tag_name, frequency in all_raw_tags:
+            if tag_name in tag_frequency_map:
+                tag_frequency_map[tag_name] += frequency
+            else:
+                tag_frequency_map[tag_name] = frequency
+        
+        # 步骤3: 按频率降序排序，取前raw_limit个
+        sorted_tags = sorted(
+            tag_frequency_map.items(), 
+            key=lambda x: x[1], 
+            reverse=True
+        )[:raw_limit]
+        
+        if not sorted_tags:
+            return []
+        
+        # 步骤4: 只调用一次LLM进行筛选
+        tag_names = [tag for tag, _ in sorted_tags]
+        
+        # 使用第一个用户的group_id来获取LLM配置
+        # 因为同一工作空间下的用户应该使用相同的配置
+        first_end_user_id = str(end_users[0].id)
+        filtered_tag_names = await filter_tags_with_llm(tag_names, first_end_user_id)
+        
+        # 步骤5: 根据LLM筛选结果构建最终列表（保留频率）
+        final_tags = []
+        for tag, freq in sorted_tags:
+            if tag in filtered_tag_names:
+                final_tags.append((tag, freq))
+        
+        # 步骤6: 只返回前limit个
+        top_tags = final_tags[:limit]
+        
+        return [{"name": t, "frequency": f} for t, f in top_tags]
+        
+    finally:
+        await connector.close()


 async def analytics_recent_activity_stats() -> Dict[str, Any]: