Fix/interest distribution (#445)

* [fix] Revising the judgment method for the interest analysis tags * [fix] Revising the judgment method for the interest analysis tags * [add] Set cache for the distribution of interest tags * [fix] Revising the judgment method for the interest analysis tags * [add] Set cache for the distribution of interest tags * [changes] 1. Use structured logs; 2. Align the type and default value of "end_user_id" with the semantic meaning of "required".
2026-03-04 14:06:50 +08:00
parent da4a1f536d c488eb0cd0
commit 32e79c5df0
10 changed files with 390 additions and 34 deletions
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -229,7 +229,7 @@ class Settings:
    # General Ontology Type Configuration
    # ========================================================================
    # 通用本体文件路径列表（逗号分隔）
-    GENERAL_ONTOLOGY_FILES: str = os.getenv("GENERAL_ONTOLOGY_FILES", "app/core/memory/ontology_services/General_purpose_entity.ttl")
+    GENERAL_ONTOLOGY_FILES: str = os.getenv("GENERAL_ONTOLOGY_FILES", "api/app/core/memory/ontology_services/General_purpose_entity.ttl")

    # 是否启用通用本体类型功能
    ENABLE_GENERAL_ONTOLOGY_TYPES: bool = os.getenv("ENABLE_GENERAL_ONTOLOGY_TYPES", "true").lower() == "true"
--- a/api/app/core/memory/analytics/hot_memory_tags.py
+++ b/api/app/core/memory/analytics/hot_memory_tags.py
@@ -1,9 +1,12 @@
 import asyncio
 import json
+import logging
 import os
 from typing import List, Tuple

 from app.core.config import settings
+
+logger = logging.getLogger(__name__)
 from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
 from app.db import get_db_context
 from app.repositories.neo4j.neo4j_connector import Neo4jConnector
@@ -16,6 +19,10 @@ class FilteredTags(BaseModel):
    """用于接收LLM筛选后的核心标签列表的模型。"""
    meaningful_tags: List[str] = Field(..., description="从原始列表中筛选出的具有核心代表意义的名词列表。")

+class InterestTags(BaseModel):
+    """Model that receives the list of interest-activity tags selected by the LLM."""
+    interest_tags: List[str] = Field(..., description="从原始列表中筛选出的代表用户兴趣活动的标签列表。")
+
 async def filter_tags_with_llm(tags: List[str], end_user_id: str) -> List[str]:
    """
    使用LLM筛选标签列表，仅保留具有代表性的核心名词。
@@ -85,10 +92,74 @@ async def filter_tags_with_llm(tags: List[str], end_user_id: str) -> List[str]:
        return structured_response.meaningful_tags

    except Exception as e:
-        print(f"LLM筛选过程中发生错误: {e}")
+        logger.error(f"LLM筛选过程中发生错误: {e}", exc_info=True)
        # 在LLM失败时返回原始标签，确保流程继续
        return tags

+async def filter_interests_with_llm(tags: List[str], end_user_id: str, language: str = "zh") -> List[str]:
+    """
+    Use the LLM to select, from a tag list, the tags that represent the user's interest activities.
+
+    Unlike filter_tags_with_llm, this targets "activity/behavior" interests and drops plain objects, tools, places, etc.
+
+    Args:
+        tags: Raw tag list.
+        end_user_id: User ID, used to look up the LLM configuration.
+        language: Forwarded to render_interest_filter_prompt (defaults to "zh").
+
+    Returns:
+        The interest tags from the LLM's structured response; on any exception the error is logged and `tags` is returned unchanged.
+    """
+    try:
+        with get_db_context() as db:
+            from app.services.memory_agent_service import (
+                get_end_user_connected_config,
+            )
+            connected_config = get_end_user_connected_config(end_user_id, db)
+            config_id = connected_config.get("memory_config_id")
+            workspace_id = connected_config.get("workspace_id")
+
+            if not config_id and not workspace_id:  # NOTE(review): raises only when BOTH are missing, yet the message names memory_config_id alone - confirm intent
+                raise ValueError(
+                    f"No memory_config_id found for end_user_id: {end_user_id}."
+                )
+
+            config_service = MemoryConfigService(db)
+            memory_config = config_service.load_memory_config(
+                config_id=config_id,
+                workspace_id=workspace_id
+            )
+
+            if not memory_config.llm_model_id:
+                raise ValueError(
+                    f"No llm_model_id found in memory config {config_id}."
+                )
+
+            factory = MemoryClientFactory(db)
+            llm_client = factory.get_llm_client(memory_config.llm_model_id)  # NOTE(review): used below after the db context has exited - confirm it does not need the session
+
+        tag_list_str = ", ".join(tags)
+        from app.core.memory.utils.prompt.prompt_utils import render_interest_filter_prompt
+        rendered_prompt = render_interest_filter_prompt(tag_list_str, language=language)
+        messages = [
+            {
+                "role": "user",
+                "content": rendered_prompt
+            }
+        ]
+
+        structured_response = await llm_client.response_structured(
+            messages=messages,
+            response_model=InterestTags
+        )
+
+        return structured_response.interest_tags
+
+    except Exception as e:
+        logger.error(f"兴趣标签LLM筛选过程中发生错误: {e}", exc_info=True)
+        return tags  # Best-effort fallback: hand back the unfiltered input tags
+
+
 async def get_raw_tags_from_db(
    connector: Neo4jConnector,
    end_user_id: str,
@@ -183,3 +254,56 @@ async def get_hot_memory_tags(end_user_id: str, limit: int = 10, by_user: bool =
    finally:
        # 确保关闭连接
        await connector.close()
+
+async def get_interest_distribution(end_user_id: str, limit: int = 10, by_user: bool = False, language: str = "zh") -> List[Tuple[str, int]]:
+    """
+    Get the user's interest-distribution tags.
+
+    Unlike get_hot_memory_tags, this uses an LLM prompt dedicated to "activities/behaviors" and keeps only hobby-like tags.
+    Args:
+        end_user_id: Required. The end_user_id when by_user=False, or the user_id when by_user=True.
+        limit: Maximum number of tags returned (default 10).
+        by_user: Whether to query by user_id (default False, i.e. query by end_user_id).
+        language: Forwarded to filter_interests_with_llm (defaults to "zh").
+    Returns:
+        (tag, frequency) tuples sorted by frequency in descending order, truncated to `limit`; [] when no raw tags exist.
+    Raises:
+        ValueError: If end_user_id is missing, empty, or whitespace-only.
+    """
+    if not end_user_id or not end_user_id.strip():
+        raise ValueError(
+            "end_user_id is required. Please provide a valid end_user_id or user_id."
+        )
+
+    connector = Neo4jConnector()
+    try:
+        # Query more raw tags than `limit` to give the LLM ample context
+        query_limit = 40
+        raw_tags_with_freq = await get_raw_tags_from_db(connector, end_user_id, query_limit, by_user=by_user)
+        if not raw_tags_with_freq:
+            return []
+
+        raw_tag_names = [tag for tag, freq in raw_tags_with_freq]
+        raw_freq_map = {tag: freq for tag, freq in raw_tags_with_freq}
+
+        # Filter with the interest-activity prompt (the LLM may infer new tags semantically)
+        interest_tag_names = await filter_interests_with_llm(raw_tag_names, end_user_id, language=language)
+
+        # Build the final tag list:
+        # - tags present in the raw list keep their raw frequency
+        # - tags newly inferred by the LLM (not in the raw list) get a default frequency of 1
+        final_tags = []
+        seen = set()
+        for tag in interest_tag_names:
+            if tag in seen:
+                continue
+            seen.add(tag)
+            freq = raw_freq_map.get(tag, 1)
+            final_tags.append((tag, freq))
+
+        # Sort by frequency, descending
+        final_tags.sort(key=lambda x: x[1], reverse=True)
+
+        return final_tags[:limit]
+    finally:
+        await connector.close()
--- a/api/app/core/memory/utils/prompt/prompt_utils.py
+++ b/api/app/core/memory/utils/prompt/prompt_utils.py
@@ -548,3 +548,20 @@ async def render_ontology_extraction_prompt(
    })
    
    return rendered_prompt
+
+
+def render_interest_filter_prompt(tag_list: str, language: str = "zh") -> str:
+    """
+    Render the interest filter prompt from the interest_filter.jinja2 template and log the result.
+
+    Args:
+        tag_list: Comma-separated string of raw tags to filter
+        language: "zh" selects the prompt asking for Chinese output; any other value selects the English one
+
+    Returns:
+        Rendered prompt content as string
+    """
+    template = prompt_env.get_template("interest_filter.jinja2")
+    rendered_prompt = template.render(tag_list=tag_list, language=language)
+    log_prompt_rendering('interest filter', rendered_prompt)
+    return rendered_prompt
--- a/api/app/core/memory/utils/prompt/prompts/interest_filter.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/interest_filter.jinja2
@@ -0,0 +1,67 @@
+{% if language == "zh" %}
+You are a user interest analysis expert. Your task is to infer and extract the user's core hobby/interest activities from a tag list. The tags may be specific project names, tool names, or compound nouns — your job is to identify the underlying interest they represent.
+
+**Step 1 - Infer the underlying interest from each tag**:
+Look at each tag and ask: "What hobby or interest does this tag suggest the user has?"
+
+Examples of inference:
+- '攀岩', '室内攀岩馆', '攀岩者数据仪表盘', '路线解锁地图', '指力', '路线等级', '当日攀岩流畅度' → '攀岩'
+- '风光摄影元数据增强器', 'EXIF数据', '.CR2文件', '.NEF文件', '日出拍摄点', '曝光补偿', '光圈', '太阳高度角', '云量预测图层' → '摄影'
+- '晨间冥想坚持天数', '身心协同峰值' → '冥想'
+- '川味可视化', '川菜' → '烹饪'
+- '开源项目命名建议', 'climbviz', '可视化', '力量增长雷达图' → '编程' 或 '数据可视化'
+- '吉他', '指弹', '琴谱' → '吉他'
+- '跑步', '5公里', '跑鞋' → '跑步'
+- '瑜伽垫', '瑜伽课' → '瑜伽'
+
+**Step 2 - Consolidate and deduplicate**:
+- Merge tags that point to the same interest into one representative label
+- Use concise, standard hobby names (e.g., '攀岩', '摄影', '编程', '烹饪', '冥想', '吉他', '跑步')
+- If multiple tags all point to '攀岩', output '攀岩' only once
+
+**Step 3 - Filter out non-interest tags**:
+Remove tags that do NOT suggest any hobby or interest:
+- Generic system/assistant terms (e.g., '助手', '用户', 'AI')
+- Pure abstract metrics with no clear hobby link (e.g., '完成时间', '日期', '自我评分')
+- Location names with no clear hobby link (e.g., '青城山后山' alone — but if combined with photography context, infer '摄影')
+
+**Output format**: Return a list of concise interest activity names in Chinese.
+
+**Example**:
+Input: ['攀岩', '攀岩者数据仪表盘', '路线解锁地图', '指力', '风光摄影元数据增强器', 'EXIF数据', '晨间冥想坚持天数', '川味可视化', '可视化', '助手', '完成时间']
+Output: ['攀岩', '摄影', '冥想', '烹饪', '编程']
+
+Now process the following tag list and return the inferred interest activities in Chinese: {{ tag_list }}
+{% else %}
+You are a user interest analysis expert. Your task is to infer and extract the user's core hobby/interest activities from a tag list. The tags may be specific project names, tool names, or compound nouns — your job is to identify the underlying interest they represent.
+
+**Step 1 - Infer the underlying interest from each tag**:
+Look at each tag and ask: "What hobby or interest does this tag suggest the user has?"
+
+Examples of inference:
+- 'rock climbing', 'indoor climbing gym', 'climber dashboard', 'route map', 'finger strength' → 'rock climbing'
+- 'landscape photography metadata enhancer', 'EXIF data', 'sunrise shooting spot', 'exposure compensation' → 'photography'
+- 'morning meditation streak', 'mind-body peak' → 'meditation'
+- 'Sichuan cuisine visualization', 'Sichuan food' → 'cooking'
+- 'open source project', 'data visualization tool', 'Python' → 'programming'
+- 'guitar', 'fingerpicking', 'sheet music' → 'guitar'
+- 'running', '5km', 'running shoes' → 'running'
+
+**Step 2 - Consolidate and deduplicate**:
+- Merge tags that point to the same interest into one representative label
+- Use concise, standard hobby names (e.g., 'rock climbing', 'photography', 'programming', 'cooking', 'meditation')
+- If multiple tags all point to 'rock climbing', output 'rock climbing' only once
+
+**Step 3 - Filter out non-interest tags**:
+Remove tags that do NOT suggest any hobby or interest:
+- Generic system/assistant terms (e.g., 'assistant', 'user', 'AI')
+- Pure abstract metrics with no clear hobby link (e.g., 'completion time', 'date', 'self-rating')
+
+**Output format**: Return a list of concise interest activity names in English.
+
+**Example**:
+Input: ['rock climbing', 'climber dashboard', 'route map', 'finger strength', 'landscape photography metadata enhancer', 'EXIF data', 'morning meditation streak', 'Sichuan cuisine visualization', 'visualization', 'assistant', 'completion time']
+Output: ['rock climbing', 'photography', 'meditation', 'cooking', 'programming']
+
+Now process the following tag list and return the inferred interest activities in English: {{ tag_list }}
+{% endif %}