Fix/interest distribution (#445)
* [fix] Revising the judgment method for the interest analysis tags
* [fix] Revising the judgment method for the interest analysis tags
* [add] Set cache for the distribution of interest tags
* [fix] Revising the judgment method for the interest analysis tags
* [add] Set cache for the distribution of interest tags
* [changes] 1. Use structured logs;
2. Align the type and default value of "end_user_id" with its "required" semantics.
This commit is contained in:
@@ -229,7 +229,7 @@ class Settings:
|
||||
# General Ontology Type Configuration
|
||||
# ========================================================================
|
||||
# 通用本体文件路径列表(逗号分隔)
|
||||
GENERAL_ONTOLOGY_FILES: str = os.getenv("GENERAL_ONTOLOGY_FILES", "app/core/memory/ontology_services/General_purpose_entity.ttl")
|
||||
GENERAL_ONTOLOGY_FILES: str = os.getenv("GENERAL_ONTOLOGY_FILES", "api/app/core/memory/ontology_services/General_purpose_entity.ttl")
|
||||
|
||||
# 是否启用通用本体类型功能
|
||||
ENABLE_GENERAL_ONTOLOGY_TYPES: bool = os.getenv("ENABLE_GENERAL_ONTOLOGY_TYPES", "true").lower() == "true"
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
|
||||
from app.db import get_db_context
|
||||
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
|
||||
@@ -16,6 +19,10 @@ class FilteredTags(BaseModel):
|
||||
"""用于接收LLM筛选后的核心标签列表的模型。"""
|
||||
meaningful_tags: List[str] = Field(..., description="从原始列表中筛选出的具有核心代表意义的名词列表。")
|
||||
|
||||
# Structured-output schema: this class is passed as ``response_model`` to
# ``llm_client.response_structured`` when the LLM is asked to pick
# interest-activity tags, and the result is read back via ``.interest_tags``.
# NOTE(review): the docstring and the Field description below are deliberately
# left untranslated — schema libraries such as pydantic may expose both in the
# schema shown to the LLM, so editing them could change model behaviour; confirm
# before translating.
class InterestTags(BaseModel):
    """用于接收LLM筛选后的兴趣活动标签列表的模型。"""
    # English gloss of the docstring: model that receives the list of
    # interest-activity tags selected by the LLM.
    # Required field (``...`` means no default). English gloss of the
    # description: tags picked from the original list that represent the
    # user's interest activities.
    interest_tags: List[str] = Field(..., description="从原始列表中筛选出的代表用户兴趣活动的标签列表。")
|
||||
|
||||
async def filter_tags_with_llm(tags: List[str], end_user_id: str) -> List[str]:
|
||||
"""
|
||||
使用LLM筛选标签列表,仅保留具有代表性的核心名词。
|
||||
@@ -85,10 +92,74 @@ async def filter_tags_with_llm(tags: List[str], end_user_id: str) -> List[str]:
|
||||
return structured_response.meaningful_tags
|
||||
|
||||
except Exception as e:
|
||||
print(f"LLM筛选过程中发生错误: {e}")
|
||||
logger.error(f"LLM筛选过程中发生错误: {e}", exc_info=True)
|
||||
# 在LLM失败时返回原始标签,确保流程继续
|
||||
return tags
|
||||
|
||||
async def filter_interests_with_llm(tags: List[str], end_user_id: str, language: str = "zh") -> List[str]:
    """
    Use the LLM to reduce a tag list to labels describing the user's interest activities.

    Unlike filter_tags_with_llm, the prompt rendered here is aimed at
    activity/behaviour style interests; plain objects, tools and places that do
    not reflect something the user actively does are meant to be dropped by the
    prompt.

    Args:
        tags: Raw tag list.
        end_user_id: End-user ID used to look up the connected memory/LLM configuration.
        language: Language code forwarded to render_interest_filter_prompt (default "zh").

    Returns:
        The ``interest_tags`` list of the LLM's structured response. An empty
        ``tags`` input yields ``[]`` without touching the database or the LLM.
        If any step fails, the error is logged and ``tags`` is returned
        unchanged so the caller's flow can continue.
    """
    # Nothing to filter: skip the config lookup and the LLM round trip entirely.
    if not tags:
        return []

    try:
        with get_db_context() as db:
            # Function-scope import kept from the original.
            # NOTE(review): presumably avoids a circular import — confirm.
            from app.services.memory_agent_service import (
                get_end_user_connected_config,
            )
            connected_config = get_end_user_connected_config(end_user_id, db)
            config_id = connected_config.get("memory_config_id")
            workspace_id = connected_config.get("workspace_id")

            # Either identifier is enough to load a memory config; fail only
            # when both are missing.
            if not config_id and not workspace_id:
                raise ValueError(
                    f"No memory_config_id found for end_user_id: {end_user_id}."
                )

            config_service = MemoryConfigService(db)
            memory_config = config_service.load_memory_config(
                config_id=config_id,
                workspace_id=workspace_id
            )

            if not memory_config.llm_model_id:
                raise ValueError(
                    f"No llm_model_id found in memory config {config_id}."
                )

            factory = MemoryClientFactory(db)
            llm_client = factory.get_llm_client(memory_config.llm_model_id)

            tag_list_str = ", ".join(tags)
            from app.core.memory.utils.prompt.prompt_utils import render_interest_filter_prompt
            rendered_prompt = render_interest_filter_prompt(tag_list_str, language=language)
            messages = [
                {
                    "role": "user",
                    "content": rendered_prompt
                }
            ]

            # The LLM call stays inside the DB context, as in the original.
            structured_response = await llm_client.response_structured(
                messages=messages,
                response_model=InterestTags
            )

            return structured_response.interest_tags

    except Exception as e:
        # Best-effort by design: log with traceback and fall back to the raw tags.
        logger.exception("兴趣标签LLM筛选过程中发生错误: %s", e)
        return tags
|
||||
|
||||
|
||||
async def get_raw_tags_from_db(
|
||||
connector: Neo4jConnector,
|
||||
end_user_id: str,
|
||||
@@ -183,3 +254,56 @@ async def get_hot_memory_tags(end_user_id: str, limit: int = 10, by_user: bool =
|
||||
finally:
|
||||
# 确保关闭连接
|
||||
await connector.close()
|
||||
|
||||
def _rank_interest_tags(interest_tag_names: List[str], raw_freq_map: dict, limit: int) -> List[Tuple[str, int]]:
    """Deduplicate interest tags, attach frequencies, and return the top ``limit`` by frequency."""
    ranked = []
    seen = set()
    for tag in interest_tag_names:
        # LLM output is free text: a blank label carries no information, and
        # repeated labels must only be counted once.
        if not isinstance(tag, str) or not tag.strip() or tag in seen:
            continue
        seen.add(tag)
        # Tags present in the raw data keep their stored frequency; labels the
        # LLM inferred (absent from the raw list) get the default frequency 1.
        ranked.append((tag, raw_freq_map.get(tag, 1)))

    # Stable sort: ties keep the order in which the LLM returned them.
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:limit]


async def get_interest_distribution(end_user_id: str, limit: int = 10, by_user: bool = False, language: str = "zh") -> List[Tuple[str, int]]:
    """
    Return the user's interest distribution as (tag, frequency) pairs.

    Unlike get_hot_memory_tags, this goes through filter_interests_with_llm,
    whose prompt targets activity/behaviour style interests rather than plain
    objects, tools or places.

    Args:
        end_user_id: Required. The end_user_id when by_user=False; the user_id when by_user=True.
        limit: Maximum number of tags returned (default 10). A value <= 0 yields an empty list.
        by_user: Query by user_id instead of end_user_id (default False).
        language: Language code forwarded to filter_interests_with_llm (default "zh").

    Returns:
        Up to ``limit`` (tag, frequency) tuples sorted by frequency, highest
        first. Empty when no raw tags are found.

    Raises:
        ValueError: If end_user_id is missing or blank.
    """
    if not end_user_id or not end_user_id.strip():
        raise ValueError(
            "end_user_id is required. Please provide a valid end_user_id or user_id."
        )

    # A non-positive limit can never produce tags. Returning here avoids a
    # pointless Neo4j query and LLM call, and stops a negative limit from
    # slicing off the tail of the result instead of capping it.
    if limit <= 0:
        return []

    connector = Neo4jConnector()
    try:
        # Fetch more raw tags than requested so the LLM has enough context.
        query_limit = 40
        raw_tags_with_freq = await get_raw_tags_from_db(connector, end_user_id, query_limit, by_user=by_user)
        if not raw_tags_with_freq:
            return []

        raw_tag_names = [tag for tag, _freq in raw_tags_with_freq]
        raw_freq_map = {tag: freq for tag, freq in raw_tags_with_freq}

        # Interest-specific prompt; it may infer labels that are not in the raw list.
        # NOTE(review): when by_user=True the value passed here is a user_id,
        # while filter_interests_with_llm looks up config by end_user_id — confirm
        # this is intended.
        interest_tag_names = await filter_interests_with_llm(raw_tag_names, end_user_id, language=language)

        return _rank_interest_tags(interest_tag_names, raw_freq_map, limit)
    finally:
        # Always release the Neo4j connection.
        await connector.close()
|
||||
|
||||
@@ -548,3 +548,20 @@ async def render_ontology_extraction_prompt(
|
||||
})
|
||||
|
||||
return rendered_prompt
|
||||
|
||||
|
||||
def render_interest_filter_prompt(tag_list: str, language: str = "zh") -> str:
    """
    Build the interest-filter prompt from the interest_filter.jinja2 template.

    Args:
        tag_list: Raw tags to filter, joined into one comma-separated string
        language: Output language passed to the template ("zh" for Chinese, "en" for English)

    Returns:
        The rendered prompt text
    """
    prompt_text = prompt_env.get_template("interest_filter.jinja2").render(
        tag_list=tag_list,
        language=language,
    )
    # Record the rendered prompt through the shared prompt-logging helper.
    log_prompt_rendering('interest filter', prompt_text)
    return prompt_text
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
{% if language == "zh" %}
|
||||
You are a user interest analysis expert. Your task is to infer and extract the user's core hobby/interest activities from a tag list. The tags may be specific project names, tool names, or compound nouns — your job is to identify the underlying interest they represent.
|
||||
|
||||
**Step 1 - Infer the underlying interest from each tag**:
|
||||
Look at each tag and ask: "What hobby or interest does this tag suggest the user has?"
|
||||
|
||||
Examples of inference:
|
||||
- '攀岩', '室内攀岩馆', '攀岩者数据仪表盘', '路线解锁地图', '指力', '路线等级', '当日攀岩流畅度' → '攀岩'
|
||||
- '风光摄影元数据增强器', 'EXIF数据', '.CR2文件', '.NEF文件', '日出拍摄点', '曝光补偿', '光圈', '太阳高度角', '云量预测图层' → '摄影'
|
||||
- '晨间冥想坚持天数', '身心协同峰值' → '冥想'
|
||||
- '川味可视化', '川菜' → '烹饪'
|
||||
- '开源项目命名建议', 'climbviz', '可视化', '力量增长雷达图' → '编程' 或 '数据可视化'
|
||||
- '吉他', '指弹', '琴谱' → '吉他'
|
||||
- '跑步', '5公里', '跑鞋' → '跑步'
|
||||
- '瑜伽垫', '瑜伽课' → '瑜伽'
|
||||
|
||||
**Step 2 - Consolidate and deduplicate**:
|
||||
- Merge tags that point to the same interest into one representative label
|
||||
- Use concise, standard hobby names (e.g., '攀岩', '摄影', '编程', '烹饪', '冥想', '吉他', '跑步')
|
||||
- If multiple tags all point to '攀岩', output '攀岩' only once
|
||||
|
||||
**Step 3 - Filter out non-interest tags**:
|
||||
Remove tags that do NOT suggest any hobby or interest:
|
||||
- Generic system/assistant terms (e.g., '助手', '用户', 'AI')
|
||||
- Pure abstract metrics with no clear hobby link (e.g., '完成时间', '日期', '自我评分')
|
||||
- Location names with no clear hobby link (e.g., '青城山后山' alone — but if combined with photography context, infer '摄影')
|
||||
|
||||
**Output format**: Return a list of concise interest activity names in Chinese.
|
||||
|
||||
**Example**:
|
||||
Input: ['攀岩', '攀岩者数据仪表盘', '路线解锁地图', '指力', '风光摄影元数据增强器', 'EXIF数据', '晨间冥想坚持天数', '川味可视化', '可视化', '助手', '完成时间']
|
||||
Output: ['攀岩', '摄影', '冥想', '烹饪', '编程']
|
||||
|
||||
Now process the following tag list and return the inferred interest activities in Chinese: {{ tag_list }}
|
||||
{% else %}
|
||||
You are a user interest analysis expert. Your task is to infer and extract the user's core hobby/interest activities from a tag list. The tags may be specific project names, tool names, or compound nouns — your job is to identify the underlying interest they represent.
|
||||
|
||||
**Step 1 - Infer the underlying interest from each tag**:
|
||||
Look at each tag and ask: "What hobby or interest does this tag suggest the user has?"
|
||||
|
||||
Examples of inference:
|
||||
- 'rock climbing', 'indoor climbing gym', 'climber dashboard', 'route map', 'finger strength' → 'rock climbing'
|
||||
- 'landscape photography metadata enhancer', 'EXIF data', 'sunrise shooting spot', 'exposure compensation' → 'photography'
|
||||
- 'morning meditation streak', 'mind-body peak' → 'meditation'
|
||||
- 'Sichuan cuisine visualization', 'Sichuan food' → 'cooking'
|
||||
- 'open source project', 'data visualization tool', 'Python' → 'programming'
|
||||
- 'guitar', 'fingerpicking', 'sheet music' → 'guitar'
|
||||
- 'running', '5km', 'running shoes' → 'running'
|
||||
|
||||
**Step 2 - Consolidate and deduplicate**:
|
||||
- Merge tags that point to the same interest into one representative label
|
||||
- Use concise, standard hobby names (e.g., 'rock climbing', 'photography', 'programming', 'cooking', 'meditation')
|
||||
- If multiple tags all point to 'rock climbing', output 'rock climbing' only once
|
||||
|
||||
**Step 3 - Filter out non-interest tags**:
|
||||
Remove tags that do NOT suggest any hobby or interest:
|
||||
- Generic system/assistant terms (e.g., 'assistant', 'user', 'AI')
|
||||
- Pure abstract metrics with no clear hobby link (e.g., 'completion time', 'date', 'self-rating')
|
||||
|
||||
**Output format**: Return a list of concise interest activity names in English.
|
||||
|
||||
**Example**:
|
||||
Input: ['rock climbing', 'climber dashboard', 'route map', 'finger strength', 'landscape photography metadata enhancer', 'EXIF data', 'morning meditation streak', 'Sichuan cuisine visualization', 'visualization', 'assistant', 'completion time']
|
||||
Output: ['rock climbing', 'photography', 'meditation', 'cooking', 'programming']
|
||||
|
||||
Now process the following tag list and return the inferred interest activities in English: {{ tag_list }}
|
||||
{% endif %}
|
||||
Reference in New Issue
Block a user