Fix/memory insights (#30)

* [fix]fix memory insights

* [fix]fix memory insights

* [fix]Based on the correction of the code by sourcery-ai
This commit is contained in:
乐力齐
2026-01-06 14:05:15 +08:00
committed by GitHub
parent 85c7e531e4
commit a0f19ace92
5 changed files with 294 additions and 516 deletions

View File

@@ -5,19 +5,16 @@ This module provides analytics and insights for the memory system.
Available functions:
- get_hot_memory_tags: Get hot memory tags by frequency
- MemoryInsight: Generate memory insight reports
- get_recent_activity_stats: Get recent activity statistics
- generate_user_summary: Generate user summary
Note: MemoryInsight and generate_user_summary have been moved to
app.services.user_memory_service for better architecture.
"""
from app.core.memory.analytics.hot_memory_tags import get_hot_memory_tags
from app.core.memory.analytics.memory_insight import MemoryInsight
from app.core.memory.analytics.recent_activity_stats import get_recent_activity_stats
from app.core.memory.analytics.user_summary import generate_user_summary
__all__ = [
"get_hot_memory_tags",
"MemoryInsight",
"get_recent_activity_stats",
"generate_user_summary",
]

View File

@@ -1,327 +0,0 @@
"""
This module provides the MemoryInsight class for analyzing user memory data.
MemoryInsight 是一个工具类,提供基础的数据获取和分析功能:
- get_domain_distribution(): 获取记忆领域分布
- get_active_periods(): 获取活跃时段
- get_social_connections(): 获取社交关联
业务逻辑如生成洞察报告应该在服务层user_memory_service.py中实现。
This script can be executed directly to test the memory insight generation for a test user.
"""
import asyncio
import json
import os
import sys
from collections import Counter
from datetime import datetime
# To run this script directly, we need to add the src directory to the Python path
# to resolve the inconsistent imports in other modules.
src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if src_path not in sys.path:
sys.path.insert(0, src_path)
from app.core.memory.analytics.hot_memory_tags import get_hot_memory_tags
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.db import get_db_context
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.services.memory_config_service import MemoryConfigService
from pydantic import BaseModel, Field
#TODO: Fix this
# Default values (previously from definitions.py)
DEFAULT_LLM_ID = os.getenv("SELECTED_LLM_ID", "openai/qwen-plus")
DEFAULT_GROUP_ID = os.getenv("SELECTED_GROUP_ID", "group_123")
# 定义用于LLM结构化输出的Pydantic模型
class TagClassification(BaseModel):
"""
Represents the classification of a tag into a specific domain.
"""
domain: str = Field(
...,
description="The domain the tag belongs to, chosen from the predefined list.",
examples=["教育", "学习", "工作", "旅行", "家庭", "运动", "社交", "娱乐", "健康", "其他"],
)
class InsightReport(BaseModel):
"""
Represents the final insight report generated by the LLM.
"""
report: str = Field(
...,
description="A comprehensive insight report in Chinese, summarizing the user's memory patterns.",
)
class MemoryInsight:
"""
Provides insights into user memories by analyzing various aspects of their data.
"""
def __init__(self, user_id: str):
self.user_id = user_id
self.neo4j_connector = Neo4jConnector()
# Get config_id using get_end_user_connected_config
with get_db_context() as db:
try:
from app.services.memory_agent_service import (
get_end_user_connected_config,
)
connected_config = get_end_user_connected_config(user_id, db)
config_id = connected_config.get("memory_config_id")
if config_id:
# Use the config_id to get the proper LLM client
config_service = MemoryConfigService(db)
memory_config = config_service.load_memory_config(config_id)
factory = MemoryClientFactory(db)
self.llm_client = factory.get_llm_client(memory_config.llm_model_id)
else:
# TODO: Remove DEFAULT_LLM_ID fallback once all users have proper config
# Fallback to default LLM if no config found
factory = MemoryClientFactory(db)
self.llm_client = factory.get_llm_client(DEFAULT_LLM_ID)
except Exception as e:
print(f"Failed to get user connected config, using default LLM: {e}")
# TODO: Remove DEFAULT_LLM_ID fallback once all users have proper config
# Fallback to default LLM
factory = MemoryClientFactory(db)
self.llm_client = factory.get_llm_client(DEFAULT_LLM_ID)
async def close(self):
"""关闭数据库连接。"""
await self.neo4j_connector.close()
async def get_domain_distribution(self) -> dict[str, float]:
"""
Calculates the distribution of memory domains based on hot tags.
"""
hot_tags = await get_hot_memory_tags(self.user_id)
if not hot_tags:
return {}
domain_counts = Counter()
for tag, _ in hot_tags:
prompt = f"""请将以下标签归类到最合适的领域中。
可选领域及其关键词:
- 教育:学校、课程、考试、培训、教学、学科、教师、学生、班级、作业、成绩、毕业、入学、校园、大学、中学、小学、教材、学位等
- 学习:自学、阅读、书籍、技能提升、知识积累、笔记、复习、练习、研究、历史知识、科学知识、文化知识、学术讨论、知识问答等
- 工作:职业、项目、会议、同事、业务、公司、办公、任务、客户、合同、职场、工作计划等
- 旅行:旅游、景点、出行、度假、酒店、机票、导游、风景、旅行计划等
- 家庭:亲人、父母、子女、配偶、家事、家庭活动、亲情、家庭聚会等
- 运动:健身、体育、锻炼、跑步、游泳、球类、瑜伽、运动计划等
- 社交:朋友、聚会、社交活动、派对、聊天、交友、社交网络等
- 娱乐:游戏、电影、音乐、休闲、综艺、动漫、小说、娱乐活动等
- 健康:医疗、养生、心理健康、体检、药物、疾病、保健、健康管理等
- 其他:确实无法归入以上任何类别的内容
标签: {tag}
分析步骤:
1. 仔细理解标签的核心含义和使用场景
2. 对比各个领域的关键词,找到最匹配的领域
3. 特别注意:
- 历史、科学、文化等知识性内容应归类为"学习"
- 学校、课程、考试等正式教育场景应归类为"教育"
- 只有在标签完全不属于上述9个具体领域时才选择"其他"
4. 如果标签与某个领域有任何相关性,就选择该领域,不要选"其他"
请直接返回最合适的领域名称。"""
messages = [
{"role": "system", "content": "你是一个专业的标签分类助手。你必须仔细分析标签的实际含义和使用场景优先选择9个具体领域之一。'其他'类别只用于完全无法归类的极少数情况。特别注意:历史、科学、文化等知识性对话应归类为'学习'领域;学校、课程、考试等正式教育场景应归类为'教育'领域。"},
{"role": "user", "content": prompt}
]
# 直接调用并等待结果
classification = await self.llm_client.response_structured(
messages=messages,
response_model=TagClassification,
)
if classification and hasattr(classification, 'domain') and classification.domain:
domain_counts[classification.domain] += 1
total_tags = sum(domain_counts.values())
if total_tags == 0:
return {}
domain_distribution = {
domain: count / total_tags for domain, count in domain_counts.items()
}
return dict(
sorted(domain_distribution.items(), key=lambda item: item[1], reverse=True)
)
async def get_active_periods(self) -> list[int]:
"""
Identifies the top 2 most active months for the user.
Only returns months if there is valid and diverse time data.
This method checks if the time data represents real user memory timestamps
rather than auto-generated system timestamps by verifying:
1. Time data exists and is parseable
2. Time data is distributed across multiple months (not concentrated in 1-2 months)
"""
query = f"""
MATCH (d:Dialogue)
WHERE d.group_id = '{self.user_id}' AND d.created_at IS NOT NULL AND d.created_at <> ''
RETURN d.created_at AS creation_time
"""
records = await self.neo4j_connector.execute_query(query)
if not records:
return []
month_counts = Counter()
valid_dates_count = 0
for record in records:
creation_time_str = record.get("creation_time")
if not creation_time_str:
continue
try:
# 尝试解析时间字符串
dt_object = datetime.fromisoformat(creation_time_str.replace("Z", "+00:00"))
month_counts[dt_object.month] += 1
valid_dates_count += 1
except (ValueError, TypeError, AttributeError):
# 如果解析失败,跳过这条记录
continue
# 如果没有有效的时间数据,返回空列表
if not month_counts or valid_dates_count == 0:
return []
# 检查时间分布是否过于集中(可能是批量导入的数据)
# 如果超过80%的数据集中在1-2个月认为这是系统时间戳而非真实时间
unique_months = len(month_counts)
if unique_months <= 2:
# 只有1-2个月有数据很可能是批量导入
most_common_count = month_counts.most_common(1)[0][1]
if most_common_count / valid_dates_count > 0.8:
# 超过80%集中在一个月,认为是系统时间戳
return []
# 如果时间分布较为分散3个月以上认为是真实时间数据
if unique_months >= 3:
most_common_months = month_counts.most_common(2)
return [month for month, _ in most_common_months]
# 2个月的情况检查是否分布均匀
if unique_months == 2:
counts = list(month_counts.values())
# 如果两个月的数据量相差不大比例在0.3-3之间认为是真实数据
ratio = min(counts) / max(counts)
if ratio > 0.3:
most_common_months = month_counts.most_common(2)
return [month for month, _ in most_common_months]
# 其他情况返回空列表
return []
async def get_social_connections(self) -> dict | None:
"""
Finds the user with whom the most memories are shared.
使用 Chunk-Statement 的 CONTAINS 关系,因为系统中不创建 Dialogue-Statement 的 MENTIONS 关系。
"""
# 通过 Chunk 和 Statement 的 CONTAINS 关系来查找共同记忆
query = f"""
MATCH (c1:Chunk {{group_id: '{self.user_id}'}})
OPTIONAL MATCH (c1)-[:CONTAINS]->(s:Statement)
OPTIONAL MATCH (s)<-[:CONTAINS]-(c2:Chunk)
WHERE c1.group_id <> c2.group_id AND s IS NOT NULL AND c2 IS NOT NULL
WITH c2.group_id AS other_user_id, COUNT(DISTINCT s) AS common_statements
WHERE common_statements > 0
RETURN other_user_id, common_statements
ORDER BY common_statements DESC
LIMIT 1
"""
records = await self.neo4j_connector.execute_query(query)
if not records or not records[0].get("other_user_id"):
return None
most_connected_user = records[0]["other_user_id"]
common_memories_count = records[0]["common_statements"]
# 使用 Chunk 的时间范围
time_range_query = f"""
MATCH (c:Chunk)
WHERE c.group_id IN ['{self.user_id}', '{most_connected_user}']
RETURN min(c.created_at) AS start_time, max(c.created_at) AS end_time
"""
time_records = await self.neo4j_connector.execute_query(time_range_query)
start_year, end_year = "N/A", "N/A"
if time_records and time_records[0]["start_time"]:
start_year = datetime.fromisoformat(time_records[0]["start_time"].replace("Z", "+00:00")).year
end_year = datetime.fromisoformat(time_records[0]["end_time"].replace("Z", "+00:00")).year
return {
"user_id": most_connected_user,
"common_memories_count": common_memories_count,
"time_range": f"{start_year}-{end_year}",
}
async def close(self):
"""
Closes the database connection.
"""
await self.neo4j_connector.close()
async def main():
"""
Initializes and runs the memory insight analysis for a test user.
"""
# 默认从环境变量读取
test_user_id = DEFAULT_GROUP_ID
print(f"正在为用户 {test_user_id} 生成记忆洞察报告...\n")
try:
# 使用服务层函数生成报告
from app.services.user_memory_service import analytics_memory_insight_report
result = await analytics_memory_insight_report(end_user_id=test_user_id)
report = result.get("report", "")
print("--- 记忆洞察报告 ---")
print(report)
print("---------------------")
# 将结果写入统一的 User-Dashboard.json使用全局配置路径
try:
from app.core.config import settings
settings.ensure_memory_output_dir()
output_dir = settings.MEMORY_OUTPUT_DIR
try:
os.makedirs(output_dir, exist_ok=True)
except Exception:
pass
dashboard_path = os.path.join(output_dir, "User-Dashboard.json")
existing = {}
if os.path.exists(dashboard_path):
with open(dashboard_path, "r", encoding="utf-8") as rf:
existing = json.load(rf)
existing["memory_insight"] = {
"group_id": test_user_id,
"report": report
}
with open(dashboard_path, "w", encoding="utf-8") as wf:
json.dump(existing, wf, ensure_ascii=False, indent=2)
print(f"已写入 {dashboard_path} -> memory_insight")
except Exception as e:
print(f"写入 User-Dashboard.json 失败: {e}")
except Exception as e:
print(f"生成报告时出错: {e}")
if __name__ == "__main__":
# This setup allows running the async main function
if sys.platform.startswith('win') and sys.version_info >= (3, 8):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(main())

View File

@@ -1,157 +0,0 @@
"""
Generate a concise "关于我" style user summary using data from Neo4j
and the existing LLM configuration (mirrors hot_memory_tags.py setup).
Usage:
python -m analytics.user_summary --user_id <group_id>
"""
import asyncio
import json
import os
import sys
from dataclasses import dataclass
from typing import List, Tuple
# Ensure absolute imports work whether executed directly or via module
try:
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
sys.path.insert(0, src_path)
if project_root not in sys.path:
sys.path.insert(0, project_root)
except Exception:
pass
from app.core.memory.analytics.hot_memory_tags import get_hot_memory_tags
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.db import get_db_context
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.services.memory_config_service import MemoryConfigService
#TODO: Fix this
# Default values (previously from definitions.py)
DEFAULT_LLM_ID = os.getenv("SELECTED_LLM_ID", "openai/qwen-plus")
DEFAULT_GROUP_ID = os.getenv("SELECTED_GROUP_ID", "group_123")
@dataclass
class StatementRecord:
statement: str
created_at: str | None
class UserSummary:
"""Builds a textual user summary for a given user/group id."""
def __init__(self, user_id: str):
self.user_id = user_id
self.connector = Neo4jConnector()
# Get config_id using get_end_user_connected_config
with get_db_context() as db:
try:
from app.services.memory_agent_service import (
get_end_user_connected_config,
)
connected_config = get_end_user_connected_config(user_id, db)
config_id = connected_config.get("memory_config_id")
if config_id:
# Use the config_id to get the proper LLM client
config_service = MemoryConfigService(db)
memory_config = config_service.load_memory_config(config_id)
factory = MemoryClientFactory(db)
self.llm = factory.get_llm_client(memory_config.llm_model_id)
else:
# TODO: Remove DEFAULT_LLM_ID fallback once all users have proper config
# Fallback to default LLM if no config found
factory = MemoryClientFactory(db)
self.llm = factory.get_llm_client(DEFAULT_LLM_ID)
except Exception as e:
print(f"Failed to get user connected config, using default LLM: {e}")
# TODO: Remove DEFAULT_LLM_ID fallback once all users have proper config
# Fallback to default LLM
factory = MemoryClientFactory(db)
self.llm = factory.get_llm_client(DEFAULT_LLM_ID)
async def close(self):
await self.connector.close()
async def _get_recent_statements(self, limit: int = 80) -> List[StatementRecord]: # TODO Used by user_memory_service
"""Fetch recent statements authored by the user/group for context."""
query = (
"MATCH (s:Statement) "
"WHERE s.group_id = $group_id AND s.statement IS NOT NULL "
"RETURN s.statement AS statement, s.created_at AS created_at "
"ORDER BY created_at DESC LIMIT $limit"
)
rows = await self.connector.execute_query(query, group_id=self.user_id, limit=limit)
records: List[StatementRecord] = []
for r in rows:
try:
records.append(StatementRecord(statement=r.get("statement", ""), created_at=r.get("created_at")))
except Exception:
continue
return records
async def _get_top_entities(self, limit: int = 30) -> List[Tuple[str, int]]:
"""Reuse hot tag logic to get meaningful entities and their frequencies."""
# get_hot_memory_tags internally filters out non-meaningful nouns with LLM
return await get_hot_memory_tags(self.user_id, limit=limit) # TODO Used by user_memory_service
async def generate_user_summary(user_id: str | None = None) -> str: # TODO useless
"""
生成用户摘要的便捷函数
Args:
user_id: 可选的用户ID
Returns:
用户摘要字符串
"""
# 导入服务层函数
from app.services.user_memory_service import analytics_user_summary
# 调用服务层函数
result = await analytics_user_summary(user_id)
return result.get("summary", "")
if __name__ == "__main__":
print("开始生成用户摘要…")
try:
# 直接使用 runtime.json 中的 group_id
summary = asyncio.run(generate_user_summary())
print("\n— 用户摘要 —\n")
print(summary)
# 将结果写入统一的 User-Dashboard.json
try:
from app.core.config import settings
settings.ensure_memory_output_dir()
output_dir = settings.MEMORY_OUTPUT_DIR
try:
os.makedirs(output_dir, exist_ok=True)
except Exception:
pass
dashboard_path = os.path.join(output_dir, "User-Dashboard.json")
existing = {}
if os.path.exists(dashboard_path):
with open(dashboard_path, "r", encoding="utf-8") as rf:
existing = json.load(rf)
existing["user_summary"] = {
"group_id": DEFAULT_GROUP_ID,
"summary": summary
}
with open(dashboard_path, "w", encoding="utf-8") as wf:
json.dump(existing, wf, ensure_ascii=False, indent=2)
print(f"已写入 {dashboard_path} -> user_summary")
except Exception as e:
print(f"写入 User-Dashboard.json 失败: {e}")
except Exception as e:
print(f"生成摘要失败: {e}")
print("请检查: 1) Neo4j 是否可用2) config.json 与 .env 的 LLM/Neo4j 配置是否正确3) 数据是否包含该用户的内容。")