Initial commit

2025-11-30 18:22:17 +08:00
commit aea2fe391e
449 changed files with 83030 additions and 0 deletions
--- a/app/core/rag_utils/README.md
+++ b/app/core/rag_utils/README.md
@@ -0,0 +1,116 @@
+# RAG Chunk 分析工具
+
+这个模块提供了对 RAG chunk 内容进行分析的工具函数，包括：
+
+## 功能模块
+
+### 1. chunk_summary.py - Chunk 摘要生成
+- `generate_chunk_summary(chunks, max_chunks=10)`: 为给定的 chunk 列表生成简洁摘要
+- 使用 LLM 提取核心信息和关键要点
+- 摘要长度控制在 100-150 字
+
+### 2. chunk_tags.py - 标签提取
+- `extract_chunk_tags(chunks, max_tags=10, max_chunks=10)`: 从 chunk 中提取关键标签
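+- `extract_chunk_tags_with_frequency(chunks, max_tags=10)`: 提取标签并统计频率（对传入的全部 chunk 进行处理，不受 `max_chunks` 限制）
+- `extract_chunk_persona(chunks, max_personas=5, max_chunks=20)`: 从 chunk 中提取人物形象标签（如"产品设计师"、"旅行爱好者"）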
+- 使用 LLM 识别核心概念和专业术语
+- 自动过滤无意义词汇
+
+### 3. chunk_insight.py - 洞察分析
+- `generate_chunk_insight(chunks, max_chunks=15)`: 生成深度洞察报告
+- `classify_chunk_domain(chunk)`: 对 chunk 进行领域分类
+- `analyze_domain_distribution(chunks, max_chunks=20)`: 分析领域分布
+- 提供内容的主题、特点和价值分析
+
+## 使用示例
+
+```python
+from app.core.rag_utils import (
+    generate_chunk_summary,
+    extract_chunk_tags,
+    generate_chunk_insight
+)
+
+# 示例 chunk 数据
+chunks = [
+    "机器学习是人工智能的一个重要分支...",
+    "深度学习使用神经网络进行特征学习...",
+    # ...
+]
+
+# 生成摘要
+summary = await generate_chunk_summary(chunks, max_chunks=10)
+print(f"摘要: {summary}")
+
+# 提取标签
+tags = await extract_chunk_tags(chunks, max_tags=10)
+print(f"标签: {tags}")
+
+# 生成洞察
+insight = await generate_chunk_insight(chunks, max_chunks=15)
+print(f"洞察: {insight}")
+```
+
+## API 接口
+
+在 `memory_dashboard_controller.py` 中提供了两个对外接口：
+
+### 1. GET /dashboard/chunk_summary_tag
+获取 chunk 总结和提取的标签
+
+**参数:**
+- `end_user_id` (必填): 宿主ID
+- `limit` (可选, 默认15): 返回的chunk数量
+- `max_tags` (可选, 默认10): 最大标签数量
+
+**返回:**
+```json
+{
+    "code": 200,
+    "msg": "chunk摘要和标签获取成功",
+    "data": {
+        "summary": "chunk内容的总结...",
+        "tags": [
+            {"tag": "机器学习", "frequency": 5},
+            {"tag": "深度学习", "frequency": 3}
+        ]
+    }
+}
+```
+
+### 2. GET /dashboard/chunk_insight
+获取 chunk 的洞察内容
+
+**参数:**
+- `end_user_id` (必填): 宿主ID
+- `limit` (可选, 默认15): 返回的chunk数量
+
+**返回:**
+```json
+{
+    "code": 200,
+    "msg": "chunk洞察获取成功",
+    "data": {
+        "insight": "该知识库主要聚焦于技术领域(60%)..."
+    }
+}
+```
+
+## 技术特点
+
+1. **异步处理**: 所有函数都是异步的，支持高并发
+2. **LLM 驱动**: 使用大语言模型进行智能分析
+3. **可配置**: 支持自定义处理的 chunk 数量和标签数量
+4. **错误处理**: 完善的异常处理和日志记录
+5. **模块化设计**: 每个功能独立，易于维护和扩展
+
+## 依赖
+
+- `app.core.memory.utils.llm.llm_utils`: LLM 客户端（`get_llm_client`）
+- `app.core.logging_config`: 日志配置
+- `pydantic`: 数据验证和结构化输出
+
+## 注意事项
+
+1. 所有函数都需要在异步上下文中调用（使用 `await`）
+2. 处理大量 chunk 时建议设置合理的 `max_chunks` 参数以控制 token 消耗
+3. LLM 调用可能需要一定时间，建议在前端显示加载状态
--- a/app/core/rag_utils/__init__.py
+++ b/app/core/rag_utils/__init__.py
@@ -0,0 +1,14 @@
+"""
+RAG chunk analysis utilities.
+"""
+
+from .chunk_summary import generate_chunk_summary
+from .chunk_tags import extract_chunk_tags, extract_chunk_persona
+from .chunk_insight import generate_chunk_insight
+
+# Names re-exported at package level; each one is imported from a sibling module above.
+__all__ = [
+    "generate_chunk_summary",
+    "extract_chunk_tags",
+    "extract_chunk_persona",
+    "generate_chunk_insight",
+]
--- a/app/core/rag_utils/chunk_insight.py
+++ b/app/core/rag_utils/chunk_insight.py
@@ -0,0 +1,205 @@
+"""
+Generate insights from RAG chunks.
+
+This module provides functionality to analyze chunk content and generate insights using LLM.
+"""
+
+import asyncio
+from typing import List, Dict, Any
+from collections import Counter
+from pydantic import BaseModel, Field
+
+from app.core.memory.utils.llm.llm_utils import get_llm_client
+from app.core.logging_config import get_business_logger
+
+business_logger = get_business_logger()
+
+
+# Schema with a single required free-text field holding an insight.
+# NOTE(review): nothing else in this module references this model;
+# generate_chunk_insight() calls llm_client.chat() and reads response.content directly.
+class ChunkInsight(BaseModel):
+    """Pydantic model for chunk insight."""
+    insight: str = Field(..., description="对chunk内容的深度洞察分析")
+
+
+# Schema passed as `response_model` by classify_chunk_domain().
+# `domain` is typed as a plain str: the `examples` list only suggests labels,
+# so this model by itself does not reject values outside that list.
+class DomainClassification(BaseModel):
+    """Pydantic model for domain classification."""
+    domain: str = Field(
+        ...,
+        description="内容所属的领域分类",
+        examples=["技术", "商业", "教育", "生活", "娱乐", "健康", "其他"]
+    )
+
+
+async def classify_chunk_domain(chunk: str) -> str:
+    """
+    Classify a chunk into a specific domain.
+    
+    Args:
+        chunk: Chunk content string
+    
+    Returns:
+        Domain name
+    """
+    try:
+        llm_client = get_llm_client()
+        
+        prompt = f"""请将以下文本内容归类到最合适的领域中。
+
+可选领域及其关键词：
+- 技术：编程、软件、硬件、算法、数据、网络、系统、开发、工程等
+- 商业：市场、销售、管理、财务、投资、创业、营销、战略等
+- 教育：学习、课程、培训、教学、知识、技能、考试、研究等
+- 生活：日常、家庭、饮食、购物、旅行、休闲、娱乐等
+- 娱乐：游戏、电影、音乐、体育、艺术、文化等
+- 健康：医疗、养生、运动、心理、保健、疾病等
+- 其他：无法归入以上类别的内容
+
+文本内容: {chunk[:500]}...
+
+请直接返回最合适的领域名称。"""
+        
+        messages = [
+            {"role": "system", "content": "你是一个专业的文本分类助手。请仔细分析文本内容，选择最合适的领域分类。"},
+            {"role": "user", "content": prompt}
+        ]
+        
+        classification = await llm_client.response_structured(
+            messages=messages,
+            response_model=DomainClassification
+        )
+        
+        return classification.domain if classification else "其他"
+        
+    except Exception as e:
+        business_logger.error(f"分类chunk领域失败: {str(e)}")
+        return "其他"
+
+
+async def analyze_domain_distribution(chunks: List[str], max_chunks: int = 20) -> Dict[str, float]:
+    """
+    Analyze the domain distribution of chunks.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_chunks: Maximum number of chunks to analyze
+    
+    Returns:
+        Dictionary of domain -> percentage
+    """
+    if not chunks:
+        return {}
+    
+    try:
+        # 限制分析的chunk数量
+        chunks_to_analyze = chunks[:max_chunks]
+        
+        # 为每个chunk分类
+        domain_counts = Counter()
+        for chunk in chunks_to_analyze:
+            domain = await classify_chunk_domain(chunk)
+            domain_counts[domain] += 1
+        
+        # 计算百分比
+        total = sum(domain_counts.values())
+        domain_distribution = {
+            domain: count / total
+            for domain, count in domain_counts.items()
+        }
+        
+        # 按百分比降序排序
+        return dict(sorted(domain_distribution.items(), key=lambda x: x[1], reverse=True))
+        
+    except Exception as e:
+        business_logger.error(f"分析领域分布失败: {str(e)}")
+        return {}
+
+
+async def generate_chunk_insight(chunks: List[str], max_chunks: int = 15) -> str:
+    """
+    Generate insights from the given chunks.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_chunks: Maximum number of chunks to analyze
+    
+    Returns:
+        A comprehensive insight report
+    """
+    if not chunks:
+        business_logger.warning("没有提供chunk内容用于生成洞察")
+        return "暂无足够数据生成洞察报告"
+    
+    try:
+        # 1. 分析领域分布
+        domain_dist = await analyze_domain_distribution(chunks, max_chunks=max_chunks)
+        
+        # 2. 统计基本信息
+        total_chunks = len(chunks)
+        avg_length = sum(len(chunk) for chunk in chunks) / total_chunks if total_chunks > 0 else 0
+        
+        # 3. 构建洞察prompt
+        prompt_parts = []
+        
+        if domain_dist:
+            top_domains = ", ".join([f"{k}({v:.0%})" for k, v in list(domain_dist.items())[:3]])
+            prompt_parts.append(f"- 内容领域分布: {top_domains}")
+        
+        prompt_parts.append(f"- 内容规模: 共{total_chunks}个知识片段，平均长度{avg_length:.0f}字")
+        
+        # 添加部分chunk内容作为参考
+        sample_chunks = chunks[:5]
+        sample_content = "\n".join([f"示例{i+1}: {chunk[:200]}..." for i, chunk in enumerate(sample_chunks)])
+        prompt_parts.append(f"\n内容示例:\n{sample_content}")
+        
+        system_prompt = """你是一位专业的知识内容分析师。你的任务是根据提供的信息，生成一段简洁、有洞察力的分析报告。
+
+重要规则：
+1. 报告需要将所有要点流畅地串联成一个段落
+2. 语言风格要专业、客观，同时易于理解
+3. 不要添加任何额外的解释或标题，直接输出报告内容
+4. 基于提供的数据和示例内容进行分析，不要编造信息
+5. 重点关注内容的主题、特点和价值
+6. 报告长度控制在150-200字
+
+例如，如果输入是：
+- 内容领域分布: 技术(60%), 商业(25%), 教育(15%)
+- 内容规模: 共50个知识片段，平均长度320字
+内容示例: [示例内容...]
+
+你的输出应该类似：
+"该知识库主要聚焦于技术领域(60%)，涵盖商业(25%)和教育(15%)相关内容。共包含50个知识片段，平均每个片段约320字，内容详实。从示例来看，内容涉及[具体主题]，体现了[特点]，对[目标用户]具有较高的参考价值。"
+"""
+        
+        user_prompt = "\n".join(prompt_parts)
+        
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+        
+        # 调用LLM生成洞察
+        llm_client = get_llm_client()
+        response = await llm_client.chat(messages=messages)
+        
+        insight = response.content.strip()
+        business_logger.info(f"成功生成chunk洞察，分析了 {min(len(chunks), max_chunks)} 个片段")
+        
+        return insight
+        
+    except Exception as e:
+        business_logger.error(f"生成chunk洞察失败: {str(e)}")
+        return "洞察生成失败"
+
+
+if __name__ == "__main__":
+    # 测试代码
+    test_chunks = [
+        "Python是一种高级编程语言，以其简洁的语法和强大的功能而闻名。它广泛应用于Web开发、数据分析、人工智能等领域。",
+        "机器学习算法可以从数据中自动学习模式，无需显式编程。常见的算法包括决策树、随机森林、神经网络等。",
+        "深度学习是机器学习的一个分支，使用多层神经网络来学习数据的层次化表示。它在图像识别、语音识别等任务中表现出色。",
+        "自然语言处理技术使计算机能够理解和生成人类语言。应用包括机器翻译、情感分析、文本摘要等。",
+        "数据科学结合了统计学、计算机科学和领域知识，用于从数据中提取有价值的洞察。"
+    ]
+    
+    print("开始生成chunk洞察...")
+    insight = asyncio.run(generate_chunk_insight(test_chunks))
+    print(f"\n生成的洞察：\n{insight}")
--- a/app/core/rag_utils/chunk_summary.py
+++ b/app/core/rag_utils/chunk_summary.py
@@ -0,0 +1,99 @@
+"""
+Generate summary for RAG chunks.
+
+This module provides functionality to summarize chunk content using LLM.
+"""
+
+import asyncio
+from typing import List, Dict, Any
+from pydantic import BaseModel, Field
+
+from app.core.memory.utils.llm.llm_utils import get_llm_client
+from app.core.logging_config import get_business_logger
+
+business_logger = get_business_logger()
+
+
+# Schema with a single required free-text field holding a summary.
+# NOTE(review): nothing else in this module references this model;
+# generate_chunk_summary() calls llm_client.chat() and reads response.content directly.
+class ChunkSummary(BaseModel):
+    """Pydantic model for chunk summary."""
+    summary: str = Field(..., description="简洁的chunk内容摘要")
+
+
+async def generate_chunk_summary(chunks: List[str], max_chunks: int = 10) -> str:
+    """
+    Generate a summary for the given chunks.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_chunks: Maximum number of chunks to process (default: 10)
+    
+    Returns:
+        A concise summary of the chunks
+    """
+    if not chunks:
+        business_logger.warning("没有提供chunk内容用于生成摘要")
+        return "暂无内容"
+    
+    try:
+        # 限制处理的chunk数量，避免token过多
+        chunks_to_process = chunks[:max_chunks]
+        
+        # 合并chunk内容
+        combined_content = "\n\n".join([f"片段{i+1}: {chunk}" for i, chunk in enumerate(chunks_to_process)])
+        
+        # 构建prompt
+        system_prompt = (
+            "你是一位专业的文本摘要助手。请基于提供的文本片段，生成简洁的摘要。要求：\n"
+            "- 摘要长度控制在100-150字；\n"
+            "- 提取核心信息和关键要点；\n"
+            "- 使用客观、清晰的语言；\n"
+            "- 避免冗余和重复；\n"
+            "- 如果内容涉及多个主题，按重要性排序呈现。"
+        )
+        
+        user_prompt = f"请为以下文本片段生成摘要：\n\n{combined_content}"
+        
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        
+        # 调用LLM生成摘要
+        llm_client = get_llm_client()
+        response = await llm_client.chat(messages=messages)
+        
+        summary = response.content.strip()
+        business_logger.info(f"成功生成chunk摘要，处理了 {len(chunks_to_process)} 个片段")
+        
+        return summary
+        
+    except Exception as e:
+        business_logger.error(f"生成chunk摘要失败: {str(e)}")
+        return "摘要生成失败"
+
+
+async def generate_chunk_summary_batch(chunks_list: List[List[str]]) -> List[str]:
+    """
+    Generate summaries for multiple chunk lists in batch.
+    
+    Args:
+        chunks_list: List of chunk lists
+    
+    Returns:
+        List of summaries
+    """
+    tasks = [generate_chunk_summary(chunks) for chunks in chunks_list]
+    return await asyncio.gather(*tasks)
+
+
+if __name__ == "__main__":
+    # 测试代码
+    test_chunks = [
+        "这是第一段测试内容，讲述了关于机器学习的基础知识。",
+        "第二段内容介绍了深度学习的应用场景和发展历史。",
+        "第三段讨论了自然语言处理技术的最新进展。"
+    ]
+    
+    print("开始生成chunk摘要...")
+    summary = asyncio.run(generate_chunk_summary(test_chunks))
+    print(f"\n生成的摘要：\n{summary}")
--- a/app/core/rag_utils/chunk_tags.py
+++ b/app/core/rag_utils/chunk_tags.py
@@ -0,0 +1,191 @@
+"""
+Extract tags from RAG chunks.
+
+This module provides functionality to extract meaningful tags from chunk content using LLM.
+"""
+
+import asyncio
+from collections import Counter
+from typing import List, Tuple
+from pydantic import BaseModel, Field
+
+from app.core.memory.utils.llm.llm_utils import get_llm_client
+from app.core.logging_config import get_business_logger
+
+business_logger = get_business_logger()
+
+
+# Schema passed as `response_model` by extract_chunk_tags(); holds the list of
+# tag strings returned for one chunk.
+class ExtractedTags(BaseModel):
+    """Pydantic model for extracted tags."""
+    tags: List[str] = Field(..., description="从文本中提取的关键标签列表")
+
+
+# Schema passed as `response_model` by extract_chunk_persona(); holds the list of
+# persona label strings returned for the combined chunk text.
+class ExtractedPersona(BaseModel):
+    """Pydantic model for extracted persona."""
+    personas: List[str] = Field(..., description="从文本中提取的人物形象列表，如'产品设计师'、'旅行爱好者'等")
+
+
+async def extract_chunk_tags(chunks: List[str], max_tags: int = 10, max_chunks: int = 10) -> List[Tuple[str, int]]:
+    """
+    Extract meaningful tags from the given chunks.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_tags: Maximum number of tags to return (default: 10)
+        max_chunks: Maximum number of chunks to process (default: 10)
+    
+    Returns:
+        List of tuples (tag, frequency), sorted by frequency in descending order
+    """
+    if not chunks:
+        business_logger.warning("没有提供chunk内容用于提取标签")
+        return []
+    
+    try:
+        # 限制处理的chunk数量
+        chunks_to_process = chunks[:max_chunks]
+        
+        # 构建prompt
+        system_prompt = (
+            "你是一位专业的文本分析专家，擅长从文本中提取关键标签。请遵循以下规则：\n\n"
+            "1. **提取核心概念**: 识别文本中最重要的名词、专业术语、主题词；\n"
+            "2. **过滤无意义词**: 排除过于宽泛的词（如'内容'、'信息'、'数据'）；\n"
+            "3. **保持具体性**: 优先选择具体的、有代表性的词语；\n"
+            "4. **标签数量**: 提取5-15个最具代表性的标签；\n"
+            "5. **去重合并**: 语义相近的标签只保留一个最核心的。\n\n"
+            "标签应该是名词或名词短语，能够准确概括文本的核心内容。"
+        )
+        
+        llm_client = get_llm_client()
+        
+        # 为每个chunk单独提取标签，然后统计频率
+        all_tags = []
+        for chunk in chunks_to_process:
+            single_chunk_prompt = f"请从以下文本中提取关键标签：\n\n{chunk}"
+            single_messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": single_chunk_prompt},
+            ]
+            
+            try:
+                single_response = await llm_client.response_structured(
+                    messages=single_messages,
+                    response_model=ExtractedTags
+                )
+                all_tags.extend(single_response.tags)
+            except Exception as e:
+                business_logger.warning(f"处理单个chunk时出错: {str(e)}")
+                continue
+        
+        # 统计标签频率
+        tag_counter = Counter(all_tags)
+        
+        # 获取最常见的标签，限制数量
+        most_common_tags = tag_counter.most_common(max_tags)
+        
+        business_logger.info(f"成功提取 {len(most_common_tags)} 个标签，处理了 {len(chunks_to_process)} 个片段")
+        
+        return most_common_tags
+        
+    except Exception as e:
+        business_logger.error(f"提取chunk标签失败: {str(e)}")
+        return []
+
+
+async def extract_chunk_tags_with_frequency(chunks: List[str], max_tags: int = 10) -> List[Tuple[str, int]]:
+    """
+    Extract tags with actual frequency calculation across all chunks.
+    
+    This is an alias for extract_chunk_tags for backward compatibility.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_tags: Maximum number of tags to return
+    
+    Returns:
+        List of tuples (tag, frequency), sorted by frequency
+    """
+    return await extract_chunk_tags(chunks, max_tags=max_tags, max_chunks=len(chunks))
+
+
+async def extract_chunk_persona(chunks: List[str], max_personas: int = 5, max_chunks: int = 20) -> List[str]:
+    """
+    Extract persona (人物形象) from the given chunks.
+    
+    Args:
+        chunks: List of chunk content strings
+        max_personas: Maximum number of personas to return (default: 5)
+        max_chunks: Maximum number of chunks to process (default: 20)
+    
+    Returns:
+        List of persona strings like "产品设计师", "旅行爱好者", "摄影发烧友"
+    """
+    if not chunks:
+        business_logger.warning("没有提供chunk内容用于提取人物形象")
+        return []
+    
+    try:
+        # 限制处理的chunk数量
+        chunks_to_process = chunks[:max_chunks]
+        
+        # 合并chunk内容
+        combined_content = "\n\n".join([f"片段{i+1}: {chunk}" for i, chunk in enumerate(chunks_to_process)])
+        
+        # 构建prompt
+        system_prompt = (
+            "你是一位专业的人物画像分析专家，擅长从文本中提取人物形象标签。请遵循以下规则：\n\n"
+            "1. **职业身份**: 识别职业、专业领域（如'产品设计师'、'软件工程师'、'创业者'）；\n"
+            "2. **兴趣爱好**: 提取核心兴趣和爱好（如'旅行爱好者'、'摄影发烧友'、'咖啡控'）；\n"
+            "3. **生活方式**: 概括生活态度和习惯（如'极简主义者'、'户外探险家'、'阅读爱好者'）；\n"
+            "4. **个性特征**: 提炼显著的性格特点（如'思考者'、'行动派'、'完美主义者'）；\n"
+            "5. **数量控制**: 提取3-8个最具代表性的人物形象标签；\n"
+            "6. **简洁明确**: 每个标签应该是简短的名词或名词短语（2-6个字）。\n\n"
+            "人物形象标签应该能够准确刻画这个人的核心特征和身份定位。"
+        )
+        
+        user_prompt = f"请从以下文本中提取人物形象标签：\n\n{combined_content}"
+        
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        
+        # 调用LLM提取人物形象
+        llm_client = get_llm_client()
+        structured_response = await llm_client.response_structured(
+            messages=messages,
+            response_model=ExtractedPersona
+        )
+        
+        # 去重并限制数量
+        personas = list(dict.fromkeys(structured_response.personas))[:max_personas]
+        
+        business_logger.info(f"成功提取 {len(personas)} 个人物形象，处理了 {len(chunks_to_process)} 个片段")
+        
+        return personas
+        
+    except Exception as e:
+        business_logger.error(f"提取人物形象失败: {str(e)}")
+        return []
+
+
+if __name__ == "__main__":
+    # 测试代码
+    test_chunks = [
+        "我是一名产品设计师，平时喜欢旅行和摄影。周末经常去户外徒步，探索新的风景。",
+        "最近在学习咖啡拉花，已经能做出简单的图案了。每天早上都会给自己冲一杯手冲咖啡。",
+        "喜欢阅读各类书籍，尤其是设计和心理学相关的。记录生活是我的习惯，用镜头捕捉美好瞬间。"
+    ]
+    
+    print("开始提取chunk标签...")
+    tags = asyncio.run(extract_chunk_tags(test_chunks))
+    print(f"\n提取的标签：")
+    for tag, freq in tags:
+        print(f"- {tag} (频率: {freq})")
+    
+    print("\n" + "="*50)
+    print("开始提取人物形象...")
+    personas = asyncio.run(extract_chunk_persona(test_chunks))
+    print(f"\n提取的人物形象：")
+    for persona in personas:
+        print(f"- {persona}")