Initial commit

This commit is contained in:
Ke Sun
2025-11-30 18:22:17 +08:00
commit aea2fe391e
449 changed files with 83030 additions and 0 deletions

View File

@@ -0,0 +1,116 @@
# RAG Chunk 分析工具
这个模块提供了对 RAG chunk 内容进行分析的工具函数,包括:
## 功能模块
### 1. chunk_summary.py - Chunk 摘要生成
- `generate_chunk_summary(chunks, max_chunks=10)`: 为给定的 chunk 列表生成简洁摘要
- 使用 LLM 提取核心信息和关键要点
- 摘要长度控制在 100-150 字
### 2. chunk_tags.py - 标签提取
- `extract_chunk_tags(chunks, max_tags=10, max_chunks=10)`: 从 chunk 中提取关键标签
- `extract_chunk_tags_with_frequency(chunks, max_tags=10)`: 提取标签并统计频率
- 使用 LLM 识别核心概念和专业术语
- 自动过滤无意义词汇
### 3. chunk_insight.py - 洞察分析
- `generate_chunk_insight(chunks, max_chunks=15)`: 生成深度洞察报告
- `classify_chunk_domain(chunk)`: 对 chunk 进行领域分类
- `analyze_domain_distribution(chunks, max_chunks=20)`: 分析领域分布
- 提供内容的主题、特点和价值分析
## 使用示例
```python
from app.core.rag_utils import (
generate_chunk_summary,
extract_chunk_tags,
generate_chunk_insight
)
# 示例 chunk 数据
chunks = [
"机器学习是人工智能的一个重要分支...",
"深度学习使用神经网络进行特征学习...",
# ...
]
# 生成摘要
summary = await generate_chunk_summary(chunks, max_chunks=10)
print(f"摘要: {summary}")
# 提取标签
tags = await extract_chunk_tags(chunks, max_tags=10)
print(f"标签: {tags}")
# 生成洞察
insight = await generate_chunk_insight(chunks, max_chunks=15)
print(f"洞察: {insight}")
```
## API 接口
`memory_dashboard_controller.py` 中提供了两个对外接口:
### 1. GET /dashboard/chunk_summary_tag
获取 chunk 总结和提取的标签
**参数:**
- `end_user_id` (必填): 宿主ID
- `limit` (可选, 默认15): 返回的chunk数量
- `max_tags` (可选, 默认10): 最大标签数量
**返回:**
```json
{
"code": 200,
"msg": "chunk摘要和标签获取成功",
"data": {
"summary": "chunk内容的总结...",
"tags": [
{"tag": "机器学习", "frequency": 5},
{"tag": "深度学习", "frequency": 3}
]
}
}
```
### 2. GET /dashboard/chunk_insight
获取 chunk 的洞察内容
**参数:**
- `end_user_id` (必填): 宿主ID
- `limit` (可选, 默认15): 返回的chunk数量
**返回:**
```json
{
"code": 200,
"msg": "chunk洞察获取成功",
"data": {
"insight": "该知识库主要聚焦于技术领域(60%)..."
}
}
```
## 技术特点
1. **异步处理**: 所有函数都是异步的,支持高并发
2. **LLM 驱动**: 使用大语言模型进行智能分析
3. **可配置**: 支持自定义处理的 chunk 数量和标签数量
4. **错误处理**: 完善的异常处理和日志记录
5. **模块化设计**: 每个功能独立,易于维护和扩展
## 依赖
- `app.core.memory.utils.llm_utils`: LLM 客户端
- `app.core.logging_config`: 日志配置
- `pydantic`: 数据验证和结构化输出
## 注意事项
1. 所有函数都需要在异步上下文中调用(使用 `await`)
2. 处理大量 chunk 时建议设置合理的 `max_chunks` 参数以控制 token 消耗
3. LLM 调用可能需要一定时间,建议在前端显示加载状态

View File

@@ -0,0 +1,14 @@
"""
RAG chunk analysis utilities.
"""
from .chunk_summary import generate_chunk_summary
from .chunk_tags import extract_chunk_tags, extract_chunk_persona
from .chunk_insight import generate_chunk_insight
# Names exported by `from <package> import *`; each one is imported above
# from the chunk_summary, chunk_tags and chunk_insight submodules.
__all__ = [
    "generate_chunk_summary",
    "extract_chunk_tags",
    "extract_chunk_persona",
    "generate_chunk_insight",
]

View File

@@ -0,0 +1,205 @@
"""
Generate insights from RAG chunks.
This module provides functionality to analyze chunk content and generate insights using LLM.
"""
import asyncio
from typing import List, Dict, Any
from collections import Counter
from pydantic import BaseModel, Field
from app.core.memory.utils.llm.llm_utils import get_llm_client
from app.core.logging_config import get_business_logger
business_logger = get_business_logger()
class ChunkInsight(BaseModel):
    """Schema carrying a single free-text insight report.

    The functions visible in this module do not reference it:
    generate_chunk_insight returns the raw chat reply instead.
    """

    insight: str = Field(
        ...,
        description="对chunk内容的深度洞察分析",
    )
class DomainClassification(BaseModel):
    """Structured-output schema for the domain label assigned to one chunk.

    The ``examples`` list names the same seven domains that the
    classification prompt in classify_chunk_domain offers to the model.
    """

    domain: str = Field(
        ...,
        description="内容所属的领域分类",
        examples=["技术", "商业", "教育", "生活", "娱乐", "健康", "其他"],
    )
async def classify_chunk_domain(chunk: str) -> str:
    """Ask the LLM which domain a single chunk belongs to.

    Only the first 500 characters of ``chunk`` are placed in the prompt.

    Args:
        chunk: Chunk content string.

    Returns:
        The ``domain`` field of the structured LLM response, or "其他" when
        the response is falsy or any exception is raised along the way.
    """
    fallback_domain = "其他"
    try:
        client = get_llm_client()
        user_prompt = f"""请将以下文本内容归类到最合适的领域中。
可选领域及其关键词:
- 技术:编程、软件、硬件、算法、数据、网络、系统、开发、工程等
- 商业:市场、销售、管理、财务、投资、创业、营销、战略等
- 教育:学习、课程、培训、教学、知识、技能、考试、研究等
- 生活:日常、家庭、饮食、购物、旅行、休闲、娱乐等
- 娱乐:游戏、电影、音乐、体育、艺术、文化等
- 健康:医疗、养生、运动、心理、保健、疾病等
- 其他:无法归入以上类别的内容
文本内容: {chunk[:500]}...
请直接返回最合适的领域名称。"""
        result = await client.response_structured(
            messages=[
                {"role": "system", "content": "你是一个专业的文本分类助手。请仔细分析文本内容,选择最合适的领域分类。"},
                {"role": "user", "content": user_prompt},
            ],
            response_model=DomainClassification,
        )
        if not result:
            return fallback_domain
        return result.domain
    except Exception as e:
        # Best-effort: a failed classification is logged and counted as "other".
        business_logger.error(f"分类chunk领域失败: {str(e)}")
        return fallback_domain
async def analyze_domain_distribution(chunks: List[str], max_chunks: int = 20) -> Dict[str, float]:
    """Compute the share of each domain among the first ``max_chunks`` chunks.

    Args:
        chunks: List of chunk content strings.
        max_chunks: Maximum number of chunks to classify.

    Returns:
        Mapping of domain -> fraction (0.0-1.0) of classified chunks, ordered
        by descending fraction. Empty when ``chunks`` is empty or on error.
    """
    if not chunks:
        return {}

    try:
        # One awaited classification per chunk, in order, over the capped prefix.
        labels = [await classify_chunk_domain(text) for text in chunks[:max_chunks]]
        tally = Counter(labels)
        analyzed = sum(tally.values())
        # most_common() yields (domain, count) pairs by descending count with
        # first-seen order kept among ties, so the resulting dict is already
        # sorted by descending share.
        return {label: hits / analyzed for label, hits in tally.most_common()}
    except Exception as e:
        business_logger.error(f"分析领域分布失败: {str(e)}")
        return {}
async def generate_chunk_insight(chunks: List[str], max_chunks: int = 15) -> str:
    """Generate a one-paragraph insight report for a collection of chunks.

    The user prompt sent to the LLM combines three things: the top three
    domains from ``analyze_domain_distribution`` (computed on at most
    ``max_chunks`` chunks), the total chunk count with the average chunk
    length in characters (computed on all chunks), and the first 200
    characters of each of the first five chunks.

    Args:
        chunks: List of chunk content strings.
        max_chunks: Maximum number of chunks passed to domain classification.

    Returns:
        The stripped LLM reply; "暂无足够数据生成洞察报告" when ``chunks`` is
        empty; "洞察生成失败" when any step raises.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于生成洞察")
        return "暂无足够数据生成洞察报告"

    try:
        # 1. Domain distribution over the capped prefix of chunks.
        domain_dist = await analyze_domain_distribution(chunks, max_chunks=max_chunks)

        # 2. Basic statistics. ``chunks`` is non-empty here (guarded above),
        #    so the division needs no zero check.
        total_chunks = len(chunks)
        avg_length = sum(len(chunk) for chunk in chunks) / total_chunks

        # 3. Assemble the user prompt.
        prompt_parts = []
        if domain_dist:
            top_domains = ", ".join([f"{k}({v:.0%})" for k, v in list(domain_dist.items())[:3]])
            prompt_parts.append(f"- 内容领域分布: {top_domains}")
        # The unit "字" matches the format shown in the system prompt's example;
        # without it the model receives a bare number with no unit.
        prompt_parts.append(f"- 内容规模: 共{total_chunks}个知识片段,平均长度{avg_length:.0f}字")

        # Include a few truncated chunks as concrete reference material.
        sample_chunks = chunks[:5]
        sample_content = "\n".join([f"示例{i+1}: {chunk[:200]}..." for i, chunk in enumerate(sample_chunks)])
        prompt_parts.append(f"\n内容示例:\n{sample_content}")

        system_prompt = """你是一位专业的知识内容分析师。你的任务是根据提供的信息,生成一段简洁、有洞察力的分析报告。
重要规则:
1. 报告需要将所有要点流畅地串联成一个段落
2. 语言风格要专业、客观,同时易于理解
3. 不要添加任何额外的解释或标题,直接输出报告内容
4. 基于提供的数据和示例内容进行分析,不要编造信息
5. 重点关注内容的主题、特点和价值
6. 报告长度控制在150-200字
例如,如果输入是:
- 内容领域分布: 技术(60%), 商业(25%), 教育(15%)
- 内容规模: 共50个知识片段平均长度320字
内容示例: [示例内容...]
你的输出应该类似:
"该知识库主要聚焦于技术领域(60%),涵盖商业(25%)和教育(15%)相关内容。共包含50个知识片段平均每个片段约320字内容详实。从示例来看内容涉及[具体主题],体现了[特点],对[目标用户]具有较高的参考价值。"
"""
        user_prompt = "\n".join(prompt_parts)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        # Plain chat call: the reply text is returned as-is (stripped).
        llm_client = get_llm_client()
        response = await llm_client.chat(messages=messages)
        insight = response.content.strip()

        business_logger.info(f"成功生成chunk洞察分析了 {min(len(chunks), max_chunks)} 个片段")
        return insight
    except Exception as e:
        # Best-effort: callers get a fixed failure message instead of an exception.
        business_logger.error(f"生成chunk洞察失败: {str(e)}")
        return "洞察生成失败"
if __name__ == "__main__":
    # Ad-hoc manual run: feed a few sample chunks through
    # generate_chunk_insight and print the resulting report.
    demo_chunks = [
        "Python是一种高级编程语言以其简洁的语法和强大的功能而闻名。它广泛应用于Web开发、数据分析、人工智能等领域。",
        "机器学习算法可以从数据中自动学习模式,无需显式编程。常见的算法包括决策树、随机森林、神经网络等。",
        "深度学习是机器学习的一个分支,使用多层神经网络来学习数据的层次化表示。它在图像识别、语音识别等任务中表现出色。",
        "自然语言处理技术使计算机能够理解和生成人类语言。应用包括机器翻译、情感分析、文本摘要等。",
        "数据科学结合了统计学、计算机科学和领域知识,用于从数据中提取有价值的洞察。",
    ]

    print("开始生成chunk洞察...")
    report = asyncio.run(generate_chunk_insight(demo_chunks))
    print(f"\n生成的洞察:\n{report}")

View File

@@ -0,0 +1,99 @@
"""
Generate summary for RAG chunks.
This module provides functionality to summarize chunk content using LLM.
"""
import asyncio
from typing import List, Dict, Any
from pydantic import BaseModel, Field
from app.core.memory.utils.llm.llm_utils import get_llm_client
from app.core.logging_config import get_business_logger
business_logger = get_business_logger()
class ChunkSummary(BaseModel):
    """Schema carrying a single concise summary string.

    The functions visible in this module do not reference it:
    generate_chunk_summary returns the raw chat reply instead.
    """

    summary: str = Field(
        ...,
        description="简洁的chunk内容摘要",
    )
async def generate_chunk_summary(chunks: List[str], max_chunks: int = 10) -> str:
    """Summarise the first ``max_chunks`` chunks with a single LLM chat call.

    Args:
        chunks: List of chunk content strings.
        max_chunks: Maximum number of chunks included in the prompt
            (default: 10).

    Returns:
        The stripped LLM reply; "暂无内容" when ``chunks`` is empty;
        "摘要生成失败" when any step raises.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于生成摘要")
        return "暂无内容"

    try:
        # Cap the number of chunks so the prompt stays within a sane token budget.
        selected = chunks[:max_chunks]

        # Number each fragment (1-based) and separate them with blank lines.
        numbered = (f"片段{pos}: {text}" for pos, text in enumerate(selected, start=1))
        combined_content = "\n\n".join(numbered)

        system_prompt = (
            "你是一位专业的文本摘要助手。请基于提供的文本片段,生成简洁的摘要。要求:\n"
            "- 摘要长度控制在100-150字\n"
            "- 提取核心信息和关键要点;\n"
            "- 使用客观、清晰的语言;\n"
            "- 避免冗余和重复;\n"
            "- 如果内容涉及多个主题,按重要性排序呈现。"
        )
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"请为以下文本片段生成摘要:\n\n{combined_content}"},
        ]

        client = get_llm_client()
        reply = await client.chat(messages=conversation)
        summary = reply.content.strip()

        business_logger.info(f"成功生成chunk摘要处理了 {len(selected)} 个片段")
        return summary
    except Exception as e:
        # Best-effort: callers get a fixed failure message instead of an exception.
        business_logger.error(f"生成chunk摘要失败: {str(e)}")
        return "摘要生成失败"
async def generate_chunk_summary_batch(
    chunks_list: List[List[str]], max_chunks: int = 10
) -> List[str]:
    """Generate summaries for several chunk lists concurrently.

    One ``generate_chunk_summary`` coroutine is created per inner list and
    all of them are awaited together with ``asyncio.gather``, so results come
    back in the same order as ``chunks_list``.

    Args:
        chunks_list: List of chunk lists, one summary is produced per entry.
        max_chunks: Per-list cap forwarded to ``generate_chunk_summary``.
            Defaults to 10, the same default that function uses, so existing
            callers see no change.

    Returns:
        List of summaries, one per entry of ``chunks_list`` (empty list for
        empty input).
    """
    return await asyncio.gather(
        *(generate_chunk_summary(chunks, max_chunks=max_chunks) for chunks in chunks_list)
    )
if __name__ == "__main__":
    # Ad-hoc manual run: summarise three sample chunks and print the result.
    demo_chunks = [
        "这是第一段测试内容,讲述了关于机器学习的基础知识。",
        "第二段内容介绍了深度学习的应用场景和发展历史。",
        "第三段讨论了自然语言处理技术的最新进展。",
    ]

    print("开始生成chunk摘要...")
    result_text = asyncio.run(generate_chunk_summary(demo_chunks))
    print(f"\n生成的摘要:\n{result_text}")

View File

@@ -0,0 +1,191 @@
"""
Extract tags from RAG chunks.
This module provides functionality to extract meaningful tags from chunk content using LLM.
"""
import asyncio
from collections import Counter
from typing import List, Tuple
from pydantic import BaseModel, Field
from app.core.memory.utils.llm.llm_utils import get_llm_client
from app.core.logging_config import get_business_logger
business_logger = get_business_logger()
class ExtractedTags(BaseModel):
    """Structured-output schema: the list of tags extracted from one prompt.

    Passed as ``response_model`` by extract_chunk_tags.
    """

    tags: List[str] = Field(
        ...,
        description="从文本中提取的关键标签列表",
    )
class ExtractedPersona(BaseModel):
    """Structured-output schema: the list of persona labels for a text.

    Passed as ``response_model`` by extract_chunk_persona.
    """

    personas: List[str] = Field(
        ...,
        description="从文本中提取的人物形象列表,如'产品设计师''旅行爱好者'",
    )
def _normalize_tags(raw_tags: List[str]) -> List[str]:
    """Strip whitespace, drop blank tags and de-duplicate, keeping first-seen order."""
    stripped = (tag.strip() for tag in raw_tags)
    return list(dict.fromkeys(tag for tag in stripped if tag))


async def extract_chunk_tags(chunks: List[str], max_tags: int = 10, max_chunks: int = 10) -> List[Tuple[str, int]]:
    """Extract tags from chunks and rank them by how many chunks mention them.

    Each of the first ``max_chunks`` chunks is sent to the LLM on its own.
    The tags returned for a chunk are whitespace-stripped, blank entries are
    dropped and repeats within that chunk are collapsed, so a tag's frequency
    is the number of processed chunks whose tag list contained it. Without
    this, variants such as "tag" and "tag " would be counted separately and a
    tag repeated inside one response would be over-counted.

    Args:
        chunks: List of chunk content strings.
        max_tags: Maximum number of tags to return (default: 10).
        max_chunks: Maximum number of chunks to process (default: 10).

    Returns:
        List of ``(tag, frequency)`` tuples sorted by descending frequency.
        Empty when ``chunks`` is empty or when setup fails; a failure on an
        individual chunk is logged and that chunk is skipped.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于提取标签")
        return []

    try:
        # Cap the number of chunks (one LLM call is made per chunk).
        chunks_to_process = chunks[:max_chunks]

        system_prompt = (
            "你是一位专业的文本分析专家,擅长从文本中提取关键标签。请遵循以下规则:\n\n"
            "1. **提取核心概念**: 识别文本中最重要的名词、专业术语、主题词;\n"
            "2. **过滤无意义词**: 排除过于宽泛的词(如'内容''信息''数据'\n"
            "3. **保持具体性**: 优先选择具体的、有代表性的词语;\n"
            "4. **标签数量**: 提取5-15个最具代表性的标签\n"
            "5. **去重合并**: 语义相近的标签只保留一个最核心的。\n\n"
            "标签应该是名词或名词短语,能够准确概括文本的核心内容。"
        )
        llm_client = get_llm_client()

        tag_counter = Counter()
        for chunk in chunks_to_process:
            single_chunk_prompt = f"请从以下文本中提取关键标签:\n\n{chunk}"
            single_messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": single_chunk_prompt},
            ]
            try:
                single_response = await llm_client.response_structured(
                    messages=single_messages,
                    response_model=ExtractedTags,
                )
                # Count each tag at most once per chunk.
                tag_counter.update(_normalize_tags(single_response.tags))
            except Exception as e:
                # Best-effort: one bad chunk must not discard the others' tags.
                business_logger.warning(f"处理单个chunk时出错: {str(e)}")
                continue

        most_common_tags = tag_counter.most_common(max_tags)
        business_logger.info(f"成功提取 {len(most_common_tags)} 个标签,处理了 {len(chunks_to_process)} 个片段")
        return most_common_tags
    except Exception as e:
        business_logger.error(f"提取chunk标签失败: {str(e)}")
        return []
async def extract_chunk_tags_with_frequency(chunks: List[str], max_tags: int = 10) -> List[Tuple[str, int]]:
    """Extract tags with frequencies computed across all chunks.

    Thin wrapper around ``extract_chunk_tags`` kept for backward
    compatibility. It is not a pure alias: it passes
    ``max_chunks=len(chunks)`` so every chunk is processed, whereas
    ``extract_chunk_tags`` defaults to the first 10.

    Args:
        chunks: List of chunk content strings.
        max_tags: Maximum number of tags to return.

    Returns:
        List of ``(tag, frequency)`` tuples, sorted by frequency.
    """
    # ``len(None)`` would raise TypeError; extract_chunk_tags itself accepts
    # any falsy ``chunks`` and returns [], so defer to it for that case.
    chunk_limit = len(chunks) if chunks else 0
    return await extract_chunk_tags(chunks, max_tags=max_tags, max_chunks=chunk_limit)
async def extract_chunk_persona(chunks: List[str], max_personas: int = 5, max_chunks: int = 20) -> List[str]:
    """Extract persona labels from chunks with one structured LLM call.

    The first ``max_chunks`` chunks are numbered, joined into one prompt and
    sent together; the returned labels are de-duplicated in first-seen order
    and truncated to ``max_personas``.

    Args:
        chunks: List of chunk content strings.
        max_personas: Maximum number of personas to return (default: 5).
        max_chunks: Maximum number of chunks to process (default: 20).

    Returns:
        List of persona strings such as "产品设计师", "旅行爱好者"; empty
        when ``chunks`` is empty or when any step raises.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于提取人物形象")
        return []

    try:
        chunks_to_process = chunks[:max_chunks]

        # Number each fragment (1-based) and separate them with blank lines.
        numbered = (f"片段{pos}: {text}" for pos, text in enumerate(chunks_to_process, start=1))
        combined_content = "\n\n".join(numbered)

        system_prompt = (
            "你是一位专业的人物画像分析专家,擅长从文本中提取人物形象标签。请遵循以下规则:\n\n"
            "1. **职业身份**: 识别职业、专业领域(如'产品设计师''软件工程师''创业者'\n"
            "2. **兴趣爱好**: 提取核心兴趣和爱好(如'旅行爱好者''摄影发烧友''咖啡控'\n"
            "3. **生活方式**: 概括生活态度和习惯(如'极简主义者''户外探险家''阅读爱好者'\n"
            "4. **个性特征**: 提炼显著的性格特点(如'思考者''行动派''完美主义者'\n"
            "5. **数量控制**: 提取3-8个最具代表性的人物形象标签\n"
            "6. **简洁明确**: 每个标签应该是简短的名词或名词短语2-6个字\n\n"
            "人物形象标签应该能够准确刻画这个人的核心特征和身份定位。"
        )
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"请从以下文本中提取人物形象标签:\n\n{combined_content}"},
        ]

        client = get_llm_client()
        result = await client.response_structured(
            messages=conversation,
            response_model=ExtractedPersona,
        )

        # dict keys keep insertion order, giving an order-preserving de-dup.
        unique_personas = dict.fromkeys(result.personas)
        personas = [*unique_personas][:max_personas]

        business_logger.info(f"成功提取 {len(personas)} 个人物形象,处理了 {len(chunks_to_process)} 个片段")
        return personas
    except Exception as e:
        # Best-effort: callers get an empty list instead of an exception.
        business_logger.error(f"提取人物形象失败: {str(e)}")
        return []
if __name__ == "__main__":
    # Ad-hoc manual run: extract tags, then personas, from three sample
    # chunks and print both results.
    demo_chunks = [
        "我是一名产品设计师,平时喜欢旅行和摄影。周末经常去户外徒步,探索新的风景。",
        "最近在学习咖啡拉花,已经能做出简单的图案了。每天早上都会给自己冲一杯手冲咖啡。",
        "喜欢阅读各类书籍,尤其是设计和心理学相关的。记录生活是我的习惯,用镜头捕捉美好瞬间。",
    ]

    print("开始提取chunk标签...")
    ranked_tags = asyncio.run(extract_chunk_tags(demo_chunks))
    print("\n提取的标签:")
    for label, count in ranked_tags:
        print(f"- {label} (频率: {count})")

    print("\n" + "=" * 50)

    print("开始提取人物形象...")
    persona_labels = asyncio.run(extract_chunk_persona(demo_chunks))
    print("\n提取的人物形象:")
    for label in persona_labels:
        print(f"- {label}")