Initial commit
This commit is contained in:
116
app/core/rag_utils/README.md
Normal file
116
app/core/rag_utils/README.md
Normal file
@@ -0,0 +1,116 @@
|
||||
# RAG Chunk 分析工具
|
||||
|
||||
这个模块提供了对 RAG chunk 内容进行分析的工具函数,包括摘要生成、标签提取和洞察分析。
|
||||
|
||||
## 功能模块
|
||||
|
||||
### 1. chunk_summary.py - Chunk 摘要生成
|
||||
- `generate_chunk_summary(chunks, max_chunks=10)`: 为给定的 chunk 列表生成简洁摘要
|
||||
- 使用 LLM 提取核心信息和关键要点
|
||||
- 摘要长度控制在 100-150 字
|
||||
|
||||
### 2. chunk_tags.py - 标签提取
|
||||
- `extract_chunk_tags(chunks, max_tags=10, max_chunks=10)`: 从 chunk 中提取关键标签
|
||||
- `extract_chunk_tags_with_frequency(chunks, max_tags=10)`: 提取标签并统计频率
- `extract_chunk_persona(chunks, max_personas=5, max_chunks=20)`: 从 chunk 中提取人物形象标签
|
||||
- 使用 LLM 识别核心概念和专业术语
|
||||
- 自动过滤无意义词汇
|
||||
|
||||
### 3. chunk_insight.py - 洞察分析
|
||||
- `generate_chunk_insight(chunks, max_chunks=15)`: 生成深度洞察报告
|
||||
- `classify_chunk_domain(chunk)`: 对 chunk 进行领域分类
|
||||
- `analyze_domain_distribution(chunks, max_chunks=20)`: 分析领域分布
|
||||
- 提供内容的主题、特点和价值分析
|
||||
|
||||
## 使用示例
|
||||
|
||||
```python
|
||||
from app.core.rag_utils import (
|
||||
generate_chunk_summary,
|
||||
extract_chunk_tags,
|
||||
generate_chunk_insight
|
||||
)
|
||||
|
||||
# 示例 chunk 数据
|
||||
chunks = [
|
||||
"机器学习是人工智能的一个重要分支...",
|
||||
"深度学习使用神经网络进行特征学习...",
|
||||
# ...
|
||||
]
|
||||
|
||||
# 生成摘要
|
||||
summary = await generate_chunk_summary(chunks, max_chunks=10)
|
||||
print(f"摘要: {summary}")
|
||||
|
||||
# 提取标签
|
||||
tags = await extract_chunk_tags(chunks, max_tags=10)
|
||||
print(f"标签: {tags}")
|
||||
|
||||
# 生成洞察
|
||||
insight = await generate_chunk_insight(chunks, max_chunks=15)
|
||||
print(f"洞察: {insight}")
|
||||
```
|
||||
|
||||
## API 接口
|
||||
|
||||
在 `memory_dashboard_controller.py` 中提供了两个对外接口:
|
||||
|
||||
### 1. GET /dashboard/chunk_summary_tag
|
||||
获取 chunk 总结和提取的标签
|
||||
|
||||
**参数:**
|
||||
- `end_user_id` (必填): 宿主ID
|
||||
- `limit` (可选, 默认15): 返回的chunk数量
|
||||
- `max_tags` (可选, 默认10): 最大标签数量
|
||||
|
||||
**返回:**
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"msg": "chunk摘要和标签获取成功",
|
||||
"data": {
|
||||
"summary": "chunk内容的总结...",
|
||||
"tags": [
|
||||
{"tag": "机器学习", "frequency": 5},
|
||||
{"tag": "深度学习", "frequency": 3}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. GET /dashboard/chunk_insight
|
||||
获取 chunk 的洞察内容
|
||||
|
||||
**参数:**
|
||||
- `end_user_id` (必填): 宿主ID
|
||||
- `limit` (可选, 默认15): 返回的chunk数量
|
||||
|
||||
**返回:**
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"msg": "chunk洞察获取成功",
|
||||
"data": {
|
||||
"insight": "该知识库主要聚焦于技术领域(60%)..."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 技术特点
|
||||
|
||||
1. **异步处理**: 所有函数都是异步的,支持高并发
|
||||
2. **LLM 驱动**: 使用大语言模型进行智能分析
|
||||
3. **可配置**: 支持自定义处理的 chunk 数量和标签数量
|
||||
4. **错误处理**: 完善的异常处理和日志记录
|
||||
5. **模块化设计**: 每个功能独立,易于维护和扩展
|
||||
|
||||
## 依赖
|
||||
|
||||
- `app.core.memory.utils.llm.llm_utils`: LLM 客户端
|
||||
- `app.core.logging_config`: 日志配置
|
||||
- `pydantic`: 数据验证和结构化输出
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. 所有函数都需要在异步上下文中调用(使用 `await`)
|
||||
2. 处理大量 chunk 时建议设置合理的 `max_chunks` 参数以控制 token 消耗
|
||||
3. LLM 调用可能需要一定时间,建议在前端显示加载状态
|
||||
14
app/core/rag_utils/__init__.py
Normal file
14
app/core/rag_utils/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""
|
||||
RAG chunk analysis utilities.
|
||||
"""
|
||||
|
||||
from .chunk_summary import generate_chunk_summary
|
||||
from .chunk_tags import extract_chunk_tags, extract_chunk_persona
|
||||
from .chunk_insight import generate_chunk_insight
|
||||
|
||||
# Names re-exported from the submodules imported above; this list is the
# package's declared public API for `from app.core.rag_utils import *`.
__all__ = [
    "generate_chunk_summary",
    "extract_chunk_tags",
    "extract_chunk_persona",
    "generate_chunk_insight",
]
|
||||
205
app/core/rag_utils/chunk_insight.py
Normal file
205
app/core/rag_utils/chunk_insight.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Generate insights from RAG chunks.
|
||||
|
||||
This module provides functionality to analyze chunk content and generate insights using LLM.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Dict, Any
|
||||
from collections import Counter
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.core.memory.utils.llm.llm_utils import get_llm_client
|
||||
from app.core.logging_config import get_business_logger
|
||||
|
||||
# Module-wide logger; the functions below use it for warnings, errors and
# success messages.
business_logger = get_business_logger()
|
||||
|
||||
|
||||
class ChunkInsight(BaseModel):
    """Schema holding a single free-text insight about chunk content."""

    # Required field: the insight text itself.
    insight: str = Field(
        ...,
        description="对chunk内容的深度洞察分析",
    )
|
||||
|
||||
|
||||
class DomainClassification(BaseModel):
    """Structured-response schema carrying the domain label chosen for a chunk."""

    # Required field: the domain name. The examples mirror the domain list
    # offered to the LLM in classify_chunk_domain's prompt.
    domain: str = Field(
        ...,
        description="内容所属的领域分类",
        examples=["技术", "商业", "教育", "生活", "娱乐", "健康", "其他"],
    )
|
||||
|
||||
|
||||
async def classify_chunk_domain(chunk: str) -> str:
    """
    Ask the LLM which domain a single chunk belongs to.

    Only the first 500 characters of the chunk are placed in the prompt, and
    the reply is requested as a ``DomainClassification`` structured response.

    Args:
        chunk: Chunk content string

    Returns:
        The ``domain`` value of the structured response, or "其他" when the
        response is falsy or any exception is raised along the way.
    """
    fallback_domain = "其他"

    try:
        client = get_llm_client()

        # Cap the amount of chunk text sent to the model.
        excerpt = chunk[:500]
        user_prompt = (
            "请将以下文本内容归类到最合适的领域中。\n"
            "\n"
            "可选领域及其关键词:\n"
            "- 技术:编程、软件、硬件、算法、数据、网络、系统、开发、工程等\n"
            "- 商业:市场、销售、管理、财务、投资、创业、营销、战略等\n"
            "- 教育:学习、课程、培训、教学、知识、技能、考试、研究等\n"
            "- 生活:日常、家庭、饮食、购物、旅行、休闲、娱乐等\n"
            "- 娱乐:游戏、电影、音乐、体育、艺术、文化等\n"
            "- 健康:医疗、养生、运动、心理、保健、疾病等\n"
            "- 其他:无法归入以上类别的内容\n"
            "\n"
            f"文本内容: {excerpt}...\n"
            "\n"
            "请直接返回最合适的领域名称。"
        )

        system_message = {
            "role": "system",
            "content": "你是一个专业的文本分类助手。请仔细分析文本内容,选择最合适的领域分类。",
        }
        user_message = {"role": "user", "content": user_prompt}

        result = await client.response_structured(
            messages=[system_message, user_message],
            response_model=DomainClassification,
        )

        if not result:
            return fallback_domain
        return result.domain

    except Exception as exc:
        # Best effort: a failed classification is logged and mapped to the
        # fallback domain instead of propagating.
        business_logger.error(f"分类chunk领域失败: {str(exc)}")
        return fallback_domain
|
||||
|
||||
|
||||
async def analyze_domain_distribution(chunks: List[str], max_chunks: int = 20) -> Dict[str, float]:
    """
    Compute the share of each domain among the leading chunks.

    Each of the first ``max_chunks`` chunks is classified, one after another,
    with ``classify_chunk_domain``; the counts are then turned into fractions
    of the number of classified chunks.

    Args:
        chunks: List of chunk content strings
        max_chunks: Maximum number of chunks to analyze

    Returns:
        Dictionary of domain -> fraction (0..1), ordered from the largest
        share to the smallest. Empty when ``chunks`` is empty or when an
        exception is raised.
    """
    if not chunks:
        return {}

    try:
        tally = Counter()
        for text in chunks[:max_chunks]:
            label = await classify_chunk_domain(text)
            tally[label] += 1

        analyzed = sum(tally.values())

        # Convert counts to fractions and rank them, largest share first.
        ranked = sorted(
            ((name, hits / analyzed) for name, hits in tally.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return dict(ranked)

    except Exception as exc:
        business_logger.error(f"分析领域分布失败: {str(exc)}")
        return {}
|
||||
|
||||
|
||||
async def generate_chunk_insight(chunks: List[str], max_chunks: int = 15) -> str:
    """
    Produce a short insight report about the given chunks.

    The user prompt is assembled from three pieces: the top three domains
    returned by ``analyze_domain_distribution`` (when it returns any), the
    total number of chunks with their average length, and 200-character
    excerpts of the first five chunks. The LLM's chat reply is returned
    with surrounding whitespace stripped.

    Args:
        chunks: List of chunk content strings
        max_chunks: Maximum number of chunks passed to the domain analysis

    Returns:
        The generated report, "暂无足够数据生成洞察报告" when ``chunks`` is
        empty, or "洞察生成失败" when an exception is raised.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于生成洞察")
        return "暂无足够数据生成洞察报告"

    try:
        # Step 1: domain distribution (limited to max_chunks chunks).
        distribution = await analyze_domain_distribution(chunks, max_chunks=max_chunks)

        # Step 2: basic statistics, computed over ALL chunks.
        chunk_count = len(chunks)
        if chunk_count > 0:
            mean_length = sum(map(len, chunks)) / chunk_count
        else:
            mean_length = 0

        # Step 3: assemble the user prompt line by line.
        lines = []

        if distribution:
            leading = list(distribution.items())[:3]
            rendered = ", ".join(f"{name}({share:.0%})" for name, share in leading)
            lines.append(f"- 内容领域分布: {rendered}")

        lines.append(f"- 内容规模: 共{chunk_count}个知识片段,平均长度{mean_length:.0f}字")

        # A few truncated chunks give the model concrete material to refer to.
        previews = [
            f"示例{number}: {text[:200]}..."
            for number, text in enumerate(chunks[:5], start=1)
        ]
        joined_previews = "\n".join(previews)
        lines.append(f"\n内容示例:\n{joined_previews}")

        system_prompt = (
            "你是一位专业的知识内容分析师。你的任务是根据提供的信息,生成一段简洁、有洞察力的分析报告。\n"
            "\n"
            "重要规则:\n"
            "1. 报告需要将所有要点流畅地串联成一个段落\n"
            "2. 语言风格要专业、客观,同时易于理解\n"
            "3. 不要添加任何额外的解释或标题,直接输出报告内容\n"
            "4. 基于提供的数据和示例内容进行分析,不要编造信息\n"
            "5. 重点关注内容的主题、特点和价值\n"
            "6. 报告长度控制在150-200字\n"
            "\n"
            "例如,如果输入是:\n"
            "- 内容领域分布: 技术(60%), 商业(25%), 教育(15%)\n"
            "- 内容规模: 共50个知识片段,平均长度320字\n"
            "内容示例: [示例内容...]\n"
            "\n"
            "你的输出应该类似:\n"
            '"该知识库主要聚焦于技术领域(60%),涵盖商业(25%)和教育(15%)相关内容。共包含50个知识片段,平均每个片段约320字,内容详实。从示例来看,内容涉及[具体主题],体现了[特点],对[目标用户]具有较高的参考价值。"\n'
        )

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "\n".join(lines)},
        ]

        # Step 4: ask the LLM for the report.
        client = get_llm_client()
        reply = await client.chat(messages=conversation)
        report = reply.content.strip()

        business_logger.info(f"成功生成chunk洞察,分析了 {min(len(chunks), max_chunks)} 个片段")
        return report

    except Exception as exc:
        business_logger.error(f"生成chunk洞察失败: {str(exc)}")
        return "洞察生成失败"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: run the insight pipeline on a few sample chunks and
    # print the result.
    demo_chunks = [
        "Python是一种高级编程语言,以其简洁的语法和强大的功能而闻名。它广泛应用于Web开发、数据分析、人工智能等领域。",
        "机器学习算法可以从数据中自动学习模式,无需显式编程。常见的算法包括决策树、随机森林、神经网络等。",
        "深度学习是机器学习的一个分支,使用多层神经网络来学习数据的层次化表示。它在图像识别、语音识别等任务中表现出色。",
        "自然语言处理技术使计算机能够理解和生成人类语言。应用包括机器翻译、情感分析、文本摘要等。",
        "数据科学结合了统计学、计算机科学和领域知识,用于从数据中提取有价值的洞察。",
    ]

    print("开始生成chunk洞察...")
    report = asyncio.run(generate_chunk_insight(demo_chunks))
    print(f"\n生成的洞察:\n{report}")
|
||||
99
app/core/rag_utils/chunk_summary.py
Normal file
99
app/core/rag_utils/chunk_summary.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
Generate summary for RAG chunks.
|
||||
|
||||
This module provides functionality to summarize chunk content using LLM.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.core.memory.utils.llm.llm_utils import get_llm_client
|
||||
from app.core.logging_config import get_business_logger
|
||||
|
||||
# Module-wide logger; the summary functions below use it for warnings,
# errors and success messages.
business_logger = get_business_logger()
|
||||
|
||||
|
||||
class ChunkSummary(BaseModel):
    """Schema holding a single concise summary of chunk content."""

    # Required field: the summary text.
    summary: str = Field(
        ...,
        description="简洁的chunk内容摘要",
    )
|
||||
|
||||
|
||||
async def generate_chunk_summary(chunks: List[str], max_chunks: int = 10) -> str:
    """
    Summarize the leading chunks with a single LLM chat call.

    The first ``max_chunks`` chunks are numbered, joined with blank lines and
    sent to the LLM together with a system prompt asking for a 100-150
    character summary. The reply is returned with surrounding whitespace
    stripped.

    Args:
        chunks: List of chunk content strings
        max_chunks: Maximum number of chunks to process (default: 10)

    Returns:
        The generated summary, "暂无内容" when ``chunks`` is empty, or
        "摘要生成失败" when an exception is raised.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于生成摘要")
        return "暂无内容"

    try:
        # Cap the number of chunks so the prompt does not grow unbounded.
        selected = chunks[:max_chunks]

        numbered = [
            f"片段{position}: {text}"
            for position, text in enumerate(selected, start=1)
        ]
        merged_text = "\n\n".join(numbered)

        instructions = "\n".join(
            [
                "你是一位专业的文本摘要助手。请基于提供的文本片段,生成简洁的摘要。要求:",
                "- 摘要长度控制在100-150字;",
                "- 提取核心信息和关键要点;",
                "- 使用客观、清晰的语言;",
                "- 避免冗余和重复;",
                "- 如果内容涉及多个主题,按重要性排序呈现。",
            ]
        )

        conversation = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"请为以下文本片段生成摘要:\n\n{merged_text}"},
        ]

        client = get_llm_client()
        reply = await client.chat(messages=conversation)
        result = reply.content.strip()

        business_logger.info(f"成功生成chunk摘要,处理了 {len(selected)} 个片段")
        return result

    except Exception as exc:
        business_logger.error(f"生成chunk摘要失败: {str(exc)}")
        return "摘要生成失败"
|
||||
|
||||
|
||||
async def generate_chunk_summary_batch(chunks_list: List[List[str]], max_chunks: int = 10) -> List[str]:
    """
    Generate summaries for multiple chunk lists concurrently.

    One ``generate_chunk_summary`` coroutine is created per inner list and
    all of them are awaited together with ``asyncio.gather``, so the returned
    summaries are in the same order as ``chunks_list``.

    Args:
        chunks_list: List of chunk lists
        max_chunks: Maximum number of chunks to process per inner list,
            forwarded to ``generate_chunk_summary`` (default: 10, the same
            default that function uses)

    Returns:
        List of summaries, one per inner list; an empty list when
        ``chunks_list`` is empty or None.
    """
    # Nothing to summarize: avoid iterating None and skip scheduling entirely.
    if not chunks_list:
        return []

    pending = [
        generate_chunk_summary(chunks, max_chunks=max_chunks)
        for chunks in chunks_list
    ]
    return list(await asyncio.gather(*pending))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: summarize a few sample chunks and print the result.
    demo_chunks = [
        "这是第一段测试内容,讲述了关于机器学习的基础知识。",
        "第二段内容介绍了深度学习的应用场景和发展历史。",
        "第三段讨论了自然语言处理技术的最新进展。",
    ]

    print("开始生成chunk摘要...")
    result = asyncio.run(generate_chunk_summary(demo_chunks))
    print(f"\n生成的摘要:\n{result}")
|
||||
191
app/core/rag_utils/chunk_tags.py
Normal file
191
app/core/rag_utils/chunk_tags.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
Extract tags from RAG chunks.
|
||||
|
||||
This module provides functionality to extract meaningful tags from chunk content using LLM.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from collections import Counter
|
||||
from typing import List, Tuple
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.core.memory.utils.llm.llm_utils import get_llm_client
|
||||
from app.core.logging_config import get_business_logger
|
||||
|
||||
# Module-wide logger; the tag and persona functions below use it for
# warnings, errors and success messages.
business_logger = get_business_logger()
|
||||
|
||||
|
||||
class ExtractedTags(BaseModel):
    """Structured-response schema carrying the tags extracted from a text."""

    # Required field: the list of tag strings.
    tags: List[str] = Field(
        ...,
        description="从文本中提取的关键标签列表",
    )
|
||||
|
||||
|
||||
class ExtractedPersona(BaseModel):
    """Structured-response schema carrying persona labels extracted from a text."""

    # Required field: the list of persona label strings.
    personas: List[str] = Field(
        ...,
        description="从文本中提取的人物形象列表,如'产品设计师'、'旅行爱好者'等",
    )
|
||||
|
||||
|
||||
async def extract_chunk_tags(chunks: List[str], max_tags: int = 10, max_chunks: int = 10) -> List[Tuple[str, int]]:
    """
    Extract meaningful tags from the given chunks and count how often each occurs.

    Each of the first ``max_chunks`` chunks is sent to the LLM on its own,
    one after another, and the reply is requested as an ``ExtractedTags``
    structured response. Tags are stripped of surrounding whitespace and
    empty tags are dropped before counting, so formatting variants of the
    same tag are counted together. A chunk whose request fails is logged
    and skipped; the remaining chunks are still processed.

    Args:
        chunks: List of chunk content strings
        max_tags: Maximum number of tags to return (default: 10)
        max_chunks: Maximum number of chunks to process (default: 10)

    Returns:
        List of tuples (tag, frequency), sorted by frequency in descending
        order. Empty when ``chunks`` is empty or when an unexpected
        exception is raised.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于提取标签")
        return []

    try:
        # Cap the number of chunks (one LLM call is made per chunk).
        chunks_to_process = chunks[:max_chunks]

        system_prompt = (
            "你是一位专业的文本分析专家,擅长从文本中提取关键标签。请遵循以下规则:\n\n"
            "1. **提取核心概念**: 识别文本中最重要的名词、专业术语、主题词;\n"
            "2. **过滤无意义词**: 排除过于宽泛的词(如'内容'、'信息'、'数据');\n"
            "3. **保持具体性**: 优先选择具体的、有代表性的词语;\n"
            "4. **标签数量**: 提取5-15个最具代表性的标签;\n"
            "5. **去重合并**: 语义相近的标签只保留一个最核心的。\n\n"
            "标签应该是名词或名词短语,能够准确概括文本的核心内容。"
        )

        llm_client = get_llm_client()

        # Count tags across all per-chunk responses.
        tag_counter = Counter()
        for chunk in chunks_to_process:
            single_messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"请从以下文本中提取关键标签:\n\n{chunk}"},
            ]

            try:
                single_response = await llm_client.response_structured(
                    messages=single_messages,
                    response_model=ExtractedTags,
                )
                # A falsy response contributes no tags instead of raising.
                raw_tags = single_response.tags if single_response else []
                # Normalize before counting so " tag" and "tag" are one tag
                # and blank strings never show up in the result.
                cleaned_tags = [
                    cleaned
                    for cleaned in (tag.strip() for tag in raw_tags)
                    if cleaned
                ]
            except Exception as e:
                # Best effort: one bad chunk must not abort the whole run.
                business_logger.warning(f"处理单个chunk时出错: {str(e)}")
                continue

            tag_counter.update(cleaned_tags)

        # Keep only the most frequent tags.
        most_common_tags = tag_counter.most_common(max_tags)

        business_logger.info(f"成功提取 {len(most_common_tags)} 个标签,处理了 {len(chunks_to_process)} 个片段")

        return most_common_tags

    except Exception as e:
        business_logger.error(f"提取chunk标签失败: {str(e)}")
        return []
|
||||
|
||||
|
||||
async def extract_chunk_tags_with_frequency(chunks: List[str], max_tags: int = 10) -> List[Tuple[str, int]]:
    """
    Extract tags with their frequency across ALL given chunks.

    Thin wrapper around ``extract_chunk_tags`` that sets ``max_chunks`` to
    the number of chunks supplied, so no chunk is left out of the count.

    Args:
        chunks: List of chunk content strings
        max_tags: Maximum number of tags to return

    Returns:
        List of tuples (tag, frequency), sorted by frequency; an empty list
        when ``chunks`` is empty or None.
    """
    # len(None) would raise TypeError before extract_chunk_tags could apply
    # its own empty-input handling, so fall back to 0 for falsy input.
    chunk_limit = len(chunks) if chunks else 0
    return await extract_chunk_tags(chunks, max_tags=max_tags, max_chunks=chunk_limit)
|
||||
|
||||
|
||||
async def extract_chunk_persona(chunks: List[str], max_personas: int = 5, max_chunks: int = 20) -> List[str]:
    """
    Extract persona labels from the given chunks with one LLM call.

    The first ``max_chunks`` chunks are numbered, joined with blank lines and
    sent to the LLM; the reply is requested as an ``ExtractedPersona``
    structured response. Duplicate labels are removed while keeping their
    first-seen order, and the result is cut to ``max_personas`` entries.

    Args:
        chunks: List of chunk content strings
        max_personas: Maximum number of personas to return (default: 5)
        max_chunks: Maximum number of chunks to process (default: 20)

    Returns:
        List of persona strings like "产品设计师", "旅行爱好者", "摄影发烧友";
        empty when ``chunks`` is empty or when an exception is raised.
    """
    if not chunks:
        business_logger.warning("没有提供chunk内容用于提取人物形象")
        return []

    try:
        # Cap the number of chunks placed in the prompt.
        selected = chunks[:max_chunks]

        numbered = [
            f"片段{position}: {text}"
            for position, text in enumerate(selected, start=1)
        ]
        merged_text = "\n\n".join(numbered)

        instructions = (
            "你是一位专业的人物画像分析专家,擅长从文本中提取人物形象标签。请遵循以下规则:\n\n"
            "1. **职业身份**: 识别职业、专业领域(如'产品设计师'、'软件工程师'、'创业者');\n"
            "2. **兴趣爱好**: 提取核心兴趣和爱好(如'旅行爱好者'、'摄影发烧友'、'咖啡控');\n"
            "3. **生活方式**: 概括生活态度和习惯(如'极简主义者'、'户外探险家'、'阅读爱好者');\n"
            "4. **个性特征**: 提炼显著的性格特点(如'思考者'、'行动派'、'完美主义者');\n"
            "5. **数量控制**: 提取3-8个最具代表性的人物形象标签;\n"
            "6. **简洁明确**: 每个标签应该是简短的名词或名词短语(2-6个字)。\n\n"
            "人物形象标签应该能够准确刻画这个人的核心特征和身份定位。"
        )

        conversation = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"请从以下文本中提取人物形象标签:\n\n{merged_text}"},
        ]

        client = get_llm_client()
        reply = await client.response_structured(
            messages=conversation,
            response_model=ExtractedPersona,
        )

        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_labels = list(dict.fromkeys(reply.personas))
        result = unique_labels[:max_personas]

        business_logger.info(f"成功提取 {len(result)} 个人物形象,处理了 {len(selected)} 个片段")
        return result

    except Exception as exc:
        business_logger.error(f"提取人物形象失败: {str(exc)}")
        return []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: extract tags and then personas from sample chunks.
    demo_chunks = [
        "我是一名产品设计师,平时喜欢旅行和摄影。周末经常去户外徒步,探索新的风景。",
        "最近在学习咖啡拉花,已经能做出简单的图案了。每天早上都会给自己冲一杯手冲咖啡。",
        "喜欢阅读各类书籍,尤其是设计和心理学相关的。记录生活是我的习惯,用镜头捕捉美好瞬间。",
    ]

    print("开始提取chunk标签...")
    found_tags = asyncio.run(extract_chunk_tags(demo_chunks))
    print("\n提取的标签:")
    for label, count in found_tags:
        print(f"- {label} (频率: {count})")

    separator = "=" * 50
    print("\n" + separator)
    print("开始提取人物形象...")
    found_personas = asyncio.run(extract_chunk_persona(demo_chunks))
    print("\n提取的人物形象:")
    for label in found_personas:
        print(f"- {label}")
|
||||
Reference in New Issue
Block a user