[add] Paginated display of RAG storage content

This commit is contained in:
lanceyq
2026-03-12 18:35:09 +08:00
parent 6029a5a9a8
commit 110de0afbc
2 changed files with 63 additions and 43 deletions

View File

@@ -403,14 +403,15 @@ def get_current_user_rag_total_num(
@router.get("/rag_content", response_model=ApiResponse)
def get_rag_content(
end_user_id: str = Query(..., description="宿主ID"),
limit: int = Query(15, description="返回记录数"),
page: int = Query(1, gt=0, description="页码从1开始"),
pagesize: int = Query(15, gt=0, le=100, description="每页返回记录数"),
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""
获取当前宿主知识库中的chunk内容
获取当前宿主知识库中的chunk内容(分页)
"""
data = memory_dashboard_service.get_rag_content(end_user_id, limit, db, current_user)
data = memory_dashboard_service.get_rag_content(end_user_id, page, pagesize, db, current_user)
return success(data=data, msg="宿主RAGchunk数据获取成功")

View File

@@ -535,7 +535,8 @@ def get_users_total_chunk_batch(
def get_rag_content(
end_user_id: str,
limit: int,
page: int,
pagesize: int,
db: Session,
current_user: User
) -> dict:
@@ -543,9 +544,9 @@ def get_rag_content(
先在documents表中查询file_name=='end_user_id'+'.txt'的id和kb_id,
然后调用/chunks/{kb_id}/{document_id}/chunks接口的相关代码获取所有内容
接着对获取的内容进行提取只要page_content的内容
最后返回数据
最后返回分页数据
"""
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, limit={limit}, 操作者: {current_user.username}")
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, page={page}, pagesize={pagesize}, 操作者: {current_user.username}")
try:
from app.models.document_model import Document
@@ -562,63 +563,76 @@ def get_rag_content(
if not documents:
business_logger.warning(f"未找到文件: {file_name}")
return {
"total": 0,
"contents": []
"page": {
"page": page,
"pagesize": pagesize,
"total": 0,
"hasnext": False,
},
"items": []
}
business_logger.info(f"找到 {len(documents)} 个文档记录")
# 3. 获取所有chunks的page_content
all_contents = []
total_chunks = 0
# 3. 按全局偏移量计算当前页数据
# 全局偏移范围:[offset_start, offset_end)
offset_start = (page - 1) * pagesize
offset_end = offset_start + pagesize
global_total = 0 # 所有文档的 chunk 总数
page_contents = [] # 当前页的内容
for document in documents:
try:
# 获取知识库信息
kb = knowledge_repository.get_knowledge_by_id(db, document.kb_id)
if not kb:
business_logger.warning(f"知识库不存在: kb_id={document.kb_id}")
continue
# 初始化向量服务
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=kb)
# 获取该文档的所有chunks分页获取
page = 1
pagesize = 100 # 每页100条
# 先用 pagesize=1 获取该文档的 chunk 总数
doc_total, _ = vector_service.search_by_segment(
document_id=str(document.id),
query=None,
pagesize=1,
page=1,
asc=True
)
while True:
total, items = vector_service.search_by_segment(
doc_offset_start = global_total # 该文档在全局中的起始偏移
doc_offset_end = global_total + doc_total # 该文档在全局中的结束偏移
global_total += doc_total
# 当前页与该文档无交集,跳过
if doc_offset_end <= offset_start or doc_offset_start >= offset_end:
continue
# 计算需要从该文档取的局部范围
local_start = max(offset_start - doc_offset_start, 0)
local_end = min(offset_end - doc_offset_start, doc_total)
need_count = local_end - local_start
# 换算成 ES 分页参数ES page 从1开始
es_page = (local_start // pagesize) + 1
es_offset_in_page = local_start % pagesize
fetched = []
while len(fetched) < es_offset_in_page + need_count:
_, items = vector_service.search_by_segment(
document_id=str(document.id),
query=None,
pagesize=pagesize,
page=page,
page=es_page,
asc=True
)
if not items:
break
# 提取page_content
for item in items:
all_contents.append(item.page_content)
total_chunks += 1
# # 如果达到limit限制直接返回
# if limit > 0 and total_chunks >= limit:
# business_logger.info(f"已达到limit限制: {limit}")
# return {
# "total": total_chunks,
# "contents": all_contents[:limit]
# }
# 检查是否还有下一页
if page * pagesize >= total:
break
page += 1
fetched.extend(items)
es_page += 1
business_logger.info(f"文档 {document.id} 获取了 {len(items)} 个chunks")
slice_items = fetched[es_offset_in_page: es_offset_in_page + need_count]
page_contents.extend([item.page_content for item in slice_items])
except Exception as e:
business_logger.error(f"获取文档 {document.id} 的chunks失败: {str(e)}")
@@ -626,11 +640,16 @@ def get_rag_content(
# 4. 返回结果
result = {
"total": total_chunks,
"contents": all_contents[:limit] if limit > 0 else all_contents
"page": {
"page": page,
"pagesize": pagesize,
"total": global_total,
"hasnext": offset_end < global_total,
},
"items": page_contents
}
business_logger.info(f"成功获取RAG内容: total={total_chunks}, 返回={len(result['contents'])}")
business_logger.info(f"成功获取RAG内容: total={global_total}, page={page}, 返回={len(page_contents)}")
return result
except Exception as e: