Merge pull request #544 from SuanmoSuanyangTechnology/fix/RAG-show
[add] RAG storage displays the page effect
This commit is contained in:
@@ -403,14 +403,15 @@ def get_current_user_rag_total_num(
|
|||||||
@router.get("/rag_content", response_model=ApiResponse)
|
@router.get("/rag_content", response_model=ApiResponse)
|
||||||
def get_rag_content(
|
def get_rag_content(
|
||||||
end_user_id: str = Query(..., description="宿主ID"),
|
end_user_id: str = Query(..., description="宿主ID"),
|
||||||
limit: int = Query(15, description="返回记录数"),
|
page: int = Query(1, gt=0, description="页码,从1开始"),
|
||||||
|
pagesize: int = Query(15, gt=0, le=100, description="每页返回记录数"),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
current_user: User = Depends(get_current_user),
|
current_user: User = Depends(get_current_user),
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
获取当前宿主知识库中的chunk内容
|
获取当前宿主知识库中的chunk内容(分页)
|
||||||
"""
|
"""
|
||||||
data = memory_dashboard_service.get_rag_content(end_user_id, limit, db, current_user)
|
data = memory_dashboard_service.get_rag_content(end_user_id, page, pagesize, db, current_user)
|
||||||
return success(data=data, msg="宿主RAGchunk数据获取成功")
|
return success(data=data, msg="宿主RAGchunk数据获取成功")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -535,7 +535,8 @@ def get_users_total_chunk_batch(
|
|||||||
|
|
||||||
def get_rag_content(
|
def get_rag_content(
|
||||||
end_user_id: str,
|
end_user_id: str,
|
||||||
limit: int,
|
page: int,
|
||||||
|
pagesize: int,
|
||||||
db: Session,
|
db: Session,
|
||||||
current_user: User
|
current_user: User
|
||||||
) -> dict:
|
) -> dict:
|
||||||
@@ -543,9 +544,9 @@ def get_rag_content(
|
|||||||
先在documents表中查询file_name=='end_user_id'+'.txt'的id和kb_id,
|
先在documents表中查询file_name=='end_user_id'+'.txt'的id和kb_id,
|
||||||
然后调用/chunks/{kb_id}/{document_id}/chunks接口的相关代码获取所有内容,
|
然后调用/chunks/{kb_id}/{document_id}/chunks接口的相关代码获取所有内容,
|
||||||
接着对获取的内容进行提取,只要page_content的内容,
|
接着对获取的内容进行提取,只要page_content的内容,
|
||||||
最后返回数据
|
最后返回分页数据
|
||||||
"""
|
"""
|
||||||
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, limit={limit}, 操作者: {current_user.username}")
|
business_logger.info(f"获取RAG内容: end_user_id={end_user_id}, page={page}, pagesize={pagesize}, 操作者: {current_user.username}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from app.models.document_model import Document
|
from app.models.document_model import Document
|
||||||
@@ -562,63 +563,76 @@ def get_rag_content(
|
|||||||
if not documents:
|
if not documents:
|
||||||
business_logger.warning(f"未找到文件: {file_name}")
|
business_logger.warning(f"未找到文件: {file_name}")
|
||||||
return {
|
return {
|
||||||
"total": 0,
|
"page": {
|
||||||
"contents": []
|
"page": page,
|
||||||
|
"pagesize": pagesize,
|
||||||
|
"total": 0,
|
||||||
|
"hasnext": False,
|
||||||
|
},
|
||||||
|
"items": []
|
||||||
}
|
}
|
||||||
|
|
||||||
business_logger.info(f"找到 {len(documents)} 个文档记录")
|
business_logger.info(f"找到 {len(documents)} 个文档记录")
|
||||||
|
|
||||||
# 3. 获取所有chunks的page_content
|
# 3. 按全局偏移量计算当前页数据
|
||||||
all_contents = []
|
# 全局偏移范围:[offset_start, offset_end)
|
||||||
total_chunks = 0
|
offset_start = (page - 1) * pagesize
|
||||||
|
offset_end = offset_start + pagesize
|
||||||
|
|
||||||
|
global_total = 0 # 所有文档的 chunk 总数
|
||||||
|
page_contents = [] # 当前页的内容
|
||||||
|
|
||||||
for document in documents:
|
for document in documents:
|
||||||
try:
|
try:
|
||||||
# 获取知识库信息
|
|
||||||
kb = knowledge_repository.get_knowledge_by_id(db, document.kb_id)
|
kb = knowledge_repository.get_knowledge_by_id(db, document.kb_id)
|
||||||
if not kb:
|
if not kb:
|
||||||
business_logger.warning(f"知识库不存在: kb_id={document.kb_id}")
|
business_logger.warning(f"知识库不存在: kb_id={document.kb_id}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 初始化向量服务
|
|
||||||
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=kb)
|
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=kb)
|
||||||
|
|
||||||
# 获取该文档的所有chunks(分页获取)
|
# 先用 pagesize=1 获取该文档的 chunk 总数
|
||||||
page = 1
|
doc_total, _ = vector_service.search_by_segment(
|
||||||
pagesize = 100 # 每页100条
|
document_id=str(document.id),
|
||||||
|
query=None,
|
||||||
|
pagesize=1,
|
||||||
|
page=1,
|
||||||
|
asc=True
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
doc_offset_start = global_total # 该文档在全局中的起始偏移
|
||||||
total, items = vector_service.search_by_segment(
|
doc_offset_end = global_total + doc_total # 该文档在全局中的结束偏移
|
||||||
|
global_total += doc_total
|
||||||
|
|
||||||
|
# 当前页与该文档无交集,跳过
|
||||||
|
if doc_offset_end <= offset_start or doc_offset_start >= offset_end:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 计算需要从该文档取的局部范围
|
||||||
|
local_start = max(offset_start - doc_offset_start, 0)
|
||||||
|
local_end = min(offset_end - doc_offset_start, doc_total)
|
||||||
|
need_count = local_end - local_start
|
||||||
|
|
||||||
|
# 换算成 ES 分页参数(ES page 从1开始)
|
||||||
|
es_page = (local_start // pagesize) + 1
|
||||||
|
es_offset_in_page = local_start % pagesize
|
||||||
|
|
||||||
|
fetched = []
|
||||||
|
while len(fetched) < es_offset_in_page + need_count:
|
||||||
|
_, items = vector_service.search_by_segment(
|
||||||
document_id=str(document.id),
|
document_id=str(document.id),
|
||||||
query=None,
|
query=None,
|
||||||
pagesize=pagesize,
|
pagesize=pagesize,
|
||||||
page=page,
|
page=es_page,
|
||||||
asc=True
|
asc=True
|
||||||
)
|
)
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
break
|
break
|
||||||
|
fetched.extend(items)
|
||||||
# 提取page_content
|
es_page += 1
|
||||||
for item in items:
|
|
||||||
all_contents.append(item.page_content)
|
|
||||||
total_chunks += 1
|
|
||||||
|
|
||||||
# # 如果达到limit限制,直接返回
|
|
||||||
# if limit > 0 and total_chunks >= limit:
|
|
||||||
# business_logger.info(f"已达到limit限制: {limit}")
|
|
||||||
# return {
|
|
||||||
# "total": total_chunks,
|
|
||||||
# "contents": all_contents[:limit]
|
|
||||||
# }
|
|
||||||
|
|
||||||
# 检查是否还有下一页
|
|
||||||
if page * pagesize >= total:
|
|
||||||
break
|
|
||||||
|
|
||||||
page += 1
|
|
||||||
|
|
||||||
business_logger.info(f"文档 {document.id} 获取了 {len(items)} 个chunks")
|
slice_items = fetched[es_offset_in_page: es_offset_in_page + need_count]
|
||||||
|
page_contents.extend([item.page_content for item in slice_items])
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
business_logger.error(f"获取文档 {document.id} 的chunks失败: {str(e)}")
|
business_logger.error(f"获取文档 {document.id} 的chunks失败: {str(e)}")
|
||||||
@@ -626,11 +640,16 @@ def get_rag_content(
|
|||||||
|
|
||||||
# 4. 返回结果
|
# 4. 返回结果
|
||||||
result = {
|
result = {
|
||||||
"total": total_chunks,
|
"page": {
|
||||||
"contents": all_contents[:limit] if limit > 0 else all_contents
|
"page": page,
|
||||||
|
"pagesize": pagesize,
|
||||||
|
"total": global_total,
|
||||||
|
"hasnext": offset_end < global_total,
|
||||||
|
},
|
||||||
|
"items": page_contents
|
||||||
}
|
}
|
||||||
|
|
||||||
business_logger.info(f"成功获取RAG内容: total={total_chunks}, 返回={len(result['contents'])} 条")
|
business_logger.info(f"成功获取RAG内容: total={global_total}, page={page}, 返回={len(page_contents)} 条")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user