[modify] rag file system
This commit is contained in:
@@ -210,9 +210,14 @@ def _build_vision_model(file_path: str, db_knowledge):
|
||||
|
||||
|
||||
@celery_app.task(name="app.core.rag.tasks.parse_document")
|
||||
def parse_document(file_path: str, document_id: uuid.UUID):
|
||||
def parse_document(file_key: str, document_id: uuid.UUID, file_name: str = ""):
|
||||
"""
|
||||
Document parsing, vectorization, and storage
|
||||
Document parsing, vectorization, and storage.
|
||||
|
||||
Args:
|
||||
file_key: Storage key for FileStorageService (e.g. "kb/{kb_id}/{file_id}.docx")
|
||||
document_id: Document UUID
|
||||
file_name: Original file name (used for extension detection in chunk())
|
||||
"""
|
||||
|
||||
db_document = None
|
||||
@@ -223,7 +228,6 @@ def parse_document(file_path: str, document_id: uuid.UUID):
|
||||
|
||||
with get_db_context() as db:
|
||||
try:
|
||||
# Celery JSON 序列化会将 UUID 转为字符串,需要确保类型正确
|
||||
if not isinstance(document_id, uuid.UUID):
|
||||
document_id = uuid.UUID(str(document_id))
|
||||
|
||||
@@ -234,7 +238,11 @@ def parse_document(file_path: str, document_id: uuid.UUID):
|
||||
if db_knowledge is None:
|
||||
raise ValueError(f"Knowledge {db_document.kb_id} not found")
|
||||
|
||||
# 1. Document parsing & segmentation
|
||||
# Use file_name from argument or fall back to document record
|
||||
if not file_name:
|
||||
file_name = db_document.file_name
|
||||
|
||||
# 1. Download file from storage backend
|
||||
progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} Start to parse.")
|
||||
start_time = time.time()
|
||||
db_document.progress = 0.0
|
||||
@@ -245,14 +253,36 @@ def parse_document(file_path: str, document_id: uuid.UUID):
|
||||
db.commit()
|
||||
db.refresh(db_document)
|
||||
|
||||
# Read file content from storage backend (no NFS dependency)
|
||||
from app.services.file_storage_service import FileStorageService
|
||||
import asyncio
|
||||
storage_service = FileStorageService()
|
||||
|
||||
async def _download():
|
||||
return await storage_service.download_file(file_key)
|
||||
|
||||
try:
|
||||
file_binary = asyncio.run(_download())
|
||||
except RuntimeError:
|
||||
# If there's already a running loop (e.g. in some worker configurations)
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
file_binary = loop.run_until_complete(_download())
|
||||
finally:
|
||||
loop.close()
|
||||
if not file_binary:
|
||||
raise IOError(f"Downloaded empty file from storage: {file_key}")
|
||||
logger.info(f"[ParseDoc] Downloaded {len(file_binary)} bytes from storage key: {file_key}")
|
||||
|
||||
def progress_callback(prog=None, msg=None):
|
||||
progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} parse progress: {prog} msg: {msg}.")
|
||||
|
||||
# Prepare vision_model for parsing
|
||||
vision_model = _build_vision_model(file_path, db_knowledge)
|
||||
vision_model = _build_vision_model(file_name, db_knowledge)
|
||||
|
||||
from app.core.rag.app.naive import chunk
|
||||
res = chunk(filename=file_path,
|
||||
res = chunk(filename=file_name,
|
||||
binary=file_binary,
|
||||
from_page=0,
|
||||
to_page=DEFAULT_PARSE_TO_PAGE,
|
||||
callback=progress_callback,
|
||||
|
||||
Reference in New Issue
Block a user