From c53fcf3981cc7ded5005de8fe04730b1ccd662c0 Mon Sep 17 00:00:00 2001 From: Mark <348207283@qq.com> Date: Mon, 27 Apr 2026 17:10:00 +0800 Subject: [PATCH] [fix] old code file_path --- api/app/tasks.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/api/app/tasks.py b/api/app/tasks.py index 578a0e8d..3ad1a0dd 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -280,39 +280,9 @@ def parse_document(file_key: str, document_id: uuid.UUID, file_name: str = ""): # Prepare vision_model for parsing vision_model = _build_vision_model(file_name, db_knowledge) - # 先将文件读入内存,避免解析过程中依赖 NFS 文件持续可访问 - # python-docx 等库在 binary=None 时会用路径直接打开文件, - # 在 NFS/共享存储上可能因缓存失效导致 "Package not found" - max_wait_seconds = 30 - wait_interval = 2 - waited = 0 - file_binary = None - while waited <= max_wait_seconds: - # os.listdir 强制 NFS 客户端刷新目录缓存 - parent_dir = os.path.dirname(file_path) - try: - os.listdir(parent_dir) - except OSError: - pass - try: - with open(file_path, "rb") as f: - file_binary = f.read() - if not file_binary: - # NFS 上文件存在但内容为空(可能还在同步中) - raise IOError(f"File is empty (0 bytes), NFS may still be syncing: {file_path}") - break - except (FileNotFoundError, IOError) as e: - if waited >= max_wait_seconds: - raise type(e)( - f"File not accessible at '{file_path}' after waiting {max_wait_seconds}s: {e}" - ) - logger.warning(f"File not ready on this node, retrying in {wait_interval}s: {file_path} ({e})") - time.sleep(wait_interval) - waited += wait_interval - from app.core.rag.app.naive import chunk logger.info(f"[ParseDoc] file_binary size={len(file_binary)} bytes, type={type(file_binary).__name__}, bool={bool(file_binary)}") - res = chunk(filename=file_path, + res = chunk(filename=file_name, binary=file_binary, from_page=0, to_page=DEFAULT_PARSE_TO_PAGE,