[modify] rag file system

2026-04-27 16:05:27 +08:00
parent 3b359df02f
commit 30cdf229de
8 changed files with 228 additions and 348 deletions
--- a/api/app/tasks.py
+++ b/api/app/tasks.py
@@ -210,9 +210,14 @@ def _build_vision_model(file_path: str, db_knowledge):


@celery_app.task(name="app.core.rag.tasks.parse_document")
-def parse_document(file_path: str, document_id: uuid.UUID):
+def parse_document(file_key: str, document_id: uuid.UUID, file_name: str = ""):
    """
-    Document parsing, vectorization, and storage
+    Document parsing, vectorization, and storage.
+    
+    Args:
+        file_key: Storage key for FileStorageService (e.g. "kb/{kb_id}/{file_id}.docx")
+        document_id: Document UUID
+        file_name: Original file name (used for extension detection in chunk())
    """

    db_document = None
@@ -223,7 +228,6 @@ def parse_document(file_path: str, document_id: uuid.UUID):

    with get_db_context() as db:
      try:
-        # Celery JSON 序列化会将 UUID 转为字符串，需要确保类型正确
        if not isinstance(document_id, uuid.UUID):
            document_id = uuid.UUID(str(document_id))

@@ -234,7 +238,11 @@ def parse_document(file_path: str, document_id: uuid.UUID):
        if db_knowledge is None:
            raise ValueError(f"Knowledge {db_document.kb_id} not found")

-        # 1. Document parsing & segmentation
+        # Use file_name from argument or fall back to document record
+        if not file_name:
+            file_name = db_document.file_name
+
+        # 1. Download file from storage backend
        progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} Start to parse.")
        start_time = time.time()
        db_document.progress = 0.0
@@ -245,14 +253,36 @@ def parse_document(file_path: str, document_id: uuid.UUID):
        db.commit()
        db.refresh(db_document)

+        # Read file content from storage backend (no NFS dependency)
+        from app.services.file_storage_service import FileStorageService
+        import asyncio
+        storage_service = FileStorageService()
+
+        async def _download():
+            return await storage_service.download_file(file_key)
+
+        try:
+            file_binary = asyncio.run(_download())
+        except RuntimeError:
+            # If there's already a running loop (e.g. in some worker configurations)
+            loop = asyncio.new_event_loop()
+            try:
+                file_binary = loop.run_until_complete(_download())
+            finally:
+                loop.close()
+        if not file_binary:
+            raise IOError(f"Downloaded empty file from storage: {file_key}")
+        logger.info(f"[ParseDoc] Downloaded {len(file_binary)} bytes from storage key: {file_key}")
+
        def progress_callback(prog=None, msg=None):
            progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} parse progress: {prog} msg: {msg}.")

        # Prepare vision_model for parsing
-        vision_model = _build_vision_model(file_path, db_knowledge)
+        vision_model = _build_vision_model(file_name, db_knowledge)

        from app.core.rag.app.naive import chunk
-        res = chunk(filename=file_path,
+        res = chunk(filename=file_name,
+                    binary=file_binary,
                    from_page=0,
                    to_page=DEFAULT_PARSE_TO_PAGE,
                    callback=progress_callback,