Compare commits
17 Commits
fix/sandbo ... feature/ra

| Author | SHA1 | Date |
|---|---|---|
| | f8d1ed51a7 | |
| | 9fa83ed01e | |
| | e222490bce | |
| | ad2e885f72 | |
| | 70c6d161c8 | |
| | f85c0594c9 | |
| | 5fceba54b4 | |
| | 6e89302cb2 | |
| | 90aa4cef21 | |
| | 6c47bb77ab | |
| | f667936664 | |
| | 64e640d882 | |
| | 140311048a | |
| | 4bef9b578b | |
| | c53fcf3981 | |
| | 2997558bc8 | |
| | 30cdf229de | |
@@ -1,8 +1,10 @@
import os
import csv
import io
from typing import Any, Optional
import uuid

from fastapi import APIRouter, Depends, HTTPException, status, Query
from fastapi import APIRouter, Depends, HTTPException, status, Query, UploadFile, File
from fastapi.encoders import jsonable_encoder
from sqlalchemy.orm import Session

@@ -23,6 +25,7 @@ from app.models.user_model import User
from app.schemas import chunk_schema
from app.schemas.response_schema import ApiResponse
from app.services import knowledge_service, document_service, file_service, knowledgeshare_service
from app.services.file_storage_service import FileStorageService, get_file_storage_service, generate_kb_file_key
from app.services.model_service import ModelApiKeyService

# Obtain a dedicated API logger
@@ -82,19 +85,32 @@ async def get_preview_chunks(
detail="The file does not exist or you do not have permission to access it"
)

# 5. Construct file path:/files/{kb_id}/{parent_id}/{file.id}{file.file_ext}
file_path = os.path.join(
settings.FILE_PATH,
str(db_file.kb_id),
str(db_file.parent_id),
f"{db_file.id}{db_file.file_ext}"
)

# 6. Check if the file exists
if not os.path.exists(file_path):
# 5. Get file content from storage backend
if not db_file.file_key:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="File not found (possibly deleted)"
detail="File has no storage key (legacy data not migrated)"
)

from app.services.file_storage_service import FileStorageService
import asyncio
storage_service = FileStorageService()

async def _download():
    return await storage_service.download_file(db_file.file_key)

try:
file_binary = asyncio.run(_download())
except RuntimeError:
loop = asyncio.new_event_loop()
try:
file_binary = loop.run_until_complete(_download())
finally:
loop.close()
except Exception as e:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"File not found in storage: {e}"
)

# 7. Document parsing & segmentation
@@ -104,11 +120,12 @@ async def get_preview_chunks(
vision_model = QWenCV(
key=db_knowledge.image2text.api_keys[0].api_key,
model_name=db_knowledge.image2text.api_keys[0].model_name,
lang="Chinese", # Default to Chinese
lang="Chinese",
base_url=db_knowledge.image2text.api_keys[0].api_base
)
from app.core.rag.app.naive import chunk
res = chunk(filename=file_path,
res = chunk(filename=db_file.file_name,
binary=file_binary,
from_page=0,
to_page=5,
callback=progress_callback,
@@ -257,6 +274,9 @@ async def create_chunk(
"sort_id": sort_id,
"status": 1,
}
# QA chunk: inject chunk_type/question/answer into the metadata
if create_data.is_qa:
metadata.update(create_data.qa_metadata)
chunk = DocumentChunk(page_content=content, metadata=metadata)
# 3. Segmented vector storage
vector_service.add_chunks([chunk])
@@ -268,6 +288,187 @@ async def create_chunk(
return success(data=jsonable_encoder(chunk), msg="Document chunk creation successful")


@router.post("/{kb_id}/{document_id}/chunk/batch", response_model=ApiResponse)
async def create_chunks_batch(
kb_id: uuid.UUID,
document_id: uuid.UUID,
batch_data: chunk_schema.ChunkBatchCreate,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
):
"""
Batch create chunks (max 8)
"""
api_logger.info(f"Batch create chunks: kb_id={kb_id}, document_id={document_id}, count={len(batch_data.items)}, username: {current_user.username}")

if len(batch_data.items) > settings.MAX_CHUNK_BATCH_SIZE:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Batch size exceeds limit: max {settings.MAX_CHUNK_BATCH_SIZE}, got {len(batch_data.items)}"
)

db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=kb_id, current_user=current_user)
if not db_knowledge:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="The knowledge base does not exist or access is denied")

db_document = db.query(Document).filter(Document.id == document_id).first()
if not db_document:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="The document does not exist or you do not have permission to access it")

vector_service = ElasticSearchVectorFactory().init_vector(knowledge=db_knowledge)

# Get current max sort_id
sort_id = 0
total, items = vector_service.search_by_segment(document_id=str(document_id), pagesize=1, page=1, asc=False)
if items:
sort_id = items[0].metadata["sort_id"]

chunks = []
for create_data in batch_data.items:
sort_id += 1
doc_id = uuid.uuid4().hex
metadata = {
"doc_id": doc_id,
"file_id": str(db_document.file_id),
"file_name": db_document.file_name,
"file_created_at": int(db_document.created_at.timestamp() * 1000),
"document_id": str(document_id),
"knowledge_id": str(kb_id),
"sort_id": sort_id,
"status": 1,
}
if create_data.is_qa:
metadata.update(create_data.qa_metadata)
chunks.append(DocumentChunk(page_content=create_data.chunk_content, metadata=metadata))

vector_service.add_chunks(chunks)

db_document.chunk_num += len(chunks)
db.commit()

return success(data=jsonable_encoder(chunks), msg=f"Batch created {len(chunks)} chunks successfully")

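For reference, a minimal client-side sketch of calling the new batch endpoint. The path segments and the `items` / `chunk_content` / `is_qa` / `qa_metadata` field names are taken from the handler above; the base URL, the router prefix, authentication, and the exact `chunk_schema.ChunkBatchCreate` schema are assumptions for illustration only.

```python
# Illustrative request only; the real schema lives in chunk_schema.ChunkBatchCreate,
# which this diff does not show, and the base URL/auth are placeholders.
import httpx

BASE_URL = "http://localhost:8000"        # assumed
kb_id = "00000000-0000-0000-0000-000000000001"       # placeholder knowledge base UUID
document_id = "00000000-0000-0000-0000-000000000002" # placeholder document UUID

payload = {
    "items": [
        {"chunk_content": "Plain text chunk", "is_qa": False},
        {
            "chunk_content": "Q: What is RAG?\nA: Retrieval-augmented generation.",
            "is_qa": True,
            "qa_metadata": {
                "chunk_type": "qa",
                "question": "What is RAG?",
                "answer": "Retrieval-augmented generation.",
            },
        },
    ]
}

resp = httpx.post(f"{BASE_URL}/{kb_id}/{document_id}/chunk/batch", json=payload)
resp.raise_for_status()
print(resp.json())
```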
@router.post("/{kb_id}/import_qa", response_model=ApiResponse)
|
||||
async def import_qa_new_doc(
|
||||
kb_id: uuid.UUID,
|
||||
file: UploadFile = File(..., description="CSV 或 Excel 文件(第一行标题跳过,第一列问题,第二列答案)"),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user),
|
||||
storage_service: FileStorageService = Depends(get_file_storage_service),
|
||||
):
|
||||
"""
|
||||
导入 QA 问答对并新建文档(CSV/Excel),异步处理
|
||||
"""
|
||||
from app.schemas import file_schema, document_schema
|
||||
|
||||
api_logger.info(f"Import QA (new doc): kb_id={kb_id}, file={file.filename}, username: {current_user.username}")
|
||||
|
||||
# 1. 校验文件格式
|
||||
filename = file.filename or ""
|
||||
if not (filename.endswith(".csv") or filename.endswith(".xlsx") or filename.endswith(".xls")):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="仅支持 CSV (.csv) 或 Excel (.xlsx) 格式")
|
||||
|
||||
# 2. 校验知识库
|
||||
db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=kb_id, current_user=current_user)
|
||||
if not db_knowledge:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="知识库不存在或无权访问")
|
||||
|
||||
# 3. 读取文件
|
||||
contents = await file.read()
|
||||
file_size = len(contents)
|
||||
if file_size == 0:
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="文件为空")
|
||||
|
||||
_, file_extension = os.path.splitext(filename)
|
||||
file_ext = file_extension.lower()
|
||||
|
||||
# 4. 创建 File 记录
|
||||
file_data = file_schema.FileCreate(
|
||||
kb_id=kb_id, created_by=current_user.id,
|
||||
parent_id=uuid.UUID("00000000-0000-0000-0000-000000000000"),
|
||||
file_name=filename, file_ext=file_ext, file_size=file_size,
|
||||
)
|
||||
db_file = file_service.create_file(db=db, file=file_data, current_user=current_user)
|
||||
|
||||
# 5. 上传文件到存储后端
|
||||
file_key = generate_kb_file_key(kb_id=kb_id, file_id=db_file.id, file_ext=file_ext)
|
||||
try:
|
||||
await storage_service.storage.upload(file_key=file_key, content=contents, content_type=file.content_type)
|
||||
except Exception as e:
|
||||
api_logger.error(f"Storage upload failed: {e}")
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"文件存储失败: {str(e)}")
|
||||
|
||||
db_file.file_key = file_key
|
||||
db.commit()
|
||||
db.refresh(db_file)
|
||||
|
||||
# 6. 创建 Document 记录(标记为 QA 类型)
|
||||
doc_data = document_schema.DocumentCreate(
|
||||
kb_id=kb_id, created_by=current_user.id, file_id=db_file.id,
|
||||
file_name=filename, file_ext=file_ext, file_size=file_size,
|
||||
file_meta={}, parser_id="qa",
|
||||
parser_config={"doc_type": "qa", "auto_questions": 0}
|
||||
)
|
||||
db_document = document_service.create_document(db=db, document=doc_data, current_user=current_user)
|
||||
|
||||
api_logger.info(f"Created doc for QA import: file_id={db_file.id}, document_id={db_document.id}, file_key={file_key}")
|
||||
|
||||
# 7. 派发异步任务
|
||||
from app.celery_app import celery_app
|
||||
task = celery_app.send_task(
|
||||
"app.core.rag.tasks.import_qa_chunks",
|
||||
args=[str(kb_id), str(db_document.id), filename, contents],
|
||||
queue="qa_import"
|
||||
)
|
||||
|
||||
return success(data={
|
||||
"task_id": task.id,
|
||||
"document_id": str(db_document.id),
|
||||
"file_id": str(db_file.id),
|
||||
}, msg="QA 导入任务已提交,后台处理中")
|
||||
|
||||
|
||||
@router.post("/{kb_id}/{document_id}/import_qa", response_model=ApiResponse)
|
||||
async def import_qa_chunks(
|
||||
kb_id: uuid.UUID,
|
||||
document_id: uuid.UUID,
|
||||
file: UploadFile = File(..., description="CSV 或 Excel 文件(第一行标题跳过,第一列问题,第二列答案)"),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
导入 QA 问答对(CSV/Excel),异步处理
|
||||
"""
|
||||
api_logger.info(f"Import QA chunks: kb_id={kb_id}, document_id={document_id}, file={file.filename}, username: {current_user.username}")
|
||||
|
||||
# 1. 校验文件格式
|
||||
filename = file.filename or ""
|
||||
if not (filename.endswith(".csv") or filename.endswith(".xlsx") or filename.endswith(".xls")):
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="仅支持 CSV (.csv) 或 Excel (.xlsx) 格式")
|
||||
|
||||
# 2. 校验知识库和文档
|
||||
db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=kb_id, current_user=current_user)
|
||||
if not db_knowledge:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="知识库不存在或无权访问")
|
||||
|
||||
db_document = db.query(Document).filter(Document.id == document_id).first()
|
||||
if not db_document:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="文档不存在或无权访问")
|
||||
|
||||
# 3. 读取文件内容,派发异步任务
|
||||
contents = await file.read()
|
||||
|
||||
from app.celery_app import celery_app
|
||||
task = celery_app.send_task(
|
||||
"app.core.rag.tasks.import_qa_chunks",
|
||||
args=[str(kb_id), str(document_id), filename, contents],
|
||||
queue="qa_import"
|
||||
)
|
||||
|
||||
return success(data={"task_id": task.id}, msg="QA 导入任务已提交,后台处理中")
|
||||
|
||||
|
||||
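Both import endpoints expect a spreadsheet whose header row is skipped and whose first two columns are the question and the answer. A minimal sketch of a matching CSV and of how it could be read with the csv/io modules that this diff adds to the imports; the real parsing happens in the `app.core.rag.tasks.import_qa_chunks` Celery task, which is not part of this diff, so this is illustrative only.

```python
# Illustrative only; the actual parser lives in the Celery task and may differ.
import csv
import io

sample = (
    "question,answer\n"            # header row, skipped
    "What is RAG?,Retrieval-augmented generation.\n"
    "Which queue handles imports?,qa_import\n"
)

pairs = []
reader = csv.reader(io.StringIO(sample))
next(reader, None)                 # skip the header row
for row in reader:
    if len(row) >= 2 and row[0].strip() and row[1].strip():
        pairs.append({"question": row[0].strip(), "answer": row[1].strip()})

print(pairs)
```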
@router.get("/{kb_id}/{document_id}/{doc_id}", response_model=ApiResponse)
|
||||
async def get_chunk(
|
||||
kb_id: uuid.UUID,
|
||||
@@ -328,6 +529,9 @@ async def update_chunk(
|
||||
if total:
|
||||
chunk = items[0]
|
||||
chunk.page_content = content
|
||||
# QA chunk: 更新 metadata 中的 question/answer
|
||||
if update_data.is_qa:
|
||||
chunk.metadata.update(update_data.qa_metadata)
|
||||
vector_service.update_by_segment(chunk)
|
||||
return success(data=jsonable_encoder(chunk), msg="The document chunk has been successfully updated")
|
||||
else:
|
||||
@@ -342,6 +546,7 @@ async def delete_chunk(
|
||||
kb_id: uuid.UUID,
|
||||
document_id: uuid.UUID,
|
||||
doc_id: str,
|
||||
force_refresh: bool = Query(False, description="Force Elasticsearch refresh after deletion"),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
@@ -359,7 +564,7 @@ async def delete_chunk(
|
||||
|
||||
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=db_knowledge)
|
||||
if vector_service.text_exists(doc_id):
|
||||
vector_service.delete_by_ids([doc_id])
|
||||
vector_service.delete_by_ids([doc_id], refresh=force_refresh)
|
||||
# 更新 chunk_num
|
||||
db_document = db.query(Document).filter(Document.id == document_id).first()
|
||||
db_document.chunk_num -= 1
|
||||
|
||||
@@ -20,6 +20,7 @@ from app.models.user_model import User
from app.schemas import document_schema
from app.schemas.response_schema import ApiResponse
from app.services import document_service, file_service, knowledge_service
from app.services.file_storage_service import FileStorageService, get_file_storage_service


# Obtain a dedicated API logger
@@ -231,7 +232,8 @@ async def update_document(
async def delete_document(
document_id: uuid.UUID,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
current_user: User = Depends(get_current_user),
storage_service: FileStorageService = Depends(get_file_storage_service),
):
"""
Delete document
@@ -257,7 +259,7 @@ async def delete_document(
db.commit()

# 3. Delete file
await file_controller._delete_file(db=db, file_id=file_id, current_user=current_user)
await file_controller._delete_file(db=db, file_id=file_id, current_user=current_user, storage_service=storage_service)

# 4. Delete vector index
db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=db_document.kb_id, current_user=current_user)
@@ -305,38 +307,25 @@ async def parse_documents(
detail="The file does not exist or you do not have permission to access it"
)

# 3. Construct file path:/files/{kb_id}/{parent_id}/{file.id}{file.file_ext}
file_path = os.path.join(
settings.FILE_PATH,
str(db_file.kb_id),
str(db_file.parent_id),
f"{db_file.id}{db_file.file_ext}"
)

# 4. Check if the file exists
api_logger.debug(f"Constructed file path: {file_path}")
api_logger.debug(f"File metadata - kb_id: {db_file.kb_id}, parent_id: {db_file.parent_id}, file_id: {db_file.id}, extension: {db_file.file_ext}")
if not os.path.exists(file_path):
api_logger.error(f"File not found (possibly deleted): file_path={file_path}, file_id={db_file.id}, document_id={document_id}")
# 3. Get file_key for storage backend
if not db_file.file_key:
api_logger.error(f"File has no storage key (legacy data not migrated): file_id={db_file.id}")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="File not found (possibly deleted)"
detail="File has no storage key (legacy data not migrated)"
)

# 5. Obtain knowledge base information
api_logger.info( f"Obtain details of the knowledge base: knowledge_id={db_document.kb_id}")
# 4. Obtain knowledge base information
api_logger.info(f"Obtain details of the knowledge base: knowledge_id={db_document.kb_id}")
db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=db_document.kb_id, current_user=current_user)
if not db_knowledge:
api_logger.warning(f"The knowledge base does not exist or access is denied: knowledge_id={db_document.kb_id}")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="The knowledge base does not exist or access is denied"
)
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Knowledge base not found")

# 6. Task: Document parsing, vectorization, and storage
# from app.tasks import parse_document
# parse_document(file_path, document_id)
task = celery_app.send_task("app.core.rag.tasks.parse_document", args=[file_path, document_id])
# 5. Dispatch parse task with file_key (not file_path)
task = celery_app.send_task(
"app.core.rag.tasks.parse_document",
args=[db_file.file_key, document_id, db_file.file_name]
)
result = {
"task_id": task.id
}

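The dispatch above now passes `db_file.file_key` and `db_file.file_name` instead of a local path, so the Celery worker is expected to fetch the bytes from the storage backend itself. A rough sketch of what the receiving task might look like; the actual task body in `app.core.rag.tasks` is not part of this diff, so the signature and download flow here are assumptions.

```python
# Sketch of the consumer side only; the real implementation is not shown in this PR.
import asyncio

from app.celery_app import celery_app
from app.services.file_storage_service import FileStorageService


@celery_app.task(name="app.core.rag.tasks.parse_document")
def parse_document(file_key: str, document_id: str, file_name: str):
    # Workers run synchronous code, so the async storage client is driven with
    # asyncio.run; download_file(file_key) is the coroutine used elsewhere in this PR.
    storage = FileStorageService()
    file_binary = asyncio.run(storage.download_file(file_key))
    # ...parse file_binary (file_name selects the parser), then index the chunks.
```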
@@ -1,12 +1,10 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
from typing import Any, Optional
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status, File, UploadFile, Query
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.responses import Response
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
@@ -19,10 +17,14 @@ from app.models.user_model import User
|
||||
from app.schemas import file_schema, document_schema
|
||||
from app.schemas.response_schema import ApiResponse
|
||||
from app.services import file_service, document_service
|
||||
from app.services.knowledge_service import get_knowledge_by_id as get_kb_by_id
|
||||
from app.services.file_storage_service import (
|
||||
FileStorageService,
|
||||
generate_kb_file_key,
|
||||
get_file_storage_service,
|
||||
)
|
||||
from app.core.quota_stub import check_knowledge_capacity_quota
|
||||
|
||||
|
||||
# Obtain a dedicated API logger
|
||||
api_logger = get_api_logger()
|
||||
|
||||
router = APIRouter(
|
||||
@@ -35,67 +37,37 @@ router = APIRouter(
|
||||
async def get_files(
|
||||
kb_id: uuid.UUID,
|
||||
parent_id: uuid.UUID,
|
||||
page: int = Query(1, gt=0), # Default: 1, which must be greater than 0
|
||||
pagesize: int = Query(20, gt=0, le=100), # Default: 20 items per page, maximum: 100 items
|
||||
page: int = Query(1, gt=0),
|
||||
pagesize: int = Query(20, gt=0, le=100),
|
||||
orderby: Optional[str] = Query(None, description="Sort fields, such as: created_at"),
|
||||
desc: Optional[bool] = Query(False, description="Is it descending order"),
|
||||
keywords: Optional[str] = Query(None, description="Search keywords (file name)"),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Paged query file list
|
||||
- Support filtering by kb_id and parent_id
|
||||
- Support keyword search for file names
|
||||
- Support dynamic sorting
|
||||
- Return paging metadata + file list
|
||||
"""
|
||||
api_logger.info(f"Query file list: kb_id={kb_id}, parent_id={parent_id}, page={page}, pagesize={pagesize}, keywords={keywords}, username: {current_user.username}")
|
||||
# 1. parameter validation
|
||||
if page < 1 or pagesize < 1:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="The paging parameter must be greater than 0"
|
||||
)
|
||||
"""Paged query file list"""
|
||||
api_logger.info(f"Query file list: kb_id={kb_id}, parent_id={parent_id}, page={page}, pagesize={pagesize}")
|
||||
|
||||
# 2. Construct query conditions
|
||||
filters = [
|
||||
file_model.File.kb_id == kb_id
|
||||
]
|
||||
if page < 1 or pagesize < 1:
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="The paging parameter must be greater than 0")
|
||||
|
||||
filters = [file_model.File.kb_id == kb_id]
|
||||
if parent_id:
|
||||
filters.append(file_model.File.parent_id == parent_id)
|
||||
# Keyword search (fuzzy matching of file name)
|
||||
if keywords:
|
||||
filters.append(file_model.File.file_name.ilike(f"%{keywords}%"))
|
||||
|
||||
# 3. Execute paged query
|
||||
try:
|
||||
api_logger.debug("Start executing file paging query")
|
||||
total, items = file_service.get_files_paginated(
|
||||
db=db,
|
||||
filters=filters,
|
||||
page=page,
|
||||
pagesize=pagesize,
|
||||
orderby=orderby,
|
||||
desc=desc,
|
||||
current_user=current_user
|
||||
db=db, filters=filters, page=page, pagesize=pagesize,
|
||||
orderby=orderby, desc=desc, current_user=current_user
|
||||
)
|
||||
api_logger.info(f"File query successful: total={total}, returned={len(items)} records")
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Query failed: {str(e)}"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Query failed: {str(e)}")
|
||||
|
||||
# 4. Return structured response
|
||||
result = {
|
||||
"items": items,
|
||||
"page": {
|
||||
"page": page,
|
||||
"pagesize": pagesize,
|
||||
"total": total,
|
||||
"has_next": True if page * pagesize < total else False
|
||||
}
|
||||
"page": {"page": page, "pagesize": pagesize, "total": total, "has_next": page * pagesize < total}
|
||||
}
|
||||
return success(data=jsonable_encoder(result), msg="Query of file list succeeded")
|
||||
|
||||
@@ -108,23 +80,14 @@ async def create_folder(
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user),
|
||||
):
|
||||
"""
|
||||
Create a new folder
|
||||
"""
|
||||
api_logger.info(f"Create folder request: kb_id={kb_id}, parent_id={parent_id}, folder_name={folder_name}, username: {current_user.username}")
|
||||
|
||||
"""Create a new folder"""
|
||||
api_logger.info(f"Create folder request: kb_id={kb_id}, parent_id={parent_id}, folder_name={folder_name}")
|
||||
try:
|
||||
api_logger.debug(f"Start creating a folder: {folder_name}")
|
||||
create_folder = file_schema.FileCreate(
|
||||
kb_id=kb_id,
|
||||
created_by=current_user.id,
|
||||
parent_id=parent_id,
|
||||
file_name=folder_name,
|
||||
file_ext='folder',
|
||||
file_size=0,
|
||||
create_folder_data = file_schema.FileCreate(
|
||||
kb_id=kb_id, created_by=current_user.id, parent_id=parent_id,
|
||||
file_name=folder_name, file_ext='folder', file_size=0,
|
||||
)
|
||||
db_file = file_service.create_file(db=db, file=create_folder, current_user=current_user)
|
||||
api_logger.info(f"Folder created successfully: {db_file.file_name} (ID: {db_file.id})")
|
||||
db_file = file_service.create_file(db=db, file=create_folder_data, current_user=current_user)
|
||||
return success(data=jsonable_encoder(file_schema.File.model_validate(db_file)), msg="Folder creation successful")
|
||||
except Exception as e:
|
||||
api_logger.error(f"Folder creation failed: {folder_name} - {str(e)}")
|
||||
@@ -138,76 +101,58 @@ async def upload_file(
|
||||
parent_id: uuid.UUID,
|
||||
file: UploadFile = File(...),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
current_user: User = Depends(get_current_user),
|
||||
storage_service: FileStorageService = Depends(get_file_storage_service),
|
||||
):
|
||||
"""
|
||||
upload file
|
||||
"""
|
||||
api_logger.info(f"upload file request: kb_id={kb_id}, parent_id={parent_id}, filename={file.filename}, username: {current_user.username}")
|
||||
"""Upload file to storage backend"""
|
||||
api_logger.info(f"upload file request: kb_id={kb_id}, parent_id={parent_id}, filename={file.filename}")
|
||||
|
||||
# Read the contents of the file
|
||||
contents = await file.read()
|
||||
# Check file size
|
||||
file_size = len(contents)
|
||||
print(f"file size: {file_size} byte")
|
||||
if file_size == 0:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="The file is empty."
|
||||
)
|
||||
# If the file size exceeds 50MB (50 * 1024 * 1024 bytes)
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="The file is empty.")
|
||||
if file_size > settings.MAX_FILE_SIZE:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"The file size exceeds the {settings.MAX_FILE_SIZE}byte limit"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"File size exceeds {settings.MAX_FILE_SIZE} byte limit")
|
||||
|
||||
# Extract the extension using `os.path.splitext`
|
||||
_, file_extension = os.path.splitext(file.filename)
|
||||
upload_file = file_schema.FileCreate(
|
||||
kb_id=kb_id,
|
||||
created_by=current_user.id,
|
||||
parent_id=parent_id,
|
||||
file_name=file.filename,
|
||||
file_ext=file_extension.lower(),
|
||||
file_size=file_size,
|
||||
file_ext = file_extension.lower()
|
||||
|
||||
# Create File record
|
||||
upload_file_data = file_schema.FileCreate(
|
||||
kb_id=kb_id, created_by=current_user.id, parent_id=parent_id,
|
||||
file_name=file.filename, file_ext=file_ext, file_size=file_size,
|
||||
)
|
||||
db_file = file_service.create_file(db=db, file=upload_file, current_user=current_user)
|
||||
db_file = file_service.create_file(db=db, file=upload_file_data, current_user=current_user)
|
||||
|
||||
# Construct a save path:/files/{kb_id}/{parent_id}/{file.id}{file_extension}
|
||||
save_dir = os.path.join(settings.FILE_PATH, str(kb_id), str(parent_id))
|
||||
Path(save_dir).mkdir(parents=True, exist_ok=True) # Ensure that the directory exists
|
||||
save_path = os.path.join(save_dir, f"{db_file.id}{db_file.file_ext}")
|
||||
# Upload to storage backend
|
||||
file_key = generate_kb_file_key(kb_id=kb_id, file_id=db_file.id, file_ext=file_ext)
|
||||
try:
|
||||
await storage_service.storage.upload(file_key=file_key, content=contents, content_type=file.content_type)
|
||||
except Exception as e:
|
||||
api_logger.error(f"Storage upload failed: {e}")
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"File storage failed: {str(e)}")
|
||||
|
||||
# Save file
|
||||
with open(save_path, "wb") as f:
|
||||
f.write(contents)
|
||||
# Save file_key
|
||||
db_file.file_key = file_key
|
||||
db.commit()
|
||||
db.refresh(db_file)
|
||||
|
||||
# Verify whether the file has been saved successfully
|
||||
if not os.path.exists(save_path):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="File save failed"
|
||||
)
|
||||
# Create document (inherit parser_config from knowledge base)
|
||||
default_parser_config = {
|
||||
"layout_recognize": "DeepDOC", "chunk_token_num": 128, "delimiter": "\n",
|
||||
"auto_keywords": 0, "auto_questions": 0, "html4excel": "false"
|
||||
}
|
||||
try:
|
||||
db_knowledge = get_kb_by_id(db, knowledge_id=kb_id, current_user=current_user)
|
||||
if db_knowledge and db_knowledge.parser_config:
|
||||
default_parser_config.update(dict(db_knowledge.parser_config))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Create a document
|
||||
create_data = document_schema.DocumentCreate(
|
||||
kb_id=kb_id,
|
||||
created_by=current_user.id,
|
||||
file_id=db_file.id,
|
||||
file_name=db_file.file_name,
|
||||
file_ext=db_file.file_ext,
|
||||
file_size=db_file.file_size,
|
||||
file_meta={},
|
||||
parser_id="naive",
|
||||
parser_config={
|
||||
"layout_recognize": "DeepDOC",
|
||||
"chunk_token_num": 128,
|
||||
"delimiter": "\n",
|
||||
"auto_keywords": 0,
|
||||
"auto_questions": 0,
|
||||
"html4excel": "false"
|
||||
}
|
||||
kb_id=kb_id, created_by=current_user.id, file_id=db_file.id,
|
||||
file_name=db_file.file_name, file_ext=db_file.file_ext, file_size=db_file.file_size,
|
||||
file_meta={}, parser_id="naive", parser_config=default_parser_config
|
||||
)
|
||||
db_document = document_service.create_document(db=db, document=create_data, current_user=current_user)
|
||||
|
||||
@@ -221,123 +166,73 @@ async def custom_text(
|
||||
parent_id: uuid.UUID,
|
||||
create_data: file_schema.CustomTextFileCreate,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
current_user: User = Depends(get_current_user),
|
||||
storage_service: FileStorageService = Depends(get_file_storage_service),
|
||||
):
|
||||
"""
|
||||
custom text
|
||||
"""
|
||||
api_logger.info(f"custom text upload request: kb_id={kb_id}, parent_id={parent_id}, title={create_data.title}, content={create_data.content}, username: {current_user.username}")
|
||||
|
||||
# Check file content size
|
||||
# 将内容编码为字节(UTF-8)
|
||||
"""Custom text upload"""
|
||||
content_bytes = create_data.content.encode('utf-8')
|
||||
file_size = len(content_bytes)
|
||||
print(f"file size: {file_size} byte")
|
||||
if file_size == 0:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="The content is empty."
|
||||
)
|
||||
# If the file size exceeds 50MB (50 * 1024 * 1024 bytes)
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="The content is empty.")
|
||||
if file_size > settings.MAX_FILE_SIZE:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"The content size exceeds the {settings.MAX_FILE_SIZE}byte limit"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Content size exceeds {settings.MAX_FILE_SIZE} byte limit")
|
||||
|
||||
upload_file = file_schema.FileCreate(
|
||||
kb_id=kb_id,
|
||||
created_by=current_user.id,
|
||||
parent_id=parent_id,
|
||||
file_name=f"{create_data.title}.txt",
|
||||
file_ext=".txt",
|
||||
file_size=file_size,
|
||||
upload_file_data = file_schema.FileCreate(
|
||||
kb_id=kb_id, created_by=current_user.id, parent_id=parent_id,
|
||||
file_name=f"{create_data.title}.txt", file_ext=".txt", file_size=file_size,
|
||||
)
|
||||
db_file = file_service.create_file(db=db, file=upload_file, current_user=current_user)
|
||||
db_file = file_service.create_file(db=db, file=upload_file_data, current_user=current_user)
|
||||
|
||||
# Construct a save path:/files/{kb_id}/{parent_id}/{file.id}{file_extension}
|
||||
save_dir = os.path.join(settings.FILE_PATH, str(kb_id), str(parent_id))
|
||||
Path(save_dir).mkdir(parents=True, exist_ok=True) # Ensure that the directory exists
|
||||
save_path = os.path.join(save_dir, f"{db_file.id}.txt")
|
||||
# Upload to storage backend
|
||||
file_key = generate_kb_file_key(kb_id=kb_id, file_id=db_file.id, file_ext=".txt")
|
||||
try:
|
||||
await storage_service.storage.upload(file_key=file_key, content=content_bytes, content_type="text/plain")
|
||||
except Exception as e:
|
||||
api_logger.error(f"Storage upload failed: {e}")
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"File storage failed: {str(e)}")
|
||||
|
||||
# Save file
|
||||
with open(save_path, "wb") as f:
|
||||
f.write(content_bytes)
|
||||
db_file.file_key = file_key
|
||||
db.commit()
|
||||
db.refresh(db_file)
|
||||
|
||||
# Verify whether the file has been saved successfully
|
||||
if not os.path.exists(save_path):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="File save failed"
|
||||
)
|
||||
|
||||
# Create a document
|
||||
create_document_data = document_schema.DocumentCreate(
|
||||
kb_id=kb_id,
|
||||
created_by=current_user.id,
|
||||
file_id=db_file.id,
|
||||
file_name=db_file.file_name,
|
||||
file_ext=db_file.file_ext,
|
||||
file_size=db_file.file_size,
|
||||
file_meta={},
|
||||
parser_id="naive",
|
||||
parser_config={
|
||||
"layout_recognize": "DeepDOC",
|
||||
"chunk_token_num": 128,
|
||||
"delimiter": "\n",
|
||||
"auto_keywords": 0,
|
||||
"auto_questions": 0,
|
||||
"html4excel": "false"
|
||||
}
|
||||
kb_id=kb_id, created_by=current_user.id, file_id=db_file.id,
|
||||
file_name=db_file.file_name, file_ext=db_file.file_ext, file_size=db_file.file_size,
|
||||
file_meta={}, parser_id="naive",
|
||||
parser_config={"layout_recognize": "DeepDOC", "chunk_token_num": 128, "delimiter": "\n",
|
||||
"auto_keywords": 0, "auto_questions": 0, "html4excel": "false"}
|
||||
)
|
||||
db_document = document_service.create_document(db=db, document=create_document_data, current_user=current_user)
|
||||
|
||||
api_logger.info(f"custom text upload successfully: {create_data.title} (file_id: {db_file.id}, document_id: {db_document.id})")
|
||||
return success(data=jsonable_encoder(document_schema.Document.model_validate(db_document)), msg="custom text upload successful")
|
||||
|
||||
|
||||
@router.get("/{file_id}", response_model=Any)
|
||||
async def get_file(
|
||||
file_id: uuid.UUID,
|
||||
db: Session = Depends(get_db)
|
||||
db: Session = Depends(get_db),
|
||||
storage_service: FileStorageService = Depends(get_file_storage_service),
|
||||
) -> Any:
|
||||
"""
|
||||
Download the file based on the file_id
|
||||
- Query file information from the database
|
||||
- Construct the file path and check if it exists
|
||||
- Return a FileResponse to download the file
|
||||
"""
|
||||
api_logger.info(f"Download the file based on the file_id: file_id={file_id}")
|
||||
|
||||
# 1. Query file information from the database
|
||||
"""Download file by file_id"""
|
||||
db_file = file_service.get_file_by_id(db, file_id=file_id)
|
||||
if not db_file:
|
||||
api_logger.warning(f"The file does not exist or you do not have permission to access it: file_id={file_id}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="The file does not exist or you do not have permission to access it"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File not found")
|
||||
|
||||
# 2. Construct file path:/files/{kb_id}/{parent_id}/{file.id}{file.file_ext}
|
||||
file_path = os.path.join(
|
||||
settings.FILE_PATH,
|
||||
str(db_file.kb_id),
|
||||
str(db_file.parent_id),
|
||||
f"{db_file.id}{db_file.file_ext}"
|
||||
)
|
||||
if not db_file.file_key:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File has no storage key (legacy data not migrated)")
|
||||
|
||||
# 3. Check if the file exists
|
||||
if not os.path.exists(file_path):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="File not found (possibly deleted)"
|
||||
)
|
||||
try:
|
||||
content = await storage_service.download_file(db_file.file_key)
|
||||
except Exception as e:
|
||||
api_logger.error(f"Storage download failed: {e}")
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File not found in storage")
|
||||
|
||||
# 4.Return FileResponse (automatically handle download)
|
||||
return FileResponse(
|
||||
path=file_path,
|
||||
filename=db_file.file_name, # Use original file name
|
||||
media_type="application/octet-stream" # Universal binary stream type
|
||||
import mimetypes
|
||||
media_type = mimetypes.guess_type(db_file.file_name)[0] or "application/octet-stream"
|
||||
return Response(
|
||||
content=content,
|
||||
media_type=media_type,
|
||||
headers={"Content-Disposition": f'attachment; filename="{db_file.file_name}"'}
|
||||
)
|
||||
|
||||
|
||||
@@ -348,50 +243,22 @@ async def update_file(
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Update file information (such as file name)
|
||||
- Only specified fields such as file_name are allowed to be modified
|
||||
"""
|
||||
api_logger.debug(f"Query the file to be updated: {file_id}")
|
||||
|
||||
# 1. Check if the file exists
|
||||
"""Update file information (such as file name)"""
|
||||
db_file = file_service.get_file_by_id(db, file_id=file_id)
|
||||
|
||||
if not db_file:
|
||||
api_logger.warning(f"The file does not exist or you do not have permission to access it: file_id={file_id}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="The file does not exist or you do not have permission to access it"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File not found")
|
||||
|
||||
# 2. Update fields (only update non-null fields)
|
||||
api_logger.debug(f"Start updating the file fields: {file_id}")
|
||||
updated_fields = []
|
||||
for field, value in update_data.dict(exclude_unset=True).items():
|
||||
if hasattr(db_file, field):
|
||||
old_value = getattr(db_file, field)
|
||||
if old_value != value:
|
||||
# update value
|
||||
setattr(db_file, field, value)
|
||||
updated_fields.append(f"{field}: {old_value} -> {value}")
|
||||
setattr(db_file, field, value)
|
||||
|
||||
if updated_fields:
|
||||
api_logger.debug(f"updated fields: {', '.join(updated_fields)}")
|
||||
|
||||
# 3. Save to database
|
||||
try:
|
||||
db.commit()
|
||||
db.refresh(db_file)
|
||||
api_logger.info(f"The file has been successfully updated: {db_file.file_name} (ID: {db_file.id})")
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
api_logger.error(f"File update failed: file_id={file_id} - {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"File update failed: {str(e)}"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"File update failed: {str(e)}")
|
||||
|
||||
# 4. Return the updated file
|
||||
return success(data=jsonable_encoder(file_schema.File.model_validate(db_file)), msg="File information updated successfully")
|
||||
|
||||
|
||||
@@ -399,60 +266,43 @@ async def update_file(
|
||||
async def delete_file(
|
||||
file_id: uuid.UUID,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
current_user: User = Depends(get_current_user),
|
||||
storage_service: FileStorageService = Depends(get_file_storage_service),
|
||||
):
|
||||
"""
|
||||
Delete a file or folder
|
||||
"""
|
||||
api_logger.info(f"Request to delete file: file_id={file_id}, username: {current_user.username}")
|
||||
await _delete_file(db=db, file_id=file_id, current_user=current_user)
|
||||
"""Delete a file or folder"""
|
||||
api_logger.info(f"Request to delete file: file_id={file_id}")
|
||||
await _delete_file(db=db, file_id=file_id, current_user=current_user, storage_service=storage_service)
|
||||
return success(msg="File deleted successfully")
|
||||
|
||||
|
||||
async def _delete_file(
|
||||
file_id: uuid.UUID,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user)
|
||||
db: Session,
|
||||
current_user: User,
|
||||
storage_service: FileStorageService,
|
||||
) -> None:
|
||||
"""
|
||||
Delete a file or folder
|
||||
"""
|
||||
# 1. Check if the file exists
|
||||
"""Delete a file or folder from storage and database"""
|
||||
db_file = file_service.get_file_by_id(db, file_id=file_id)
|
||||
|
||||
if not db_file:
|
||||
api_logger.warning(f"The file does not exist or you do not have permission to access it: file_id={file_id}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="The file does not exist or you do not have permission to access it"
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File not found")
|
||||
|
||||
# 2. Construct physical path
|
||||
file_path = Path(
|
||||
settings.FILE_PATH,
|
||||
str(db_file.kb_id),
|
||||
str(db_file.id)
|
||||
) if db_file.file_ext == 'folder' else Path(
|
||||
settings.FILE_PATH,
|
||||
str(db_file.kb_id),
|
||||
str(db_file.parent_id),
|
||||
f"{db_file.id}{db_file.file_ext}"
|
||||
)
|
||||
|
||||
# 3. Delete physical files/folders
|
||||
try:
|
||||
if file_path.exists():
|
||||
if db_file.file_ext == 'folder':
|
||||
shutil.rmtree(file_path) # Recursively delete folders
|
||||
else:
|
||||
file_path.unlink() # Delete a single file
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Failed to delete physical file/folder: {str(e)}"
|
||||
)
|
||||
|
||||
# 4.Delete db_file
|
||||
# Delete from storage backend
|
||||
if db_file.file_ext == 'folder':
|
||||
# For folders, delete all child files from storage first
|
||||
child_files = db.query(file_model.File).filter(file_model.File.parent_id == db_file.id).all()
|
||||
for child in child_files:
|
||||
if child.file_key:
|
||||
try:
|
||||
await storage_service.delete_file(child.file_key)
|
||||
except Exception as e:
|
||||
api_logger.warning(f"Failed to delete child file from storage: {child.file_key} - {e}")
|
||||
db.query(file_model.File).filter(file_model.File.parent_id == db_file.id).delete()
|
||||
else:
|
||||
if db_file.file_key:
|
||||
try:
|
||||
await storage_service.delete_file(db_file.file_key)
|
||||
except Exception as e:
|
||||
api_logger.warning(f"Failed to delete file from storage: {db_file.file_key} - {e}")
|
||||
|
||||
db.delete(db_file)
|
||||
db.commit()
|
||||
|
||||
@@ -113,6 +113,33 @@ async def create_chunk(
current_user=current_user)


@router.post("/{kb_id}/{document_id}/chunk/batch", response_model=ApiResponse)
@require_api_key(scopes=["rag"])
async def create_chunks_batch(
kb_id: uuid.UUID,
document_id: uuid.UUID,
request: Request,
api_key_auth: ApiKeyAuth = None,
db: Session = Depends(get_db),
items: list = Body(..., description="chunk items list"),
):
"""
Batch create chunks (max 8)
"""
body = await request.json()
batch_data = chunk_schema.ChunkBatchCreate(**body)
# 0. Obtain the creator of the api key
api_key = api_key_service.ApiKeyService.get_api_key(db, api_key_auth.api_key_id, api_key_auth.workspace_id)
current_user = api_key.creator
current_user.current_workspace_id = api_key_auth.workspace_id

return await chunk_controller.create_chunks_batch(kb_id=kb_id,
document_id=document_id,
batch_data=batch_data,
db=db,
current_user=current_user)


@router.get("/{kb_id}/{document_id}/{doc_id}", response_model=ApiResponse)
@require_api_key(scopes=["rag"])
async def get_chunk(
@@ -176,6 +203,7 @@ async def delete_chunk(
request: Request,
api_key_auth: ApiKeyAuth = None,
db: Session = Depends(get_db),
force_refresh: bool = Query(False, description="Force Elasticsearch refresh after deletion"),
):
"""
delete document chunk
@@ -188,6 +216,7 @@ async def delete_chunk(
return await chunk_controller.delete_chunk(kb_id=kb_id,
document_id=document_id,
doc_id=doc_id,
force_refresh=force_refresh,
db=db,
current_user=current_user)

@@ -98,6 +98,7 @@ class Settings:
# File Upload
MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "52428800"))
MAX_FILE_COUNT: int = int(os.getenv("MAX_FILE_COUNT", "20"))
MAX_CHUNK_BATCH_SIZE: int = int(os.getenv("MAX_CHUNK_BATCH_SIZE", "8"))
FILE_PATH: str = os.getenv("FILE_PATH", "/files")
FILE_URL_EXPIRES: int = int(os.getenv("FILE_URL_EXPIRES", "3600"))

@@ -46,7 +46,10 @@ async def run_graphrag(
start = trio.current_time()
workspace_id, kb_id, document_id = row["workspace_id"], str(row["kb_id"]), row["document_id"]
chunks = []
for d in settings.retriever.chunk_list(document_id, workspace_id, [kb_id], fields=["page_content", "document_id"], sort_by_position=True):
for d in settings.retriever.chunk_list(document_id, workspace_id, [kb_id], fields=["page_content", "document_id", "chunk_type"], sort_by_position=True):
# Skip QA chunks; build the graph only from source-text chunks
if d.get("chunk_type") == "qa":
continue
chunks.append(d["page_content"])

with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000):
@@ -150,6 +153,9 @@ async def run_graphrag_for_kb(

total, items = vector_service.search_by_segment(document_id=str(document_id), query=None, pagesize=9999, page=1, asc=True)
for doc in items:
# Skip QA chunks; build the graph only from source-text chunks
if (doc.metadata or {}).get("chunk_type") == "qa":
continue
content = doc.page_content
if num_tokens_from_string(current_chunk + content) < 1024:
current_chunk += content

@@ -131,18 +131,52 @@ def keyword_extraction(chat_mdl, content, topn=3):


def question_proposal(chat_mdl, content, topn=3):
template = PROMPT_JINJA_ENV.from_string(QUESTION_PROMPT_TEMPLATE)
rendered_prompt = template.render(content=content, topn=topn)

msg = [{"role": "system", "content": rendered_prompt}, {"role": "user", "content": "Output: "}]
_, msg = message_fit_in(msg, getattr(chat_mdl, 'max_length', 8096))
kwd = chat_mdl.chat(rendered_prompt, msg[1:], {"temperature": 0.2})
if isinstance(kwd, tuple):
kwd = kwd[0]
kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
if kwd.find("**ERROR**") >= 0:
"""Generate questions (backward compatible; returns a plain-text list of questions)"""
pairs = qa_proposal(chat_mdl, content, topn)
if not pairs:
return ""
return kwd
return "\n".join([p["question"] for p in pairs])


def qa_proposal(chat_mdl, content, topn=3, custom_prompt=None):
"""Generate QA pairs and return [{"question": ..., "answer": ...}, ...]

Args:
chat_mdl: LLM model
content: text content
topn: number of QA pairs to generate
custom_prompt: custom prompt template (Jinja2 supported; available variables: content, topn)
"""
if custom_prompt:
template = PROMPT_JINJA_ENV.from_string(custom_prompt)
sys_prompt = template.render(topn=topn)
else:
sys_prompt = QUESTION_PROMPT_TEMPLATE
msg = [{"role": "system", "content": sys_prompt}, {"role": "user", "content": content}]
_, msg = message_fit_in(msg, getattr(chat_mdl, 'max_length', 8096))
raw = chat_mdl.chat(sys_prompt, msg[1:], {"temperature": 0.2})
if isinstance(raw, tuple):
raw = raw[0]
raw = re.sub(r"^.*</think>", "", raw, flags=re.DOTALL)
if raw.find("**ERROR**") >= 0:
return []
return parse_qa_pairs(raw)


def parse_qa_pairs(text: str) -> list:
"""Parse QA-pair text returned by the LLM; expected format per line: Q: xxx A: xxx"""
pairs = []
for line in text.strip().split("\n"):
line = line.strip()
if not line:
continue
# Match the "Q: ... A: ..." format
match = re.match(r'^Q:\s*(.+?)\s+A:\s*(.+)$', line, re.IGNORECASE)
if match:
q, a = match.group(1).strip(), match.group(2).strip()
if q and a:
pairs.append({"question": q, "answer": a})
return pairs


def graph_entity_types(chat_mdl, scenario):

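Since `parse_qa_pairs` is self-contained, its behaviour can be shown directly. A small usage sketch based only on the function defined above (the function body is repeated here verbatim just so the snippet runs on its own):

```python
import re

# Copy of parse_qa_pairs as defined in this diff, included only to make the example runnable.
def parse_qa_pairs(text: str) -> list:
    pairs = []
    for line in text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        match = re.match(r'^Q:\s*(.+?)\s+A:\s*(.+)$', line, re.IGNORECASE)
        if match:
            q, a = match.group(1).strip(), match.group(2).strip()
            if q and a:
                pairs.append({"question": q, "answer": a})
    return pairs

raw = (
    "Q: What is the capital of France? A: The capital of France is Paris.\n"
    "not a qa line\n"
    "Q: When was the Eiffel Tower built? A: In 1889."
)
print(parse_qa_pairs(raw))
# [{'question': 'What is the capital of France?', 'answer': 'The capital of France is Paris.'},
#  {'question': 'When was the Eiffel Tower built?', 'answer': 'In 1889.'}]
```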
@@ -1,19 +1,20 @@
## Role
You are a text analyzer.
You are a text analyzer and knowledge extraction expert.

## Task
Propose {{ topn }} questions about a given piece of text content.
Generate question-answer pairs from the given text content.

## Requirements
- Understand and summarize the text content, and propose the top {{ topn }} important questions.
- Understand and summarize the text content, then generate up to {{ topn }} important question-answer pairs.
- Each question-answer pair MUST be on a single line, formatted as: Q: <question> A: <answer>
- The questions SHOULD NOT have overlapping meanings.
- The questions SHOULD cover the main content of the text as much as possible.
- The questions MUST be in the same language as the given piece of text content.
- One question per line.
- Output questions ONLY.

---

## Text Content
{{ content }}
- The answers MUST be concise, accurate, and directly derived from the text content.
- The answers SHOULD be self-contained and understandable without additional context.
- Both questions and answers MUST be in the same language as the given text content.
- If the text is too short or lacks substantive content, generate fewer pairs rather than padding.
- Output question-answer pairs ONLY, no extra explanation or commentary.

## Example Output
Q: What is the capital of France? A: The capital of France is Paris.
Q: When was the Eiffel Tower built? A: The Eiffel Tower was built in 1889.

|
||||
6. Do NOT wrap the output in ```markdown or ``` blocks.
|
||||
7. Only apply Markdown structure to headings, paragraphs, lists, and tables, strictly based on the layout of the image. Do NOT create tables unless an actual table exists in the image.
|
||||
8. Preserve the original language, information, and order exactly as shown in the image.
|
||||
9. Your output language MUST match the language of the content in the image. If the image contains Chinese text, output in Chinese. If English, output in English. Never translate.
|
||||
|
||||
{% if page %}
|
||||
At the end of the transcription, add the page divider: `--- Page {{ page }} ---`.
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from elasticsearch import Elasticsearch, helpers
|
||||
from elasticsearch import Elasticsearch, helpers, NotFoundError
|
||||
from elasticsearch.helpers import BulkIndexError
|
||||
from packaging.version import parse as parse_version
|
||||
# langchain-community
|
||||
@@ -53,13 +53,30 @@ class ElasticSearchVector(BaseVector):
|
||||
return "elasticsearch"
|
||||
|
||||
def add_chunks(self, chunks: list[DocumentChunk], **kwargs):
|
||||
# 实现 Elasticsearch 保存向量
|
||||
texts = [chunk.page_content for chunk in chunks]
|
||||
# QA chunks: embedding 只对 question 字段做;source chunks: 不做 embedding
|
||||
texts_for_embedding = []
|
||||
for chunk in chunks:
|
||||
chunk_type = (chunk.metadata or {}).get("chunk_type", "chunk")
|
||||
if chunk_type == "source":
|
||||
# source chunk 不需要向量索引
|
||||
texts_for_embedding.append("")
|
||||
elif chunk_type == "qa":
|
||||
# QA chunk: 用 question 字段做 embedding
|
||||
texts_for_embedding.append((chunk.metadata or {}).get("question", chunk.page_content))
|
||||
else:
|
||||
# 普通 chunk: 用 page_content 做 embedding
|
||||
texts_for_embedding.append(chunk.page_content)
|
||||
|
||||
if self.is_multimodal_embedding:
|
||||
# 火山引擎多模态 Embedding
|
||||
embeddings = self.embeddings.embed_batch(texts)
|
||||
embeddings = self.embeddings.embed_batch(texts_for_embedding)
|
||||
else:
|
||||
embeddings = self.embeddings.embed_documents(list(texts))
|
||||
embeddings = self.embeddings.embed_documents(texts_for_embedding)
|
||||
|
||||
# source chunk 的向量置空
|
||||
for i, chunk in enumerate(chunks):
|
||||
if (chunk.metadata or {}).get("chunk_type") == "source":
|
||||
embeddings[i] = None
|
||||
|
||||
self.create(chunks, embeddings, **kwargs)
|
||||
|
||||
def create(self, chunks: list[DocumentChunk], embeddings: list[list[float]], **kwargs):
|
||||
@@ -72,13 +89,25 @@ class ElasticSearchVector(BaseVector):
|
||||
uuids = self._get_uuids(chunks)
|
||||
actions = []
|
||||
for i, chunk in enumerate(chunks):
|
||||
source = {
|
||||
Field.CONTENT_KEY.value: chunk.page_content,
|
||||
Field.METADATA_KEY.value: chunk.metadata or {},
|
||||
Field.VECTOR.value: embeddings[i] or None
|
||||
}
|
||||
# 写入 QA 相关字段
|
||||
meta = chunk.metadata or {}
|
||||
if meta.get("chunk_type"):
|
||||
source[Field.CHUNK_TYPE.value] = meta["chunk_type"]
|
||||
if meta.get("question"):
|
||||
source[Field.QUESTION.value] = meta["question"]
|
||||
if meta.get("answer"):
|
||||
source[Field.ANSWER.value] = meta["answer"]
|
||||
if meta.get("source_chunk_id"):
|
||||
source[Field.SOURCE_CHUNK_ID.value] = meta["source_chunk_id"]
|
||||
|
||||
action = {
|
||||
"_index": self._collection_name,
|
||||
"_source": {
|
||||
Field.CONTENT_KEY.value: chunk.page_content,
|
||||
Field.METADATA_KEY.value: chunk.metadata or {},
|
||||
Field.VECTOR.value: embeddings[i] or None
|
||||
}
|
||||
"_source": source
|
||||
}
|
||||
actions.append(action)
|
||||
# using bulk mode
|
||||
@@ -113,7 +142,7 @@ class ElasticSearchVector(BaseVector):
|
||||
|
||||
return True
|
||||
|
||||
def delete_by_ids(self, ids: list[str]):
|
||||
def delete_by_ids(self, ids: list[str], *, refresh: bool = False):
|
||||
if not ids:
|
||||
return
|
||||
if not self._client.indices.exists(index=self._collection_name):
|
||||
@@ -134,6 +163,8 @@ class ElasticSearchVector(BaseVector):
|
||||
actions = [{"_op_type": "delete", "_index": self._collection_name, "_id": es_id} for es_id in actual_ids]
|
||||
try:
|
||||
helpers.bulk(self._client, actions)
|
||||
if refresh:
|
||||
self._client.indices.refresh(index=self._collection_name)
|
||||
except BulkIndexError as e:
|
||||
for error in e.errors:
|
||||
delete_error = error.get('delete', {})
|
||||
@@ -153,7 +184,7 @@ class ElasticSearchVector(BaseVector):
|
||||
else:
|
||||
return None
|
||||
|
||||
def delete_by_metadata_field(self, key: str, value: str):
|
||||
def delete_by_metadata_field(self, key: str, value: str, *, refresh: bool = False):
|
||||
if not self._client.indices.exists(index=self._collection_name):
|
||||
return False
|
||||
actual_ids = self.get_ids_by_metadata_field(key, value)
|
||||
@@ -162,6 +193,8 @@ class ElasticSearchVector(BaseVector):
|
||||
actions = [{"_op_type": "delete", "_index": self._collection_name, "_id": es_id} for es_id in actual_ids]
|
||||
try:
|
||||
helpers.bulk(self._client, actions)
|
||||
if refresh:
|
||||
self._client.indices.refresh(index=self._collection_name)
|
||||
except BulkIndexError as e:
|
||||
for error in e.errors:
|
||||
delete_error = error.get('delete', {})
|
||||
@@ -192,6 +225,8 @@ class ElasticSearchVector(BaseVector):
|
||||
List of DocumentChunk objects that match the query.
|
||||
"""
|
||||
indices = kwargs.get("indices", self._collection_name) # Default single index, multiple indexes are also supported, such as "index1, index2, index3"
|
||||
if not self._client.indices.exists(index=indices):
|
||||
return 0, []
|
||||
|
||||
# Calculate the start position for the current page
|
||||
from_ = pagesize * (page-1)
|
||||
@@ -226,12 +261,15 @@ class ElasticSearchVector(BaseVector):
|
||||
})
|
||||
|
||||
# For simplicity, we use from/size here which has a limit (usually up to 10,000).
|
||||
result = self._client.search(
|
||||
index=indices,
|
||||
from_=from_, # Only use from_ for the first page (simplified)
|
||||
size=pagesize,
|
||||
body=query_str,
|
||||
)
|
||||
try:
|
||||
result = self._client.search(
|
||||
index=indices,
|
||||
from_=from_, # Only use from_ for the first page (simplified)
|
||||
size=pagesize,
|
||||
body=query_str,
|
||||
)
|
||||
except NotFoundError:
|
||||
return 0, []
|
||||
|
||||
if "errors" in result:
|
||||
raise ValueError(f"Error during query: {result['errors']}")
|
||||
@@ -241,10 +279,19 @@ class ElasticSearchVector(BaseVector):
|
||||
for res in result["hits"]["hits"]:
|
||||
source = res["_source"]
|
||||
page_content = source.get(Field.CONTENT_KEY.value)
|
||||
# vector = source.get(Field.VECTOR.value)
|
||||
vector = None
|
||||
metadata = source.get(Field.METADATA_KEY.value, {})
|
||||
chunk_type = source.get(Field.CHUNK_TYPE.value)
|
||||
score = res["_score"]
|
||||
|
||||
# 将 QA 字段注入 metadata 供前端展示
|
||||
if chunk_type:
|
||||
metadata["chunk_type"] = chunk_type
|
||||
if chunk_type == "qa":
|
||||
metadata["question"] = source.get(Field.QUESTION.value, "")
|
||||
metadata["answer"] = source.get(Field.ANSWER.value, "")
|
||||
page_content = f"Q: {metadata['question']}\nA: {metadata['answer']}"
|
||||
|
||||
docs_and_scores.append((DocumentChunk(page_content=page_content, vector=vector, metadata=metadata), score))
|
||||
|
||||
docs = []
|
||||
@@ -267,13 +314,18 @@ class ElasticSearchVector(BaseVector):
List of DocumentChunk objects that match the query.
"""
indices = kwargs.get("indices", self._collection_name) # Default single index, multiple indexes are also supported, such as "index1,index2,index3"
if not self._client.indices.exists(index=indices):
return 0, []
query_str = {"query": {"term": {f"{Field.DOC_ID.value}": doc_id}}}
result = self._client.search(
index=indices,
from_=0, # Only use from_ for the first page (simplified)
size=1,
body=query_str,
)
try:
result = self._client.search(
index=indices,
from_=0, # Only use from_ for the first page (simplified)
size=1,
body=query_str,
)
except NotFoundError:
return 0, []
# print(result)
if "errors" in result:
raise ValueError(f"Error during query: {result['errors']}")
@@ -308,27 +360,43 @@ class ElasticSearchVector(BaseVector):
Returns:
updated count.
"""
indices = kwargs.get("indices", self._collection_name) # Default single index, multiple indexes are also supported, such as "index1,index2,index3"
if self.is_multimodal_embedding:
# Volcano Engine multimodal embedding
chunk.vector = self.embeddings.embed_text(chunk.page_content)
indices = kwargs.get("indices", self._collection_name)
chunk_type = (chunk.metadata or {}).get("chunk_type")

# QA chunk: the embedding is based on the question; source chunk: do not update the vector
if chunk_type == "source":
embed_text = ""
elif chunk_type == "qa":
embed_text = (chunk.metadata or {}).get("question", chunk.page_content)
else:
chunk.vector = self.embeddings.embed_query(chunk.page_content)
embed_text = chunk.page_content

if chunk_type != "source":
if self.is_multimodal_embedding:
chunk.vector = self.embeddings.embed_text(embed_text)
else:
chunk.vector = self.embeddings.embed_query(embed_text)

script_source = "ctx._source.page_content = params.new_content; ctx._source.vector = params.new_vector;"
params = {
"new_content": chunk.page_content,
"new_vector": chunk.vector if chunk_type != "source" else None
}

# QA chunk: also update the question/answer fields
if chunk_type == "qa":
script_source += " ctx._source.question = params.new_question; ctx._source.answer = params.new_answer;"
params["new_question"] = (chunk.metadata or {}).get("question", "")
params["new_answer"] = (chunk.metadata or {}).get("answer", "")

body = {
"script": {
"source": """
ctx._source.page_content = params.new_content;
ctx._source.vector = params.new_vector;
""",
"params": {
"new_content": chunk.page_content,
"new_vector": chunk.vector
}
"source": script_source,
"params": params
},
"query": {
"term": {
Field.DOC_ID.value: chunk.metadata["doc_id"] # exact match doc_id
Field.DOC_ID.value: chunk.metadata["doc_id"]
}
}
}
@@ -336,9 +404,6 @@ class ElasticSearchVector(BaseVector):
index=indices,
body=body,
)
# Remove debug printing and use logging instead
# print(result)
# print(f"Update successful, number of affected documents: {result['updated']}")
return result['updated']

def change_status_by_document_id(self, document_id: str, status: int, **kwargs) -> str:
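For reference, a minimal standalone sketch of the update-by-query pattern used in this hunk, assuming the official elasticsearch Python client; the index name, doc_id value, and the "metadata.doc_id" field path are illustrative placeholders, not values confirmed by this branch:

```python
from elasticsearch import Elasticsearch

# Sketch: update page_content/vector of a single chunk matched by its doc_id.
client = Elasticsearch("http://localhost:9200")

script_source = (
    "ctx._source.page_content = params.new_content; "
    "ctx._source.vector = params.new_vector;"
)
params = {"new_content": "updated text", "new_vector": [0.1, 0.2, 0.3]}

body = {
    "script": {"source": script_source, "params": params},
    "query": {"term": {"metadata.doc_id": "example-doc-id"}},
}
result = client.update_by_query(index="chunks", body=body)
print(result["updated"])  # number of documents affected
```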
@@ -397,11 +462,11 @@ class ElasticSearchVector(BaseVector):
}
}
},
"filter": { # Add the filter condition of status=1
"term": {
"metadata.status": 1
}
}
"filter": [
{"term": {"metadata.status": 1}},
# Exclude source chunks (used only by GraphRAG, not part of retrieval)
{"bool": {"must_not": {"term": {Field.CHUNK_TYPE.value: "source"}}}}
]
}
}
# If file_names_filter is passed in, merge the filtering conditions
@@ -415,22 +480,14 @@ class ElasticSearchVector(BaseVector):
},
"script": {
"source": f"cosineSimilarity(params.query_vector, '{Field.VECTOR.value}') + 1.0",
# The script_score query calculates the cosine similarity between the embedding field of each document and the query vector. The addition of +1.0 is to ensure that the scores returned by the script are non-negative, as the range of cosine similarity is [-1, 1]
"params": {"query_vector": query_vector}
}
}
},
"filter": [
{
"term": {
"metadata.status": 1
}
},
{
"terms": {
"metadata.file_name": file_names_filter # Additional file_name filtering
}
}
{"term": {"metadata.status": 1}},
{"terms": {"metadata.file_name": file_names_filter}},
{"bool": {"must_not": {"term": {Field.CHUNK_TYPE.value: "source"}}}}
],
}
}
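Put together, the retrieval query this hunk builds looks roughly like the sketch below; the concrete field names ("vector", "chunk_type") and the sample values are assumptions based on the Field enum changes elsewhere in this branch, not a literal dump of the assembled body:

```python
# Sketch of the assembled query: script_score over the vector field, filtered to
# status=1 documents, matching file names, and excluding chunk_type == "source".
query_vector = [0.1, 0.2, 0.3]
file_names_filter = ["handbook.pdf"]

query_body = {
    "query": {
        "bool": {
            "must": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }
            },
            "filter": [
                {"term": {"metadata.status": 1}},
                {"terms": {"metadata.file_name": file_names_filter}},
                {"bool": {"must_not": {"term": {"chunk_type": "source"}}}},
            ],
        }
    }
}
```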
@@ -451,8 +508,19 @@ class ElasticSearchVector(BaseVector):
source = res["_source"]
page_content = source.get(Field.CONTENT_KEY.value)
metadata = source.get(Field.METADATA_KEY.value, {})
chunk_type = source.get(Field.CHUNK_TYPE.value)
score = res["_score"]
score = score / 2 # Normalized [0-1]

# QA chunk: return the concatenated Q+A as context
if chunk_type == "qa":
question = source.get(Field.QUESTION.value, "")
answer = source.get(Field.ANSWER.value, "")
page_content = f"Q: {question}\nA: {answer}"
metadata["chunk_type"] = "qa"
metadata["question"] = question
metadata["answer"] = answer

docs_and_scores.append((DocumentChunk(page_content=page_content, metadata=metadata), score))

docs = []
@@ -491,11 +559,10 @@ class ElasticSearchVector(BaseVector):
}
}
},
"filter": { # Add the filter condition of status=1
"term": {
"metadata.status": 1
}
}
"filter": [
{"term": {"metadata.status": 1}},
{"bool": {"must_not": {"term": {Field.CHUNK_TYPE.value: "source"}}}}
]
}
}

@@ -512,16 +579,9 @@ class ElasticSearchVector(BaseVector):
}
},
"filter": [
{
"term": {
"metadata.status": 1
}
},
{
"terms": {
"metadata.file_name": file_names_filter # Additional file_name filtering
}
}
{"term": {"metadata.status": 1}},
{"terms": {"metadata.file_name": file_names_filter}},
{"bool": {"must_not": {"term": {Field.CHUNK_TYPE.value: "source"}}}}
],
}
}
@@ -543,6 +603,17 @@ class ElasticSearchVector(BaseVector):
source = res["_source"]
page_content = source.get(Field.CONTENT_KEY.value)
metadata = source.get(Field.METADATA_KEY.value, {})
chunk_type = source.get(Field.CHUNK_TYPE.value)

# QA chunk: return the concatenated Q+A as context
if chunk_type == "qa":
question = source.get(Field.QUESTION.value, "")
answer = source.get(Field.ANSWER.value, "")
page_content = f"Q: {question}\nA: {answer}"
metadata["chunk_type"] = "qa"
metadata["question"] = question
metadata["answer"] = answer

# Normalize the score to the [0,1] interval
normalized_score = res["_score"] / max_score
docs_and_scores.append((DocumentChunk(page_content=page_content, metadata=metadata), normalized_score))
@@ -652,7 +723,7 @@ class ElasticSearchVector(BaseVector):
},
Field.VECTOR.value: {
"type": "dense_vector",
"dims": len(embeddings[0]), # Make sure the dimension is correct here. This is the vector dimension: when index is true it cannot exceed 1024; when index is false or unspecified it cannot exceed 2048. Setting it correctly improves retrieval efficiency
"dims": len(next((e for e in embeddings if e is not None), [0]*768)), # Skip None entries to determine the vector dimension; fall back to 768
"index": True,
"similarity": "cosine"
}
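The new dims expression reads the dimension from the first non-None embedding and falls back to 768 when none is available; a minimal illustration of that behaviour:

```python
# First non-None embedding determines the dimension...
embeddings = [None, [0.0] * 1024, [0.0] * 1024]
dims = len(next((e for e in embeddings if e is not None), [0] * 768))
assert dims == 1024

# ...and the fallback kicks in when every entry is None.
embeddings = [None, None]
dims = len(next((e for e in embeddings if e is not None), [0] * 768))
assert dims == 768
```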

@@ -14,3 +14,8 @@ class Field(StrEnum):
DOCUMENT_ID = "metadata.document_id"
KNOWLEDGE_ID = "metadata.knowledge_id"
SORT_ID = "metadata.sort_id"
# QA fields
CHUNK_TYPE = "chunk_type" # "chunk" | "source" | "qa"
QUESTION = "question"
ANSWER = "answer"
SOURCE_CHUNK_ID = "source_chunk_id"

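A sketch of how these new enum members might be used when assembling the _source of a QA chunk; the stand-in Field class below only mirrors the QA-related members, and the CONTENT_KEY mapping plus all values are assumptions for illustration:

```python
from enum import StrEnum

class Field(StrEnum):
    # Minimal stand-in for the repo's Field enum, QA-related members only.
    CONTENT_KEY = "page_content"      # assumed mapping
    CHUNK_TYPE = "chunk_type"
    QUESTION = "question"
    ANSWER = "answer"
    SOURCE_CHUNK_ID = "source_chunk_id"

# Building the _source of a QA chunk with these keys (values are placeholders).
qa_source = {
    Field.CONTENT_KEY.value: "What is the refund window?",
    Field.CHUNK_TYPE.value: "qa",
    Field.QUESTION.value: "What is the refund window?",
    Field.ANSWER.value: "Refunds are accepted within 30 days of purchase.",
    Field.SOURCE_CHUNK_ID.value: "a1b2c3d4e5f6",
}
print(qa_source)
```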
@@ -27,14 +27,14 @@ class BaseVector(ABC):
raise NotImplementedError

@abstractmethod
def delete_by_ids(self, ids: list[str]):
def delete_by_ids(self, ids: list[str], *, refresh: bool = False):
raise NotImplementedError

def get_ids_by_metadata_field(self, key: str, value: str):
raise NotImplementedError

@abstractmethod
def delete_by_metadata_field(self, key: str, value: str):
def delete_by_metadata_field(self, key: str, value: str, *, refresh: bool = False):
raise NotImplementedError

@abstractmethod

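Because refresh is declared keyword-only, implementations and callers must pass it by name; a minimal sketch of a conforming subclass (the in-memory backend is purely illustrative):

```python
from abc import ABC, abstractmethod

class BaseVector(ABC):
    @abstractmethod
    def delete_by_ids(self, ids: list[str], *, refresh: bool = False):
        raise NotImplementedError

class InMemoryVector(BaseVector):
    """Toy backend used only to illustrate the keyword-only refresh flag."""
    def __init__(self):
        self._docs: dict[str, str] = {}

    def delete_by_ids(self, ids: list[str], *, refresh: bool = False):
        for doc_id in ids:
            self._docs.pop(doc_id, None)
        if refresh:
            pass  # a real backend would force an index refresh here

store = InMemoryVector()
store.delete_by_ids(["a", "b"], refresh=True)   # OK
# store.delete_by_ids(["a", "b"], True)         # TypeError: refresh is keyword-only
```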
@@ -15,4 +15,5 @@ class File(Base):
file_ext = Column(String, index=True, nullable=False, comment="file extension:folder|pdf")
file_size = Column(Integer, default=0, comment="file size(byte)")
file_url = Column(String, index=True, nullable=True, comment="file comes from a website url")
file_key = Column(String(512), nullable=True, index=True, comment="storage file key for FileStorageService")
created_at = Column(DateTime, default=datetime.datetime.now)
@@ -20,13 +20,26 @@ class ChunkCreate(BaseModel):

@property
def chunk_content(self) -> str:
"""
Get the actual content string regardless of input type
"""
"""Get the actual content string regardless of input type"""
if isinstance(self.content, QAChunk):
return f"question: {self.content.question} answer: {self.content.answer}"
return self.content.question # In QA mode, page_content stores the question
return self.content

@property
def is_qa(self) -> bool:
return isinstance(self.content, QAChunk)

@property
def qa_metadata(self) -> dict:
"""Return the QA-related metadata fields"""
if isinstance(self.content, QAChunk):
return {
"chunk_type": "qa",
"question": self.content.question,
"answer": self.content.answer,
}
return {}


class ChunkUpdate(BaseModel):
content: Union[str, QAChunk] = Field(
@@ -35,13 +48,26 @@ class ChunkUpdate(BaseModel):

@property
def chunk_content(self) -> str:
"""
Get the actual content string regardless of input type
"""
"""Get the actual content string regardless of input type"""
if isinstance(self.content, QAChunk):
return f"question: {self.content.question} answer: {self.content.answer}"
return self.content.question # In QA mode, page_content stores the question
return self.content

@property
def is_qa(self) -> bool:
return isinstance(self.content, QAChunk)

@property
def qa_metadata(self) -> dict:
"""Return the QA-related metadata fields"""
if isinstance(self.content, QAChunk):
return {
"chunk_type": "qa",
"question": self.content.question,
"answer": self.content.answer,
}
return {}


class ChunkRetrieve(BaseModel):
query: str
@@ -51,3 +77,8 @@ class ChunkRetrieve(BaseModel):
vector_similarity_weight: float | None = Field(None)
top_k: int | None = Field(None)
retrieve_type: RetrieveType | None = Field(None)


class ChunkBatchCreate(BaseModel):
"""Batch-create chunks"""
items: list[ChunkCreate] = Field(..., min_length=1, description="chunk 列表")

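A self-contained sketch of the str/QAChunk union and the chunk_content property, with a minimal stand-in for QAChunk (its real definition lives elsewhere in this repo; only question/answer are assumed here):

```python
from typing import Union
from pydantic import BaseModel, Field

class QAChunk(BaseModel):
    """Stand-in for the repo's QAChunk model."""
    question: str
    answer: str

class ChunkCreate(BaseModel):
    content: Union[str, QAChunk] = Field(..., description="plain text or a QA pair")

    @property
    def chunk_content(self) -> str:
        # In QA mode the indexed page_content is the question only.
        if isinstance(self.content, QAChunk):
            return self.content.question
        return self.content

plain = ChunkCreate(content="Refunds are accepted within 30 days.")
qa = ChunkCreate(content=QAChunk(question="What is the refund window?",
                                 answer="30 days from purchase."))
print(plain.chunk_content)  # Refunds are accepted within 30 days.
print(qa.chunk_content)     # What is the refund window?
```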
@@ -11,6 +11,7 @@ class FileBase(BaseModel):
file_ext: str
file_size: int
file_url: str | None = None
file_key: str | None = None
created_at: datetime.datetime | None = None



@@ -34,26 +34,7 @@ def generate_file_key(
Generate a unique file key for storage.

The file key follows the format: {tenant_id}/{workspace_id}/{file_id}{file_ext}

Args:
tenant_id: The tenant UUID.
workspace_id: The workspace UUID.
file_id: The file UUID.
file_ext: The file extension (e.g., '.pdf', '.txt').

Returns:
A unique file key string.

Example:
>>> generate_file_key(
... uuid.UUID('550e8400-e29b-41d4-a716-446655440000'),
... uuid.UUID('660e8400-e29b-41d4-a716-446655440001'),
... uuid.UUID('770e8400-e29b-41d4-a716-446655440002'),
... '.pdf'
... )
'550e8400-e29b-41d4-a716-446655440000/660e8400-e29b-41d4-a716-446655440001/770e8400-e29b-41d4-a716-446655440002.pdf'
"""
# Ensure file_ext starts with a dot
if file_ext and not file_ext.startswith('.'):
file_ext = f'.{file_ext}'
if workspace_id:
@@ -61,6 +42,21 @@ def generate_file_key(
return f"{tenant_id}/{file_id}{file_ext}"


def generate_kb_file_key(
kb_id: uuid.UUID,
file_id: uuid.UUID,
file_ext: str,
) -> str:
"""
Generate a file key for knowledge base files.

Format: kb/{kb_id}/{file_id}{file_ext}
"""
if file_ext and not file_ext.startswith('.'):
file_ext = f'.{file_ext}'
return f"kb/{kb_id}/{file_id}{file_ext}"
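Usage follows directly from the format string; a quick sketch (the UUIDs are placeholders):

```python
import uuid

# Assumes generate_kb_file_key, as defined above, is importable from the storage service module.
kb_id = uuid.UUID("11111111-2222-3333-4444-555555555555")
file_id = uuid.UUID("66666666-7777-8888-9999-aaaaaaaaaaaa")

key = generate_kb_file_key(kb_id, file_id, "docx")  # a missing leading dot is added automatically
print(key)
# kb/11111111-2222-3333-4444-555555555555/66666666-7777-8888-9999-aaaaaaaaaaaa.docx
```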


class FileStorageService:
"""
High-level service for file storage operations.

api/app/tasks.py
@@ -30,7 +30,7 @@ from app.core.rag.llm.cv_model import QWenCV
from app.core.rag.llm.embedding_model import OpenAIEmbed
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
from app.core.rag.models.chunk import DocumentChunk
from app.core.rag.prompts.generator import question_proposal
from app.core.rag.prompts.generator import question_proposal, qa_proposal
from app.core.rag.vdb.elasticsearch.elasticsearch_vector import (
ElasticSearchVectorFactory,
)
@@ -210,9 +210,14 @@ def _build_vision_model(file_path: str, db_knowledge):


@celery_app.task(name="app.core.rag.tasks.parse_document")
def parse_document(file_path: str, document_id: uuid.UUID):
def parse_document(file_key: str, document_id: uuid.UUID, file_name: str = ""):
"""
Document parsing, vectorization, and storage
Document parsing, vectorization, and storage.

Args:
file_key: Storage key for FileStorageService (e.g. "kb/{kb_id}/{file_id}.docx")
document_id: Document UUID
file_name: Original file name (used for extension detection in chunk())
"""

db_document = None
@@ -223,7 +228,6 @@ def parse_document(file_path: str, document_id: uuid.UUID):

with get_db_context() as db:
try:
# Celery's JSON serialization turns UUIDs into strings, so make sure the type is correct
if not isinstance(document_id, uuid.UUID):
document_id = uuid.UUID(str(document_id))

@@ -234,7 +238,11 @@ def parse_document(file_path: str, document_id: uuid.UUID):
if db_knowledge is None:
raise ValueError(f"Knowledge {db_document.kb_id} not found")

# 1. Document parsing & segmentation
# Use file_name from argument or fall back to document record
if not file_name:
file_name = db_document.file_name

# 1. Download file from storage backend
progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} Start to parse.")
start_time = time.time()
db_document.progress = 0.0
@@ -245,45 +253,36 @@ def parse_document(file_path: str, document_id: uuid.UUID):
db.commit()
db.refresh(db_document)

# Read file content from storage backend (no NFS dependency)
from app.services.file_storage_service import FileStorageService
import asyncio
storage_service = FileStorageService()

async def _download():
return await storage_service.download_file(file_key)

try:
file_binary = asyncio.run(_download())
except RuntimeError:
# If there's already a running loop (e.g. in some worker configurations)
loop = asyncio.new_event_loop()
try:
file_binary = loop.run_until_complete(_download())
finally:
loop.close()
if not file_binary:
raise IOError(f"Downloaded empty file from storage: {file_key}")
logger.info(f"[ParseDoc] Downloaded {len(file_binary)} bytes from storage key: {file_key}")

def progress_callback(prog=None, msg=None):
progress_lines.append(f"{datetime.now().strftime('%H:%M:%S')} parse progress: {prog} msg: {msg}.")

# Prepare vision_model for parsing
vision_model = _build_vision_model(file_path, db_knowledge)

# Read the file into memory first so parsing does not depend on the NFS file staying accessible
# Libraries such as python-docx open the file directly by path when binary=None,
# which can fail with "Package not found" on NFS/shared storage due to cache invalidation
max_wait_seconds = 30
wait_interval = 2
waited = 0
file_binary = None
while waited <= max_wait_seconds:
# os.listdir forces the NFS client to refresh its directory cache
parent_dir = os.path.dirname(file_path)
try:
os.listdir(parent_dir)
except OSError:
pass
try:
with open(file_path, "rb") as f:
file_binary = f.read()
if not file_binary:
# The file exists on NFS but is empty (it may still be syncing)
raise IOError(f"File is empty (0 bytes), NFS may still be syncing: {file_path}")
break
except (FileNotFoundError, IOError) as e:
if waited >= max_wait_seconds:
raise type(e)(
f"File not accessible at '{file_path}' after waiting {max_wait_seconds}s: {e}"
)
logger.warning(f"File not ready on this node, retrying in {wait_interval}s: {file_path} ({e})")
time.sleep(wait_interval)
waited += wait_interval
vision_model = _build_vision_model(file_name, db_knowledge)

from app.core.rag.app.naive import chunk
logger.info(f"[ParseDoc] file_binary size={len(file_binary)} bytes, type={type(file_binary).__name__}, bool={bool(file_binary)}")
res = chunk(filename=file_path,
res = chunk(filename=file_name,
binary=file_binary,
from_page=0,
to_page=DEFAULT_PARSE_TO_PAGE,
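The download step runs an async storage call from synchronous Celery code; the asyncio.run / fallback-loop pattern used above can be captured in a small helper like this sketch (run_async and fake_download are illustrative, not part of this branch):

```python
import asyncio

def run_async(coro):
    """Run a coroutine from synchronous code, tolerating workers that already installed a loop."""
    try:
        return asyncio.run(coro)
    except RuntimeError:
        # Mirrors the fallback in this branch: use a dedicated loop, then close it.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

async def fake_download(key: str) -> bytes:
    await asyncio.sleep(0)  # stand-in for FileStorageService.download_file(key)
    return b"file bytes for " + key.encode()

file_binary = run_async(fake_download("kb/abc/def.docx"))
print(len(file_binary))
```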
@@ -312,6 +311,7 @@ def parse_document(file_path: str, document_id: uuid.UUID):
vector_service.delete_by_metadata_field(key="document_id", value=str(document_id))
# 2.2 Vectorize and import batch documents
auto_questions_topn = db_document.parser_config.get("auto_questions", 0)
qa_prompt = db_document.parser_config.get("qa_prompt", None)
chat_model = None
if auto_questions_topn:
chat_model = Base(
@@ -319,62 +319,123 @@ def parse_document(file_path: str, document_id: uuid.UUID):
model_name=db_knowledge.llm.api_keys[0].model_name,
base_url=db_knowledge.llm.api_keys[0].api_base,
)
logger.info(f"[QA] LLM model: {db_knowledge.llm.api_keys[0].model_name}, base_url: {db_knowledge.llm.api_keys[0].api_base}")
if qa_prompt:
logger.info(f"[QA] Using custom prompt ({len(qa_prompt)} chars)")

# Pre-build the chunks for every batch so that sort_id stays globally ordered
all_batch_chunks: list[list[DocumentChunk]] = []

if auto_questions_topn:
# auto_questions enabled: generate questions for all chunks concurrently, then group by batch
# Build the (global_idx, item) list
# QA mode (FastGPT-style approach):
# 1. The original chunk is marked as source (kept for GraphRAG, excluded from retrieval)
# 2. The LLM generates QA pairs; each pair is stored as an independent qa chunk
indexed_items = list(enumerate(res))

def _generate_question(idx_item: tuple[int, dict]) -> tuple[int, str]:
"""Generate questions for a single chunk (with caching); returns (global_idx, question_text)"""
def _generate_qa(idx_item: tuple[int, dict]) -> tuple[int, list]:
"""Generate QA pairs for a single chunk (with caching); returns (global_idx, qa_pairs)"""
global_idx, item = idx_item
content = item["content_with_weight"]
cached = get_llm_cache(chat_model.model_name, content, "question",
{"topn": auto_questions_topn})
cache_params = {"topn": auto_questions_topn}
if qa_prompt:
import hashlib
cache_params["prompt_hash"] = hashlib.md5(qa_prompt.encode()).hexdigest()[:8]
cached = get_llm_cache(chat_model.model_name, content, "qa", cache_params)
if not cached:
cached = question_proposal(chat_model, content, auto_questions_topn)
set_llm_cache(chat_model.model_name, content, cached, "question",
{"topn": auto_questions_topn})
return global_idx, cached
logger.info(f"[QA] Cache miss for chunk {global_idx}, calling LLM. cache_params={cache_params}")
try:
pairs = qa_proposal(chat_model, content, auto_questions_topn, custom_prompt=qa_prompt)
except Exception as e:
logger.error(f"[QA] LLM call failed: model={chat_model.model_name}, base_url={getattr(chat_model, 'base_url', 'N/A')}, error={e}")
return global_idx, []
logger.info(f"[QA] Chunk {global_idx} generated {len(pairs)} QA pairs")
# The cache stores a JSON string
set_llm_cache(chat_model.model_name, content, json.dumps(pairs, ensure_ascii=False), "qa",
cache_params)
return global_idx, pairs
logger.info(f"[QA] Cache hit for chunk {global_idx}, cache_params={cache_params}, cached_type={type(cached).__name__}")
# Read from cache: may be a JSON string or legacy plain text
if isinstance(cached, str):
try:
parsed = json.loads(cached)
if isinstance(parsed, list):
logger.info(f"[QA] Chunk {global_idx} loaded {len(parsed)} QA pairs from cache")
return global_idx, parsed
except (json.JSONDecodeError, TypeError):
pass
# Legacy cache format (plain-text questions); try to parse it
from app.core.rag.prompts.generator import parse_qa_pairs
return global_idx, parse_qa_pairs(cached) if cached else []
return global_idx, cached if isinstance(cached, list) else []

# Call the LLM concurrently to generate questions
question_map: dict[int, str] = {}
# Call the LLM concurrently to generate QA pairs
qa_map: dict[int, list] = {}
with ThreadPoolExecutor(max_workers=AUTO_QUESTIONS_MAX_WORKERS) as q_executor:
futures = {q_executor.submit(_generate_question, item): item[0]
futures = {q_executor.submit(_generate_qa, item): item[0]
for item in indexed_items}
for future in futures:
global_idx, cached = future.result()
question_map[global_idx] = cached
global_idx, pairs = future.result()
qa_map[global_idx] = pairs

progress_lines.append(
f"{datetime.now().strftime('%H:%M:%S')} Auto questions generated for {total_chunks} chunks "
f"{datetime.now().strftime('%H:%M:%S')} QA pairs generated for {total_chunks} chunks "
f"(workers={AUTO_QUESTIONS_MAX_WORKERS}).")

# Assemble DocumentChunk objects grouped by batch
for batch_start in range(0, total_chunks, EMBEDDING_BATCH_SIZE):
batch_end = min(batch_start + EMBEDDING_BATCH_SIZE, total_chunks)
chunks = []
for global_idx in range(batch_start, batch_end):
item = res[global_idx]
metadata = {
# Assemble chunks: source chunks + qa chunks
source_chunks = []
qa_chunks = []
qa_sort_id = 0

for global_idx in range(total_chunks):
item = res[global_idx]
source_chunk_id = uuid.uuid4().hex

# source chunk: keep the original text for GraphRAG; excluded from vector retrieval
source_meta = {
"doc_id": source_chunk_id,
"file_id": str(db_document.file_id),
"file_name": db_document.file_name,
"file_created_at": int(db_document.created_at.timestamp() * 1000),
"document_id": str(db_document.id),
"knowledge_id": str(db_document.kb_id),
"sort_id": global_idx,
"status": 1,
"chunk_type": "source",
}
source_chunks.append(
DocumentChunk(page_content=item["content_with_weight"], metadata=source_meta))

# qa chunks: each QA pair is stored independently
pairs = qa_map.get(global_idx, [])
for pair in pairs:
qa_meta = {
"doc_id": uuid.uuid4().hex,
"file_id": str(db_document.file_id),
"file_name": db_document.file_name,
"file_created_at": int(db_document.created_at.timestamp() * 1000),
"document_id": str(db_document.id),
"knowledge_id": str(db_document.kb_id),
"sort_id": global_idx,
"sort_id": qa_sort_id,
"status": 1,
"chunk_type": "qa",
"question": pair["question"],
"answer": pair["answer"],
"source_chunk_id": source_chunk_id,
}
cached = question_map[global_idx]
chunks.append(
DocumentChunk(
page_content=f"question: {cached} answer: {item['content_with_weight']}",
metadata=metadata))
all_batch_chunks.append(chunks)
# page_content stores the question and is used for the vector index
qa_chunks.append(
DocumentChunk(page_content=pair["question"], metadata=qa_meta))
qa_sort_id += 1

# Group into batches (source + qa together)
all_chunks = source_chunks + qa_chunks
for batch_start in range(0, len(all_chunks), EMBEDDING_BATCH_SIZE):
batch_end = min(batch_start + EMBEDDING_BATCH_SIZE, len(all_chunks))
all_batch_chunks.append(all_chunks[batch_start:batch_end])

progress_lines.append(
f"{datetime.now().strftime('%H:%M:%S')} QA mode: {len(source_chunks)} source chunks + "
f"{len(qa_chunks)} QA chunks prepared.")
else:
# No auto_questions: build chunks directly
for batch_start in range(0, total_chunks, EMBEDDING_BATCH_SIZE):
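The concurrent generation step above is a standard ThreadPoolExecutor fan-out/fan-in; a minimal self-contained sketch of that pattern, where generate_qa is a stand-in for the per-chunk LLM call:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def generate_qa(idx_item):
    """Stand-in for the per-chunk LLM call; returns (index, list of QA dicts)."""
    idx, text = idx_item
    return idx, [{"question": f"What does chunk {idx} say?", "answer": text}]

indexed_items = list(enumerate(["chunk one text", "chunk two text"]))
qa_map = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(generate_qa, item): item[0] for item in indexed_items}
    for future in as_completed(futures):
        idx, pairs = future.result()
        qa_map[idx] = pairs

print(qa_map[0][0]["question"])  # What does chunk 0 say?
```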
@@ -636,6 +697,136 @@ def build_graphrag_for_document(document_id: str, knowledge_id: str):
return f"build_graphrag_for_document '{document_id}' failed: {e}"


@celery_app.task(name="app.core.rag.tasks.import_qa_chunks", queue="qa_import")
def import_qa_chunks(kb_id: str, document_id: str, filename: str, contents: bytes):
"""
Asynchronously import QA pairs (CSV/Excel).

File format: the first row is a header (skipped); the first column is the question, the second column the answer.
"""
import csv as csv_module
import io

db = None
try:
from app.db import get_db_context
with get_db_context() as db:
db_document = db.query(Document).filter(Document.id == uuid.UUID(document_id)).first()
db_knowledge = db.query(Knowledge).filter(Knowledge.id == uuid.UUID(kb_id)).first()
if not db_document or not db_knowledge:
logger.error(f"[ImportQA] document={document_id} or knowledge={kb_id} not found")
return {"error": "document or knowledge not found", "imported": 0}

# 1. Parse the file
qa_pairs = []
failed_rows = []

if filename.endswith(".csv"):
try:
text = contents.decode("utf-8-sig")
except UnicodeDecodeError:
text = contents.decode("gbk", errors="ignore")

sniffer = csv_module.Sniffer()
try:
dialect = sniffer.sniff(text[:2048])
delimiter = dialect.delimiter
except csv_module.Error:
delimiter = "," if "," in text[:500] else "\t"

reader = csv_module.reader(io.StringIO(text), delimiter=delimiter)
for i, row in enumerate(reader):
if i == 0:
continue
if len(row) >= 2 and row[0].strip() and row[1].strip():
qa_pairs.append({"question": row[0].strip(), "answer": row[1].strip()})
elif len(row) >= 1 and row[0].strip():
failed_rows.append(i + 1)

elif filename.endswith(".xlsx") or filename.endswith(".xls"):
try:
import openpyxl
wb = openpyxl.load_workbook(io.BytesIO(contents), read_only=True)
for sheet in wb.worksheets:
for i, row in enumerate(sheet.iter_rows(values_only=True)):
if i == 0:
continue
if len(row) >= 2 and row[0] and row[1]:
q = str(row[0]).strip()
a = str(row[1]).strip()
if q and a:
qa_pairs.append({"question": q, "answer": a})
elif len(row) >= 1 and row[0]:
failed_rows.append(i + 1)
wb.close()
except Exception as e:
logger.error(f"[ImportQA] Excel parse failed: {e}")
return {"error": f"Excel parse failed: {e}", "imported": 0}

if not qa_pairs:
logger.warning(f"[ImportQA] No valid QA pairs found in {filename}")
return {"error": "No valid QA pairs found", "imported": 0}

logger.info(f"[ImportQA] Parsed {len(qa_pairs)} QA pairs from {filename}, failed_rows={failed_rows}")

# 2. Write to Elasticsearch
vector_service = ElasticSearchVectorFactory().init_vector(knowledge=db_knowledge)

sort_id = 0
total, items = vector_service.search_by_segment(document_id=document_id, pagesize=1, page=1, asc=False)
if items:
sort_id = items[0].metadata["sort_id"]

chunks = []
for pair in qa_pairs:
sort_id += 1
doc_id = uuid.uuid4().hex
metadata = {
"doc_id": doc_id,
"file_id": str(db_document.file_id),
"file_name": db_document.file_name,
"file_created_at": int(db_document.created_at.timestamp() * 1000),
"document_id": document_id,
"knowledge_id": kb_id,
"sort_id": sort_id,
"status": 1,
"chunk_type": "qa",
"question": pair["question"],
"answer": pair["answer"],
}
chunks.append(DocumentChunk(page_content=pair["question"], metadata=metadata))

batch_size = 50
for i in range(0, len(chunks), batch_size):
batch = chunks[i:i + batch_size]
vector_service.add_chunks(batch)

# 3. Update chunk_num and progress
db_document.chunk_num += len(chunks)
db_document.progress = 1.0
db_document.progress_msg = f"QA 导入完成: {len(chunks)} 条"
db.commit()

result = {"imported": len(chunks), "failed_rows": failed_rows}
logger.info(f"[ImportQA] Done: imported={len(chunks)}, failed={len(failed_rows)}")
return result

except Exception as e:
logger.error(f"[ImportQA] Failed: {e}", exc_info=True)
# Try to mark the document as failed
try:
from app.db import get_db_context
with get_db_context() as err_db:
doc = err_db.query(Document).filter(Document.id == uuid.UUID(document_id)).first()
if doc:
doc.progress = -1.0
doc.progress_msg = f"QA 导入失败: {str(e)[:200]}"
err_db.commit()
except Exception:
pass
return {"error": str(e), "imported": 0}


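The CSV branch above relies on csv.Sniffer with a comma/tab fallback; a standalone sketch of that parsing step on in-memory data (the sample rows are placeholders):

```python
import csv
import io

raw = b"\xef\xbb\xbfquestion,answer\nWhat is RAG?,Retrieval-augmented generation.\n"
text = raw.decode("utf-8-sig")  # strips the BOM if present

try:
    delimiter = csv.Sniffer().sniff(text[:2048]).delimiter
except csv.Error:
    delimiter = "," if "," in text[:500] else "\t"

qa_pairs = []
for i, row in enumerate(csv.reader(io.StringIO(text), delimiter=delimiter)):
    if i == 0:  # header row
        continue
    if len(row) >= 2 and row[0].strip() and row[1].strip():
        qa_pairs.append({"question": row[0].strip(), "answer": row[1].strip()})

print(qa_pairs)
# [{'question': 'What is RAG?', 'answer': 'Retrieval-augmented generation.'}]
```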
@celery_app.task(name="app.core.rag.tasks.sync_knowledge_for_kb")
def sync_knowledge_for_kb(kb_id: uuid.UUID):
"""

Block a user