feat(multimodal): support document image extraction and inline vision processing

Added document image extraction capability for PDF and DOCX files, including page/index metadata and storage integration. Extended `process_files` with `document_image_recognition` flag to conditionally enable vision-based image processing when model supports it. Updated knowledge repository and workflow node logic to enforce status=1 checks. Added PyMuPDF dependency.
This commit is contained in:
Timebomb2018
2026-04-24 11:18:50 +08:00
parent 9fdb952396
commit 767eb5e6f2
13 changed files with 397 additions and 52 deletions

View File

@@ -1,12 +1,15 @@
import logging
import uuid
from typing import Any
from app.core.config import settings
from app.core.workflow.engine.state_manager import WorkflowState
from app.core.workflow.engine.variable_pool import VariablePool
from app.core.workflow.nodes.base_node import BaseNode
from app.core.workflow.nodes.document_extractor.config import DocExtractorNodeConfig
from app.core.workflow.variable.base_variable import VariableType, FileObject
from app.db import get_db_read
from app.models.file_metadata_model import FileMetadata
from app.schemas.app_schema import FileInput, FileType, TransferMethod
logger = logging.getLogger(__name__)
@@ -15,7 +18,6 @@ logger = logging.getLogger(__name__)
def _file_object_to_file_input(f: FileObject) -> FileInput:
"""Convert workflow FileObject to multimodal FileInput."""
file_type = f.origin_file_type or ""
# Prefer mime_type for more accurate type detection
if not file_type and f.mime_type:
file_type = f.mime_type
resolved_type = FileType.trans(f.type) if isinstance(f.type, str) else f.type
@@ -51,21 +53,68 @@ def _normalise_files(val: Any) -> list[FileObject]:
return []
async def _save_image_to_storage(
img_bytes: bytes,
ext: str,
tenant_id: uuid.UUID,
workspace_id: uuid.UUID,
) -> tuple[uuid.UUID, str]:
"""
将图片字节保存到存储后端,写入 FileMetadata返回 (file_id, url)。
"""
from app.services.file_storage_service import FileStorageService, generate_file_key
file_id = uuid.uuid4()
file_ext = f".{ext}" if not ext.startswith(".") else ext
content_type = f"image/{ext}"
file_key = generate_file_key(
tenant_id=tenant_id,
workspace_id=workspace_id,
file_id=file_id,
file_ext=file_ext,
)
storage_svc = FileStorageService()
await storage_svc.storage.upload(file_key, img_bytes, content_type)
with get_db_read() as db:
meta = FileMetadata(
id=file_id,
tenant_id=tenant_id,
workspace_id=workspace_id,
file_key=file_key,
file_name=f"doc_image_{file_id}{file_ext}",
file_ext=file_ext,
file_size=len(img_bytes),
content_type=content_type,
status="completed",
)
db.add(meta)
db.commit()
url = f"{settings.FILE_LOCAL_SERVER_URL}/storage/permanent/{file_id}"
return file_id, url
class DocExtractorNode(BaseNode):
"""Document Extractor Node.
Reads one or more file variables and extracts their text content
by delegating to MultimodalService._extract_document_text.
and embedded images.
Outputs:
text (string) full concatenated text of all input files
chunks (array[string]) per-file extracted text
text (string) full text with image placeholders like [图片 第N页 第M张]
chunks (array[string]) per-file extracted text (with placeholders)
images (array[file]) extracted images as FileObject list, each with
name encoding position: "p{page}_i{index}"
"""
def _output_types(self) -> dict[str, VariableType]:
return {
"text": VariableType.STRING,
"chunks": VariableType.ARRAY_STRING,
"images": VariableType.ARRAY_FILE,
}
def _extract_output(self, business_result: Any) -> Any:
@@ -80,13 +129,18 @@ class DocExtractorNode(BaseNode):
raw_val = self.get_variable(config.file_selector, variable_pool, strict=False)
if raw_val is None:
logger.warning(f"Node {self.node_id}: file variable '{config.file_selector}' is empty")
return {"text": "", "chunks": []}
return {"text": "", "chunks": [], "images": []}
files = _normalise_files(raw_val)
if not files:
return {"text": "", "chunks": []}
return {"text": "", "chunks": [], "images": []}
tenant_id = uuid.UUID(self.get_variable("sys.tenant_id", variable_pool, strict=False) or str(uuid.uuid4()))
workspace_id = uuid.UUID(self.get_variable("sys.workspace_id", variable_pool))
chunks: list[str] = []
image_file_objects: list[dict] = []
with get_db_read() as db:
from app.services.multimodal_service import MultimodalService
svc = MultimodalService(db)
@@ -94,13 +148,44 @@ class DocExtractorNode(BaseNode):
label = f.name or f.url or f.file_id
try:
file_input = _file_object_to_file_input(f)
# Ensure URL is populated for local files
if not file_input.url:
file_input.url = await svc.get_file_url(file_input)
# Reuse cached bytes if already fetched
if f.get_content():
file_input.set_content(f.get_content())
text = await svc.extract_document_text(file_input)
# 从工作流 features 读取 document_image_recognition 开关
fu_config = self.workflow_config.get("features", {}).get("file_upload", {})
image_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
if image_recognition:
img_infos = await svc.extract_document_images(file_input)
for img_info in img_infos:
page = img_info["page"]
index = img_info["index"]
ext = img_info.get("ext", "png")
placeholder = f"[图片 第{page}页 第{index + 1}张]" if page > 0 else f"[图片 第{index + 1}张]"
try:
file_id, url = await _save_image_to_storage(
img_bytes=img_info["bytes"],
ext=ext,
tenant_id=tenant_id,
workspace_id=workspace_id,
)
image_file_objects.append(FileObject(
type=FileType.IMAGE,
url=url,
transfer_method=TransferMethod.REMOTE_URL,
origin_file_type=f"image/{ext}",
file_id=str(file_id),
name=f"p{page}_i{index}",
mime_type=f"image/{ext}",
is_file=True,
).model_dump())
text = text + f"\n{placeholder}: {url}"
except Exception as e:
logger.error(f"Node {self.node_id}: failed to save image {placeholder}: {e}")
chunks.append(text)
except Exception as e:
logger.error(
@@ -110,5 +195,8 @@ class DocExtractorNode(BaseNode):
chunks.append("")
full_text = "\n\n".join(c for c in chunks if c)
logger.info(f"Node {self.node_id}: extracted {len(files)} file(s), total chars={len(full_text)}")
return {"text": full_text, "chunks": chunks}
logger.info(
f"Node {self.node_id}: extracted {len(files)} file(s), "
f"total chars={len(full_text)}, images={len(image_file_objects)}"
)
return {"text": full_text, "chunks": chunks, "images": image_file_objects}

View File

@@ -333,7 +333,7 @@ class KnowledgeRetrievalNode(BaseNode):
tasks = []
for kb_config in knowledge_bases:
db_knowledge = knowledge_repository.get_knowledge_by_id(db=db, knowledge_id=kb_config.kb_id)
if not db_knowledge:
if not (db_knowledge and db_knowledge.chunk_num > 0 and db_knowledge.status == 1):
raise RuntimeError("The knowledge base does not exist or access is denied.")
tasks.append(self.knowledge_retrieval(db, query, db_knowledge, kb_config))
if tasks: