feat(multimodal): support document image extraction and inline vision processing
Added document image extraction for PDF and DOCX files, including page/index metadata and storage integration. Extended `process_files` with a `document_image_recognition` flag that enables vision-based image processing only when the model supports it. Updated the knowledge repository and workflow node logic to enforce status=1 checks. Added the PyMuPDF dependency.
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.workflow.engine.state_manager import WorkflowState
|
||||
from app.core.workflow.engine.variable_pool import VariablePool
|
||||
from app.core.workflow.nodes.base_node import BaseNode
|
||||
from app.core.workflow.nodes.document_extractor.config import DocExtractorNodeConfig
|
||||
from app.core.workflow.variable.base_variable import VariableType, FileObject
|
||||
from app.db import get_db_read
|
||||
from app.models.file_metadata_model import FileMetadata
|
||||
from app.schemas.app_schema import FileInput, FileType, TransferMethod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -15,7 +18,6 @@ logger = logging.getLogger(__name__)
|
||||
def _file_object_to_file_input(f: FileObject) -> FileInput:
|
||||
"""Convert workflow FileObject to multimodal FileInput."""
|
||||
file_type = f.origin_file_type or ""
|
||||
# Fall back to mime_type only when origin_file_type is not set
|
||||
if not file_type and f.mime_type:
|
||||
file_type = f.mime_type
|
||||
resolved_type = FileType.trans(f.type) if isinstance(f.type, str) else f.type
|
||||
@@ -51,21 +53,68 @@ def _normalise_files(val: Any) -> list[FileObject]:
|
||||
return []
|
||||
|
||||
|
||||
async def _save_image_to_storage(
    img_bytes: bytes,
    ext: str,
    tenant_id: uuid.UUID,
    workspace_id: uuid.UUID,
) -> tuple[uuid.UUID, str]:
    """Save image bytes to the storage backend and record a FileMetadata row.

    The bytes are uploaded first; the metadata row is written afterwards.
    Exceptions raised by the upload or by the database commit are not caught
    here and propagate to the caller.

    Args:
        img_bytes: Raw image content to store.
        ext: Image extension, with or without a leading dot (``"png"`` or
            ``".png"``). An empty value falls back to ``"png"``.
        tenant_id: Tenant that owns the stored file.
        workspace_id: Workspace that owns the stored file.

    Returns:
        ``(file_id, url)`` where ``file_id`` is the newly generated id and
        ``url`` is built from ``settings.FILE_LOCAL_SERVER_URL``.
    """
    from app.services.file_storage_service import FileStorageService, generate_file_key

    file_id = uuid.uuid4()

    # Normalise the extension once so that the file suffix and the MIME type
    # are derived from the same value; previously ".png" produced the invalid
    # content type "image/.png" and "" produced "." / "image/".
    bare_ext = ext.lstrip(".") or "png"
    file_ext = f".{bare_ext}"
    content_type = f"image/{bare_ext}"

    file_key = generate_file_key(
        tenant_id=tenant_id,
        workspace_id=workspace_id,
        file_id=file_id,
        file_ext=file_ext,
    )

    storage_svc = FileStorageService()
    await storage_svc.storage.upload(file_key, img_bytes, content_type)

    # NOTE(review): the session comes from get_db_read() yet is used to insert
    # and commit -- confirm this helper yields a writable session.
    with get_db_read() as db:
        meta = FileMetadata(
            id=file_id,
            tenant_id=tenant_id,
            workspace_id=workspace_id,
            file_key=file_key,
            file_name=f"doc_image_{file_id}{file_ext}",
            file_ext=file_ext,
            file_size=len(img_bytes),
            content_type=content_type,
            status="completed",
        )
        db.add(meta)
        db.commit()

    url = f"{settings.FILE_LOCAL_SERVER_URL}/storage/permanent/{file_id}"
    return file_id, url
|
||||
|
||||
|
||||
class DocExtractorNode(BaseNode):
|
||||
"""Document Extractor Node.
|
||||
|
||||
Reads one or more file variables and extracts their text content
|
||||
by delegating to MultimodalService._extract_document_text.
|
||||
and embedded images.
|
||||
|
||||
Outputs:
|
||||
text (string) – full concatenated text of all input files
|
||||
chunks (array[string]) – per-file extracted text
|
||||
text (string) – full text with image placeholders like [图片 第N页 第M张]
|
||||
chunks (array[string]) – per-file extracted text (with placeholders)
|
||||
images (array[file]) – extracted images as FileObject list, each with
|
||||
name encoding position: "p{page}_i{index}"
|
||||
"""
|
||||
|
||||
def _output_types(self) -> dict[str, VariableType]:
    """Return the variable type declared for each output key of this node."""
    declared: dict[str, VariableType] = {}
    declared["text"] = VariableType.STRING
    declared["chunks"] = VariableType.ARRAY_STRING
    declared["images"] = VariableType.ARRAY_FILE
    return declared
|
||||
|
||||
def _extract_output(self, business_result: Any) -> Any:
|
||||
@@ -80,13 +129,18 @@ class DocExtractorNode(BaseNode):
|
||||
raw_val = self.get_variable(config.file_selector, variable_pool, strict=False)
|
||||
if raw_val is None:
|
||||
logger.warning(f"Node {self.node_id}: file variable '{config.file_selector}' is empty")
|
||||
return {"text": "", "chunks": []}
|
||||
return {"text": "", "chunks": [], "images": []}
|
||||
|
||||
files = _normalise_files(raw_val)
|
||||
if not files:
|
||||
return {"text": "", "chunks": []}
|
||||
return {"text": "", "chunks": [], "images": []}
|
||||
|
||||
tenant_id = uuid.UUID(self.get_variable("sys.tenant_id", variable_pool, strict=False) or str(uuid.uuid4()))
|
||||
workspace_id = uuid.UUID(self.get_variable("sys.workspace_id", variable_pool))
|
||||
|
||||
chunks: list[str] = []
|
||||
image_file_objects: list[dict] = []
|
||||
|
||||
with get_db_read() as db:
|
||||
from app.services.multimodal_service import MultimodalService
|
||||
svc = MultimodalService(db)
|
||||
@@ -94,13 +148,44 @@ class DocExtractorNode(BaseNode):
|
||||
label = f.name or f.url or f.file_id
|
||||
try:
|
||||
file_input = _file_object_to_file_input(f)
|
||||
# Ensure URL is populated for local files
|
||||
if not file_input.url:
|
||||
file_input.url = await svc.get_file_url(file_input)
|
||||
# Reuse cached bytes if already fetched
|
||||
if f.get_content():
|
||||
file_input.set_content(f.get_content())
|
||||
|
||||
text = await svc.extract_document_text(file_input)
|
||||
|
||||
# 从工作流 features 读取 document_image_recognition 开关
|
||||
fu_config = self.workflow_config.get("features", {}).get("file_upload", {})
|
||||
image_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
||||
if image_recognition:
|
||||
img_infos = await svc.extract_document_images(file_input)
|
||||
for img_info in img_infos:
|
||||
page = img_info["page"]
|
||||
index = img_info["index"]
|
||||
ext = img_info.get("ext", "png")
|
||||
placeholder = f"[图片 第{page}页 第{index + 1}张]" if page > 0 else f"[图片 第{index + 1}张]"
|
||||
try:
|
||||
file_id, url = await _save_image_to_storage(
|
||||
img_bytes=img_info["bytes"],
|
||||
ext=ext,
|
||||
tenant_id=tenant_id,
|
||||
workspace_id=workspace_id,
|
||||
)
|
||||
image_file_objects.append(FileObject(
|
||||
type=FileType.IMAGE,
|
||||
url=url,
|
||||
transfer_method=TransferMethod.REMOTE_URL,
|
||||
origin_file_type=f"image/{ext}",
|
||||
file_id=str(file_id),
|
||||
name=f"p{page}_i{index}",
|
||||
mime_type=f"image/{ext}",
|
||||
is_file=True,
|
||||
).model_dump())
|
||||
text = text + f"\n{placeholder}: {url}"
|
||||
except Exception as e:
|
||||
logger.error(f"Node {self.node_id}: failed to save image {placeholder}: {e}")
|
||||
|
||||
chunks.append(text)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
@@ -110,5 +195,8 @@ class DocExtractorNode(BaseNode):
|
||||
chunks.append("")
|
||||
|
||||
full_text = "\n\n".join(c for c in chunks if c)
|
||||
logger.info(f"Node {self.node_id}: extracted {len(files)} file(s), total chars={len(full_text)}")
|
||||
return {"text": full_text, "chunks": chunks}
|
||||
logger.info(
|
||||
f"Node {self.node_id}: extracted {len(files)} file(s), "
|
||||
f"total chars={len(full_text)}, images={len(image_file_objects)}"
|
||||
)
|
||||
return {"text": full_text, "chunks": chunks, "images": image_file_objects}
|
||||
|
||||
@@ -333,7 +333,7 @@ class KnowledgeRetrievalNode(BaseNode):
|
||||
tasks = []
|
||||
for kb_config in knowledge_bases:
|
||||
db_knowledge = knowledge_repository.get_knowledge_by_id(db=db, knowledge_id=kb_config.kb_id)
|
||||
if not db_knowledge:
|
||||
if not (db_knowledge and db_knowledge.chunk_num > 0 and db_knowledge.status == 1):
|
||||
raise RuntimeError("The knowledge base does not exist or access is denied.")
|
||||
tasks.append(self.knowledge_retrieval(db, query, db_knowledge, kb_config))
|
||||
if tasks:
|
||||
|
||||
Reference in New Issue
Block a user