feat(multimodal): support document image extraction and inline vision processing

Added document image extraction capability for PDF and DOCX files, including page/index metadata and storage integration. Extended `process_files` with `document_image_recognition` flag to conditionally enable vision-based image processing when model supports it. Updated knowledge repository and workflow node logic to enforce status=1 checks. Added PyMuPDF dependency.
This commit is contained in:
Timebomb2018
2026-04-24 11:18:50 +08:00
parent 9fdb952396
commit 767eb5e6f2
13 changed files with 397 additions and 52 deletions

View File

@@ -344,6 +344,7 @@ class MultimodalService:
async def process_files(
self,
files: Optional[List[FileInput]],
document_image_recognition: bool = False,
) -> List[Dict[str, Any]]:
"""
处理文件列表,返回 LLM 可用的格式
@@ -379,6 +380,31 @@ class MultimodalService:
elif file.type == FileType.DOCUMENT:
is_support, content = await self._process_document(file, strategy)
result.append(content)
# 仅当开关开启且模型支持视觉时,才提取文档内嵌图片
if document_image_recognition and "vision" in self.capability:
img_infos = await self.extract_document_images(file)
for img_info in img_infos:
page = img_info["page"]
index = img_info["index"]
ext = img_info.get("ext", "png")
try:
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext)
placeholder = f"{page}页 第{index + 1}张图片" if page > 0 else f"{index + 1}张图片"
# 在文本内容中追加图片位置标记
if result and result[-1].get("type") in ("text", "document"):
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}"
# 将图片以视觉格式追加到消息内容中
img_file = FileInput(
type=FileType.IMAGE,
transfer_method=TransferMethod.REMOTE_URL,
url=img_url,
file_type="image/png",
)
_, img_content = await self._process_image(img_file, strategy_class(img_file))
result.append(img_content)
except Exception as img_err:
logger.warning(f"文档图片处理失败: {img_err}")
elif file.type == FileType.AUDIO and "audio" in self.capability:
is_support, content = await self._process_audio(file, strategy)
result.append(content)
@@ -431,12 +457,8 @@ class MultimodalService:
"""
处理文档文件PDF、Word 等)
Args:
file: 文档文件输入
strategy: 格式化策略
Returns:
Dict: 根据 provider 返回不同格式的文档内容
仅返回文本内容(图片通过 process_files 中的额外步骤追加)
"""
if file.transfer_method == TransferMethod.REMOTE_URL:
return True, {
@@ -444,19 +466,63 @@ class MultimodalService:
"text": f"<document url=\"{file.url}\">\n{await self.extract_document_text(file)}\n</document>"
}
else:
# 本地文件,提取文本内容
server_url = settings.FILE_LOCAL_SERVER_URL
file.url = f"{server_url}/storage/permanent/{file.upload_file_id}"
text = await self.extract_document_text(file)
file_metadata = self.db.query(FileMetadata).filter(
FileMetadata.id == file.upload_file_id
).first()
file_name = file_metadata.file_name if file_metadata else "unknown"
# 使用策略格式化文档
return await strategy.format_document(file_name, text)
async def _save_doc_image_to_storage(
self,
img_bytes: bytes,
ext: str,
) -> tuple[str, str]:
"""
将文档内嵌图片保存到存储后端,写入 FileMetadata。
tenant_id / workspace_id 从 api_config 所在的 FileMetadata 上下文获取,
无法获取时使用占位 UUID图片仍可通过 permanent URL 访问)。
Returns:
(file_id_str, permanent_url)
"""
import uuid as _uuid
from app.services.file_storage_service import FileStorageService, generate_file_key
from app.db import get_db_context
file_id = _uuid.uuid4()
file_ext = f".{ext}" if not ext.startswith(".") else ext
content_type = f"image/{ext}"
# tenant_id / workspace_id 尽量从已有 FileMetadata 推断,否则用占位值
placeholder = _uuid.UUID(int=0)
tenant_id = placeholder
workspace_id = placeholder
file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
storage_svc = FileStorageService()
await storage_svc.storage.upload(file_key, img_bytes, content_type)
with get_db_context() as db:
meta = FileMetadata(
id=file_id,
tenant_id=tenant_id,
workspace_id=workspace_id,
file_key=file_key,
file_name=f"doc_image_{file_id}{file_ext}",
file_ext=file_ext,
file_size=len(img_bytes),
content_type=content_type,
status="completed",
)
db.add(meta)
db.commit()
url = f"{settings.FILE_LOCAL_SERVER_URL}/storage/permanent/{file_id}"
return str(file_id), url
async def _process_audio(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
"""
处理音频文件
@@ -582,6 +648,84 @@ class MultimodalService:
logger.error(f"Failed to load file. - {e}")
return "[Failed to load file.]"
async def extract_document_images(self, file: FileInput) -> list[dict]:
"""
提取文档中的内嵌图片(支持 PDF 和 DOCX附带位置信息。
Returns:
list[dict]: 每项包含:
- bytes: 图片二进制
- page: 所在页码PDF 从 1 开始DOCX 为 0
- index: 该页/文档内的图片序号(从 0 开始)
- ext: 图片扩展名(如 png、jpeg
"""
try:
file_content = file.get_content()
if not file_content:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.get(file.url, follow_redirects=True)
response.raise_for_status()
file_content = response.content
file.set_content(file_content)
file_mime_type = magic.from_buffer(file_content, mime=True)
if file_mime_type in PDF_MIME:
return self._extract_pdf_images(file_content)
elif self._is_word_file(file_content, file_mime_type):
return self._extract_docx_images(file_content)
return []
except Exception as e:
logger.error(f"提取文档图片失败: {e}")
return []
@staticmethod
def _extract_pdf_images(file_content: bytes) -> list[dict]:
"""从 PDF 提取内嵌图片,附带页码和序号"""
images = []
try:
import fitz # PyMuPDF
doc = fitz.open(stream=file_content, filetype="pdf")
for page_num, page in enumerate(doc, start=1):
for idx, img in enumerate(page.get_images(full=True)):
xref = img[0]
base_image = doc.extract_image(xref)
images.append({
"bytes": base_image["image"],
"ext": base_image.get("ext", "png"),
"page": page_num,
"index": idx,
})
doc.close()
except ImportError:
logger.warning("PyMuPDF 未安装,无法提取 PDF 图片,请执行: uv add pymupdf")
except Exception as e:
logger.error(f"提取 PDF 图片失败: {e}")
return images
@staticmethod
def _extract_docx_images(file_content: bytes) -> list[dict]:
"""从 DOCX 提取内嵌图片附带序号DOCX 无页码概念page 固定为 0"""
images = []
try:
if file_content[:2] != b'PK':
return []
with zipfile.ZipFile(io.BytesIO(file_content)) as zf:
media_files = sorted(
name for name in zf.namelist()
if name.startswith("word/media/") and not name.endswith("/")
)
for idx, name in enumerate(media_files):
ext = name.rsplit(".", 1)[-1].lower() if "." in name else "png"
images.append({
"bytes": zf.read(name),
"ext": ext,
"page": 0,
"index": idx,
})
except Exception as e:
logger.error(f"提取 DOCX 图片失败: {e}")
return images
@staticmethod
async def _extract_pdf_text(file_content: bytes) -> str:
"""提取 PDF 文本"""