feat(multimodal): support document image extraction and inline vision processing

Added document image extraction for PDF and DOCX files, including page/index metadata and storage integration. Extended `process_files` with a `document_image_recognition` flag that enables vision-based processing of embedded images only when the model supports vision. Updated the knowledge repository and workflow node logic to enforce status=1 checks. Added the PyMuPDF dependency.
2026-04-24 11:18:50 +08:00
parent 9fdb952396
commit 767eb5e6f2
13 changed files with 397 additions and 52 deletions
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -344,6 +344,7 @@ class MultimodalService:
    async def process_files(
            self,
            files: Optional[List[FileInput]],
+            document_image_recognition: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        处理文件列表，返回 LLM 可用的格式
@@ -379,6 +380,31 @@ class MultimodalService:
                elif file.type == FileType.DOCUMENT:
                    is_support, content = await self._process_document(file, strategy)
                    result.append(content)
+                    # 仅当开关开启且模型支持视觉时，才提取文档内嵌图片
+                    if document_image_recognition and "vision" in self.capability:
+                        img_infos = await self.extract_document_images(file)
+                        for img_info in img_infos:
+                            page = img_info["page"]
+                            index = img_info["index"]
+                            ext = img_info.get("ext", "png")
+                            try:
+                                _, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext)
+                                placeholder = f"第{page}页 第{index + 1}张图片" if page > 0 else f"第{index + 1}张图片"
+                                # 在文本内容中追加图片位置标记
+                                if result and result[-1].get("type") in ("text", "document"):
+                                    key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
+                                    result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}"
+                                # 将图片以视觉格式追加到消息内容中
+                                img_file = FileInput(
+                                    type=FileType.IMAGE,
+                                    transfer_method=TransferMethod.REMOTE_URL,
+                                    url=img_url,
+                                    file_type="image/png",
+                                )
+                                _, img_content = await self._process_image(img_file, strategy_class(img_file))
+                                result.append(img_content)
+                            except Exception as img_err:
+                                logger.warning(f"文档图片处理失败: {img_err}")
                elif file.type == FileType.AUDIO and "audio" in self.capability:
                    is_support, content = await self._process_audio(file, strategy)
                    result.append(content)
@@ -431,12 +457,8 @@ class MultimodalService:
        """
        处理文档文件（PDF、Word 等）
        
-        Args:
-            file: 文档文件输入
-            strategy: 格式化策略
-            
        Returns:
-            Dict: 根据 provider 返回不同格式的文档内容
+            仅返回文本内容（图片通过 process_files 中的额外步骤追加）
        """
        if file.transfer_method == TransferMethod.REMOTE_URL:
            return True, {
@@ -444,19 +466,63 @@ class MultimodalService:
                "text": f"<document url=\"{file.url}\">\n{await self.extract_document_text(file)}\n</document>"
            }
        else:
-            # 本地文件，提取文本内容
            server_url = settings.FILE_LOCAL_SERVER_URL
            file.url = f"{server_url}/storage/permanent/{file.upload_file_id}"
            text = await self.extract_document_text(file)
            file_metadata = self.db.query(FileMetadata).filter(
                FileMetadata.id == file.upload_file_id
            ).first()
-
            file_name = file_metadata.file_name if file_metadata else "unknown"
-
-            # 使用策略格式化文档
            return await strategy.format_document(file_name, text)

+    async def _save_doc_image_to_storage(
+            self,
+            img_bytes: bytes,
+            ext: str,
+    ) -> tuple[str, str]:
+        """
+        将文档内嵌图片保存到存储后端，写入 FileMetadata。
+        tenant_id / workspace_id 从 api_config 所在的 FileMetadata 上下文获取，
+        无法获取时使用占位 UUID（图片仍可通过 permanent URL 访问）。
+
+        Returns:
+            (file_id_str, permanent_url)
+        """
+        import uuid as _uuid
+        from app.services.file_storage_service import FileStorageService, generate_file_key
+        from app.db import get_db_context
+
+        file_id = _uuid.uuid4()
+        file_ext = f".{ext}" if not ext.startswith(".") else ext
+        content_type = f"image/{ext}"
+
+        # tenant_id / workspace_id 尽量从已有 FileMetadata 推断，否则用占位值
+        placeholder = _uuid.UUID(int=0)
+        tenant_id = placeholder
+        workspace_id = placeholder
+
+        file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
+        storage_svc = FileStorageService()
+        await storage_svc.storage.upload(file_key, img_bytes, content_type)
+
+        with get_db_context() as db:
+            meta = FileMetadata(
+                id=file_id,
+                tenant_id=tenant_id,
+                workspace_id=workspace_id,
+                file_key=file_key,
+                file_name=f"doc_image_{file_id}{file_ext}",
+                file_ext=file_ext,
+                file_size=len(img_bytes),
+                content_type=content_type,
+                status="completed",
+            )
+            db.add(meta)
+            db.commit()
+
+        url = f"{settings.FILE_LOCAL_SERVER_URL}/storage/permanent/{file_id}"
+        return str(file_id), url
+
    async def _process_audio(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
        """
        处理音频文件
@@ -582,6 +648,84 @@ class MultimodalService:
            logger.error(f"Failed to load file. - {e}")
            return "[Failed to load file.]"

+    async def extract_document_images(self, file: FileInput) -> list[dict]:
+        """
+        提取文档中的内嵌图片（支持 PDF 和 DOCX），附带位置信息。
+
+        Returns:
+            list[dict]: 每项包含:
+                - bytes: 图片二进制
+                - page: 所在页码（PDF 从 1 开始，DOCX 为 0）
+                - index: 该页/文档内的图片序号（从 0 开始）
+                - ext: 图片扩展名（如 png、jpeg）
+        """
+        try:
+            file_content = file.get_content()
+            if not file_content:
+                async with httpx.AsyncClient(timeout=30.0) as client:
+                    response = await client.get(file.url, follow_redirects=True)
+                    response.raise_for_status()
+                    file_content = response.content
+                    file.set_content(file_content)
+
+            file_mime_type = magic.from_buffer(file_content, mime=True)
+            if file_mime_type in PDF_MIME:
+                return self._extract_pdf_images(file_content)
+            elif self._is_word_file(file_content, file_mime_type):
+                return self._extract_docx_images(file_content)
+            return []
+        except Exception as e:
+            logger.error(f"提取文档图片失败: {e}")
+            return []
+
+    @staticmethod
+    def _extract_pdf_images(file_content: bytes) -> list[dict]:
+        """从 PDF 提取内嵌图片，附带页码和序号"""
+        images = []
+        try:
+            import fitz  # PyMuPDF
+            doc = fitz.open(stream=file_content, filetype="pdf")
+            for page_num, page in enumerate(doc, start=1):
+                for idx, img in enumerate(page.get_images(full=True)):
+                    xref = img[0]
+                    base_image = doc.extract_image(xref)
+                    images.append({
+                        "bytes": base_image["image"],
+                        "ext": base_image.get("ext", "png"),
+                        "page": page_num,
+                        "index": idx,
+                    })
+            doc.close()
+        except ImportError:
+            logger.warning("PyMuPDF 未安装，无法提取 PDF 图片，请执行: uv add pymupdf")
+        except Exception as e:
+            logger.error(f"提取 PDF 图片失败: {e}")
+        return images
+
+    @staticmethod
+    def _extract_docx_images(file_content: bytes) -> list[dict]:
+        """从 DOCX 提取内嵌图片，附带序号（DOCX 无页码概念，page 固定为 0）"""
+        images = []
+        try:
+            if file_content[:2] != b'PK':
+                return []
+            with zipfile.ZipFile(io.BytesIO(file_content)) as zf:
+                media_files = sorted(
+                    name for name in zf.namelist()
+                    if name.startswith("word/media/") and not name.endswith("/")
+                )
+                for idx, name in enumerate(media_files):
+                    ext = name.rsplit(".", 1)[-1].lower() if "." in name else "png"
+                    images.append({
+                        "bytes": zf.read(name),
+                        "ext": ext,
+                        "page": 0,
+                        "index": idx,
+                    })
+        except Exception as e:
+            logger.error(f"提取 DOCX 图片失败: {e}")
+        return images
+
    @staticmethod
    async def _extract_pdf_text(file_content: bytes) -> str:
        """提取 PDF 文本"""