From 74be09340cd697ef17d067d66ae2c803faf8c609 Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Fri, 24 Apr 2026 15:56:06 +0800 Subject: [PATCH] feat(multimodal): support tenant-aware document image storage and improve image placeholder labeling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pass workspace_id to multimodal_service.process_files across app_chat_service, draft_run_service - Fetch tenant_id from workspace in multimodal_service for proper file storage scoping - Update image placeholder format from "[第N页 第M张图片]" to "[图片 第N页 第M张图片]" for clarity - Add strict URL preservation rules to system prompt for agents handling document images - Refactor _save_doc_image_to_storage to accept explicit tenant_id and workspace_id instead of inferring from FileMetadata --- api/app/services/app_chat_service.py | 6 ++++-- api/app/services/draft_run_service.py | 11 ++++++++--- api/app/services/multimodal_service.py | 23 +++++++++++------------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py index bc5ae964..407a1cfc 100644 --- a/api/app/services/app_chat_service.py +++ b/api/app/services/app_chat_service.py @@ -170,7 +170,8 @@ class AppChatService: fu_config = fu_config.model_dump() doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) processed_files = await multimodal_service.process_files( - files, document_image_recognition=doc_img_recognition + files, document_image_recognition=doc_img_recognition, + workspace_id=workspace_id ) logger.info(f"处理了 {len(processed_files)} 个文件") if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any( @@ -462,7 +463,8 @@ class AppChatService: fu_config = fu_config.model_dump() doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) processed_files = await multimodal_service.process_files( - files, document_image_recognition=doc_img_recognition + files, document_image_recognition=doc_img_recognition, + workspace_id=workspace_id ) logger.info(f"处理了 {len(processed_files)} 个文件") if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any( diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py index 2869326f..9d04369b 100644 --- a/api/app/services/draft_run_service.py +++ b/api/app/services/draft_run_service.py @@ -655,7 +655,8 @@ class AgentRunService: fu_config = fu_config.model_dump() doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) processed_files = await multimodal_service.process_files( - files, document_image_recognition=doc_img_recognition + files, document_image_recognition=doc_img_recognition, + workspace_id=workspace_id ) logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}") capability = api_key_config.get("capability", []) @@ -936,7 +937,8 @@ class AgentRunService: fu_config = fu_config.model_dump() doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) processed_files = await multimodal_service.process_files( - files, document_image_recognition=doc_img_recognition + files, document_image_recognition=doc_img_recognition, + workspace_id=workspace_id ) logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}") capability = api_key_config.get("capability", []) @@ -947,8 +949,11 @@ class AgentRunService: ) if has_doc_with_images: agent.system_prompt += ( - "\n\n文档中包含图片,图片位置已在文本中以 [第N页 第M张图片]: URL 标记。" + "\n\n文档中包含图片,图片位置已在文本中以 [图片 第N页 第M张图片]: URL 标记。" "请在回答中用 Markdown 格式 ![描述](URL) 展示相关图片,做到图文并茂。" + "**规则1:图片URL必须原封不动、一字不差地复制,禁止修改、禁止省略任何字符**" + "**规则2:禁止修改URL中UUID里的任何数字和字母**" + "**规则3:直接使用 ![描述](完整URL) 格式输出**" ) agent.agent = create_agent( model=agent.llm, diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index 946649c3..960bf6bb 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -24,6 +24,7 @@ import chardet import httpx import magic import openpyxl +import uuid from docx import Document from sqlalchemy.orm import Session @@ -344,6 +345,7 @@ class MultimodalService: async def process_files( self, files: Optional[List[FileInput]], + workspace_id: uuid.UUID = None, document_image_recognition: bool = False, ) -> List[Dict[str, Any]]: """ @@ -383,17 +385,20 @@ class MultimodalService: # 仅当开关开启且模型支持视觉时,才提取文档内嵌图片 if document_image_recognition and "vision" in self.capability: img_infos = await self.extract_document_images(file) + from app.models.workspace_model import Workspace as WorkspaceModel + ws = self.db.query(WorkspaceModel).filter(WorkspaceModel.id == workspace_id).first() + tenant_id = ws.tenant_id if ws else None for img_info in img_infos: page = img_info["page"] index = img_info["index"] ext = img_info.get("ext", "png") try: - _, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext) + _, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext, tenant_id, workspace_id) placeholder = f"第{page}页 第{index + 1}张图片" if page > 0 else f"第{index + 1}张图片" # 在文本内容中追加图片位置标记 if result and result[-1].get("type") in ("text", "document"): key = "text" if "text" in result[-1] else list(result[-1].keys())[-1] - result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}" + result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]: {img_url}" # 将图片以视觉格式追加到消息内容中 img_file = FileInput( type=FileType.IMAGE, @@ -475,32 +480,26 @@ class MultimodalService: file_name = file_metadata.file_name if file_metadata else "unknown" return await strategy.format_document(file_name, text) + @staticmethod async def _save_doc_image_to_storage( - self, img_bytes: bytes, ext: str, + tenant_id: uuid.UUID, + workspace_id: uuid.UUID, ) -> tuple[str, str]: """ 将文档内嵌图片保存到存储后端,写入 FileMetadata。 - tenant_id / workspace_id 从 api_config 所在的 FileMetadata 上下文获取, - 无法获取时使用占位 UUID(图片仍可通过 permanent URL 访问)。 Returns: (file_id_str, permanent_url) """ - import uuid as _uuid from app.services.file_storage_service import FileStorageService, generate_file_key from app.db import get_db_context - file_id = _uuid.uuid4() + file_id = uuid.uuid4() file_ext = f".{ext}" if not ext.startswith(".") else ext content_type = f"image/{ext}" - # tenant_id / workspace_id 尽量从已有 FileMetadata 推断,否则用占位值 - placeholder = _uuid.UUID(int=0) - tenant_id = placeholder - workspace_id = placeholder - file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext) storage_svc = FileStorageService() await storage_svc.storage.upload(file_key, img_bytes, content_type)