feat(multimodal): support tenant-aware document image storage and improve image placeholder labeling

- Pass workspace_id to multimodal_service.process_files at the call sites in app_chat_service and draft_run_service
- Look up tenant_id from the Workspace record in multimodal_service so extracted document images are stored under the correct tenant and workspace
- Update image placeholder format from "[第N页 第M张图片]" to "[图片 第N页 第M张图片]" for clarity
- Add strict URL preservation rules to system prompt for agents handling document images
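- Refactor _save_doc_image_to_storage from a static method into an instance method that accepts explicit tenant_id and workspace_id, replacing the hard-coded placeholder UUIDs it used before (the old docstring described inferring them from FileMetadata, but the code never did)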
This commit is contained in:
Timebomb2018
2026-04-24 15:56:06 +08:00
parent 2c2551e15c
commit 74be09340c
3 changed files with 23 additions and 17 deletions

View File

@@ -170,7 +170,8 @@ class AppChatService:
fu_config = fu_config.model_dump() fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files( processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
) )
logger.info(f"处理了 {len(processed_files)} 个文件") logger.info(f"处理了 {len(processed_files)} 个文件")
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any( if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
@@ -462,7 +463,8 @@ class AppChatService:
fu_config = fu_config.model_dump() fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files( processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
) )
logger.info(f"处理了 {len(processed_files)} 个文件") logger.info(f"处理了 {len(processed_files)} 个文件")
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any( if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(

View File

@@ -655,7 +655,8 @@ class AgentRunService:
fu_config = fu_config.model_dump() fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files( processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
) )
logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}") logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}")
capability = api_key_config.get("capability", []) capability = api_key_config.get("capability", [])
@@ -936,7 +937,8 @@ class AgentRunService:
fu_config = fu_config.model_dump() fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False) doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files( processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
) )
logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}") logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}")
capability = api_key_config.get("capability", []) capability = api_key_config.get("capability", [])
@@ -947,8 +949,11 @@ class AgentRunService:
) )
if has_doc_with_images: if has_doc_with_images:
agent.system_prompt += ( agent.system_prompt += (
"\n\n文档中包含图片,图片位置已在文本中以 [第N页 第M张图片]: URL 标记。" "\n\n文档中包含图片,图片位置已在文本中以 [图片 第N页 第M张图片]: URL 标记。"
"请在回答中用 Markdown 格式 ![描述](URL) 展示相关图片,做到图文并茂。" "请在回答中用 Markdown 格式 ![描述](URL) 展示相关图片,做到图文并茂。"
"**规则1图片URL必须原封不动、一字不差地复制禁止修改、禁止省略任何字符**"
"**规则2禁止修改URL中UUID里的任何数字和字母**"
"**规则3直接使用 ![描述](完整URL) 格式输出**"
) )
agent.agent = create_agent( agent.agent = create_agent(
model=agent.llm, model=agent.llm,

View File

@@ -24,6 +24,7 @@ import chardet
import httpx import httpx
import magic import magic
import openpyxl import openpyxl
import uuid
from docx import Document from docx import Document
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@@ -344,6 +345,7 @@ class MultimodalService:
async def process_files( async def process_files(
self, self,
files: Optional[List[FileInput]], files: Optional[List[FileInput]],
workspace_id: uuid.UUID = None,
document_image_recognition: bool = False, document_image_recognition: bool = False,
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
@@ -383,17 +385,20 @@ class MultimodalService:
# 仅当开关开启且模型支持视觉时,才提取文档内嵌图片 # 仅当开关开启且模型支持视觉时,才提取文档内嵌图片
if document_image_recognition and "vision" in self.capability: if document_image_recognition and "vision" in self.capability:
img_infos = await self.extract_document_images(file) img_infos = await self.extract_document_images(file)
from app.models.workspace_model import Workspace as WorkspaceModel
ws = self.db.query(WorkspaceModel).filter(WorkspaceModel.id == workspace_id).first()
tenant_id = ws.tenant_id if ws else None
for img_info in img_infos: for img_info in img_infos:
page = img_info["page"] page = img_info["page"]
index = img_info["index"] index = img_info["index"]
ext = img_info.get("ext", "png") ext = img_info.get("ext", "png")
try: try:
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext) _, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext, tenant_id, workspace_id)
placeholder = f"{page}页 第{index + 1}张图片" if page > 0 else f"{index + 1}张图片" placeholder = f"{page}页 第{index + 1}张图片" if page > 0 else f"{index + 1}张图片"
# 在文本内容中追加图片位置标记 # 在文本内容中追加图片位置标记
if result and result[-1].get("type") in ("text", "document"): if result and result[-1].get("type") in ("text", "document"):
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1] key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}" result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]: {img_url}"
# 将图片以视觉格式追加到消息内容中 # 将图片以视觉格式追加到消息内容中
img_file = FileInput( img_file = FileInput(
type=FileType.IMAGE, type=FileType.IMAGE,
@@ -475,32 +480,26 @@ class MultimodalService:
file_name = file_metadata.file_name if file_metadata else "unknown" file_name = file_metadata.file_name if file_metadata else "unknown"
return await strategy.format_document(file_name, text) return await strategy.format_document(file_name, text)
@staticmethod
async def _save_doc_image_to_storage( async def _save_doc_image_to_storage(
self,
img_bytes: bytes, img_bytes: bytes,
ext: str, ext: str,
tenant_id: uuid.UUID,
workspace_id: uuid.UUID,
) -> tuple[str, str]: ) -> tuple[str, str]:
""" """
将文档内嵌图片保存到存储后端,写入 FileMetadata。 将文档内嵌图片保存到存储后端,写入 FileMetadata。
tenant_id / workspace_id 从 api_config 所在的 FileMetadata 上下文获取,
无法获取时使用占位 UUID图片仍可通过 permanent URL 访问)。
Returns: Returns:
(file_id_str, permanent_url) (file_id_str, permanent_url)
""" """
import uuid as _uuid
from app.services.file_storage_service import FileStorageService, generate_file_key from app.services.file_storage_service import FileStorageService, generate_file_key
from app.db import get_db_context from app.db import get_db_context
file_id = _uuid.uuid4() file_id = uuid.uuid4()
file_ext = f".{ext}" if not ext.startswith(".") else ext file_ext = f".{ext}" if not ext.startswith(".") else ext
content_type = f"image/{ext}" content_type = f"image/{ext}"
# tenant_id / workspace_id 尽量从已有 FileMetadata 推断,否则用占位值
placeholder = _uuid.UUID(int=0)
tenant_id = placeholder
workspace_id = placeholder
file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext) file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
storage_svc = FileStorageService() storage_svc = FileStorageService()
await storage_svc.storage.upload(file_key, img_bytes, content_type) await storage_svc.storage.upload(file_key, img_bytes, content_type)