feat(multimodal): support tenant-aware document image storage and improve image placeholder labeling
- Pass workspace_id to multimodal_service.process_files in app_chat_service and draft_run_service
- Fetch tenant_id from the workspace in multimodal_service so that file storage is scoped to the correct tenant
- Update the image placeholder format from "[第N页 第M张图片]" to "[图片 第N页 第M张图片]" for clarity
- Add strict URL preservation rules to the system prompt for agents handling document images
- Refactor _save_doc_image_to_storage to accept explicit tenant_id and workspace_id instead of inferring them from FileMetadata
This commit is contained in:
@@ -170,7 +170,8 @@ class AppChatService:
|
|||||||
fu_config = fu_config.model_dump()
|
fu_config = fu_config.model_dump()
|
||||||
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
||||||
processed_files = await multimodal_service.process_files(
|
processed_files = await multimodal_service.process_files(
|
||||||
files, document_image_recognition=doc_img_recognition
|
files, document_image_recognition=doc_img_recognition,
|
||||||
|
workspace_id=workspace_id
|
||||||
)
|
)
|
||||||
logger.info(f"处理了 {len(processed_files)} 个文件")
|
logger.info(f"处理了 {len(processed_files)} 个文件")
|
||||||
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
|
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
|
||||||
@@ -462,7 +463,8 @@ class AppChatService:
|
|||||||
fu_config = fu_config.model_dump()
|
fu_config = fu_config.model_dump()
|
||||||
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
||||||
processed_files = await multimodal_service.process_files(
|
processed_files = await multimodal_service.process_files(
|
||||||
files, document_image_recognition=doc_img_recognition
|
files, document_image_recognition=doc_img_recognition,
|
||||||
|
workspace_id=workspace_id
|
||||||
)
|
)
|
||||||
logger.info(f"处理了 {len(processed_files)} 个文件")
|
logger.info(f"处理了 {len(processed_files)} 个文件")
|
||||||
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
|
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
|
||||||
|
|||||||
@@ -655,7 +655,8 @@ class AgentRunService:
|
|||||||
fu_config = fu_config.model_dump()
|
fu_config = fu_config.model_dump()
|
||||||
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
||||||
processed_files = await multimodal_service.process_files(
|
processed_files = await multimodal_service.process_files(
|
||||||
files, document_image_recognition=doc_img_recognition
|
files, document_image_recognition=doc_img_recognition,
|
||||||
|
workspace_id=workspace_id
|
||||||
)
|
)
|
||||||
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
|
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
|
||||||
capability = api_key_config.get("capability", [])
|
capability = api_key_config.get("capability", [])
|
||||||
@@ -936,7 +937,8 @@ class AgentRunService:
|
|||||||
fu_config = fu_config.model_dump()
|
fu_config = fu_config.model_dump()
|
||||||
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
|
||||||
processed_files = await multimodal_service.process_files(
|
processed_files = await multimodal_service.process_files(
|
||||||
files, document_image_recognition=doc_img_recognition
|
files, document_image_recognition=doc_img_recognition,
|
||||||
|
workspace_id=workspace_id
|
||||||
)
|
)
|
||||||
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
|
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
|
||||||
capability = api_key_config.get("capability", [])
|
capability = api_key_config.get("capability", [])
|
||||||
@@ -947,8 +949,11 @@ class AgentRunService:
|
|||||||
)
|
)
|
||||||
if has_doc_with_images:
|
if has_doc_with_images:
|
||||||
agent.system_prompt += (
|
agent.system_prompt += (
|
||||||
"\n\n文档中包含图片,图片位置已在文本中以 [第N页 第M张图片]: URL 标记。"
|
"\n\n文档中包含图片,图片位置已在文本中以 [图片 第N页 第M张图片]: URL 标记。"
|
||||||
"请在回答中用 Markdown 格式  展示相关图片,做到图文并茂。"
|
"请在回答中用 Markdown 格式  展示相关图片,做到图文并茂。"
|
||||||
|
"**规则1:图片URL必须原封不动、一字不差地复制,禁止修改、禁止省略任何字符**"
|
||||||
|
"**规则2:禁止修改URL中UUID里的任何数字和字母**"
|
||||||
|
"**规则3:直接使用  格式输出**"
|
||||||
)
|
)
|
||||||
agent.agent = create_agent(
|
agent.agent = create_agent(
|
||||||
model=agent.llm,
|
model=agent.llm,
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import chardet
|
|||||||
import httpx
|
import httpx
|
||||||
import magic
|
import magic
|
||||||
import openpyxl
|
import openpyxl
|
||||||
|
import uuid
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
@@ -344,6 +345,7 @@ class MultimodalService:
|
|||||||
async def process_files(
|
async def process_files(
|
||||||
self,
|
self,
|
||||||
files: Optional[List[FileInput]],
|
files: Optional[List[FileInput]],
|
||||||
|
workspace_id: uuid.UUID = None,
|
||||||
document_image_recognition: bool = False,
|
document_image_recognition: bool = False,
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
@@ -383,17 +385,20 @@ class MultimodalService:
|
|||||||
# 仅当开关开启且模型支持视觉时,才提取文档内嵌图片
|
# 仅当开关开启且模型支持视觉时,才提取文档内嵌图片
|
||||||
if document_image_recognition and "vision" in self.capability:
|
if document_image_recognition and "vision" in self.capability:
|
||||||
img_infos = await self.extract_document_images(file)
|
img_infos = await self.extract_document_images(file)
|
||||||
|
from app.models.workspace_model import Workspace as WorkspaceModel
|
||||||
|
ws = self.db.query(WorkspaceModel).filter(WorkspaceModel.id == workspace_id).first()
|
||||||
|
tenant_id = ws.tenant_id if ws else None
|
||||||
for img_info in img_infos:
|
for img_info in img_infos:
|
||||||
page = img_info["page"]
|
page = img_info["page"]
|
||||||
index = img_info["index"]
|
index = img_info["index"]
|
||||||
ext = img_info.get("ext", "png")
|
ext = img_info.get("ext", "png")
|
||||||
try:
|
try:
|
||||||
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext)
|
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext, tenant_id, workspace_id)
|
||||||
placeholder = f"第{page}页 第{index + 1}张图片" if page > 0 else f"第{index + 1}张图片"
|
placeholder = f"第{page}页 第{index + 1}张图片" if page > 0 else f"第{index + 1}张图片"
|
||||||
# 在文本内容中追加图片位置标记
|
# 在文本内容中追加图片位置标记
|
||||||
if result and result[-1].get("type") in ("text", "document"):
|
if result and result[-1].get("type") in ("text", "document"):
|
||||||
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
|
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
|
||||||
result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}"
|
result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]: {img_url}"
|
||||||
# 将图片以视觉格式追加到消息内容中
|
# 将图片以视觉格式追加到消息内容中
|
||||||
img_file = FileInput(
|
img_file = FileInput(
|
||||||
type=FileType.IMAGE,
|
type=FileType.IMAGE,
|
||||||
@@ -475,32 +480,26 @@ class MultimodalService:
|
|||||||
file_name = file_metadata.file_name if file_metadata else "unknown"
|
file_name = file_metadata.file_name if file_metadata else "unknown"
|
||||||
return await strategy.format_document(file_name, text)
|
return await strategy.format_document(file_name, text)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
async def _save_doc_image_to_storage(
|
async def _save_doc_image_to_storage(
|
||||||
self,
|
|
||||||
img_bytes: bytes,
|
img_bytes: bytes,
|
||||||
ext: str,
|
ext: str,
|
||||||
|
tenant_id: uuid.UUID,
|
||||||
|
workspace_id: uuid.UUID,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
将文档内嵌图片保存到存储后端,写入 FileMetadata。
|
将文档内嵌图片保存到存储后端,写入 FileMetadata。
|
||||||
tenant_id / workspace_id 从 api_config 所在的 FileMetadata 上下文获取,
|
|
||||||
无法获取时使用占位 UUID(图片仍可通过 permanent URL 访问)。
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
(file_id_str, permanent_url)
|
(file_id_str, permanent_url)
|
||||||
"""
|
"""
|
||||||
import uuid as _uuid
|
|
||||||
from app.services.file_storage_service import FileStorageService, generate_file_key
|
from app.services.file_storage_service import FileStorageService, generate_file_key
|
||||||
from app.db import get_db_context
|
from app.db import get_db_context
|
||||||
|
|
||||||
file_id = _uuid.uuid4()
|
file_id = uuid.uuid4()
|
||||||
file_ext = f".{ext}" if not ext.startswith(".") else ext
|
file_ext = f".{ext}" if not ext.startswith(".") else ext
|
||||||
content_type = f"image/{ext}"
|
content_type = f"image/{ext}"
|
||||||
|
|
||||||
# tenant_id / workspace_id 尽量从已有 FileMetadata 推断,否则用占位值
|
|
||||||
placeholder = _uuid.UUID(int=0)
|
|
||||||
tenant_id = placeholder
|
|
||||||
workspace_id = placeholder
|
|
||||||
|
|
||||||
file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
|
file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
|
||||||
storage_svc = FileStorageService()
|
storage_svc = FileStorageService()
|
||||||
await storage_svc.storage.upload(file_key, img_bytes, content_type)
|
await storage_svc.storage.upload(file_key, img_bytes, content_type)
|
||||||
|
|||||||
Reference in New Issue
Block a user