Merge pull request #994 from SuanmoSuanyangTechnology/feature/agent-tool_xjn

feat(multimodal)
This commit is contained in:
山程漫悟
2026-04-24 16:25:10 +08:00
committed by GitHub
3 changed files with 23 additions and 17 deletions

View File

@@ -170,7 +170,8 @@ class AppChatService:
fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition
files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
)
logger.info(f"处理了 {len(processed_files)} 个文件")
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(
@@ -462,7 +463,8 @@ class AppChatService:
fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition
files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
)
logger.info(f"处理了 {len(processed_files)} 个文件")
if doc_img_recognition and "vision" in (api_key_obj.capability or []) and any(

View File

@@ -655,7 +655,8 @@ class AgentRunService:
fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition
files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
)
logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}")
capability = api_key_config.get("capability", [])
@@ -936,7 +937,8 @@ class AgentRunService:
fu_config = fu_config.model_dump()
doc_img_recognition = isinstance(fu_config, dict) and fu_config.get("document_image_recognition", False)
processed_files = await multimodal_service.process_files(
files, document_image_recognition=doc_img_recognition
files, document_image_recognition=doc_img_recognition,
workspace_id=workspace_id
)
logger.info(f"处理了 {len(processed_files)} 个文件provider={provider}")
capability = api_key_config.get("capability", [])
@@ -947,8 +949,11 @@ class AgentRunService:
)
if has_doc_with_images:
agent.system_prompt += (
"\n\n文档中包含图片,图片位置已在文本中以 [第N页 第M张图片]: URL 标记。"
"\n\n文档中包含图片,图片位置已在文本中以 [图片 第N页 第M张图片]: URL 标记。"
"请在回答中用 Markdown 格式 ![描述](URL) 展示相关图片,做到图文并茂。"
"**规则1图片URL必须原封不动、一字不差地复制禁止修改、禁止省略任何字符**"
"**规则2禁止修改URL中UUID里的任何数字和字母**"
"**规则3直接使用 ![描述](完整URL) 格式输出**"
)
agent.agent = create_agent(
model=agent.llm,

View File

@@ -24,6 +24,7 @@ import chardet
import httpx
import magic
import openpyxl
import uuid
from docx import Document
from sqlalchemy.orm import Session
@@ -344,6 +345,7 @@ class MultimodalService:
async def process_files(
self,
files: Optional[List[FileInput]],
workspace_id: uuid.UUID = None,
document_image_recognition: bool = False,
) -> List[Dict[str, Any]]:
"""
@@ -383,17 +385,20 @@ class MultimodalService:
# 仅当开关开启且模型支持视觉时,才提取文档内嵌图片
if document_image_recognition and "vision" in self.capability:
img_infos = await self.extract_document_images(file)
from app.models.workspace_model import Workspace as WorkspaceModel
ws = self.db.query(WorkspaceModel).filter(WorkspaceModel.id == workspace_id).first()
tenant_id = ws.tenant_id if ws else None
for img_info in img_infos:
page = img_info["page"]
index = img_info["index"]
ext = img_info.get("ext", "png")
try:
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext)
_, img_url = await self._save_doc_image_to_storage(img_info["bytes"], ext, tenant_id, workspace_id)
placeholder = f"{page}页 第{index + 1}张图片" if page > 0 else f"{index + 1}张图片"
# 在文本内容中追加图片位置标记
if result and result[-1].get("type") in ("text", "document"):
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
result[-1][key] = result[-1].get(key, "") + f"\n[{placeholder}]: {img_url}"
result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]: {img_url}"
# 将图片以视觉格式追加到消息内容中
img_file = FileInput(
type=FileType.IMAGE,
@@ -475,32 +480,26 @@ class MultimodalService:
file_name = file_metadata.file_name if file_metadata else "unknown"
return await strategy.format_document(file_name, text)
@staticmethod
async def _save_doc_image_to_storage(
self,
img_bytes: bytes,
ext: str,
tenant_id: uuid.UUID,
workspace_id: uuid.UUID,
) -> tuple[str, str]:
"""
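Save an image embedded in a document to the storage backend and write a FileMetadata record.
tenant_id / workspace_id are supplied by the caller and are used to build the storage key.
NOTE(review): the earlier wording said these IDs were inferred from the FileMetadata context, falling back to a placeholder UUID (the image still being reachable via its permanent URL) — confirm what should happen when the caller passes tenant_id=None.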
Returns:
(file_id_str, permanent_url)
"""
import uuid as _uuid
from app.services.file_storage_service import FileStorageService, generate_file_key
from app.db import get_db_context
file_id = _uuid.uuid4()
file_id = uuid.uuid4()
file_ext = f".{ext}" if not ext.startswith(".") else ext
content_type = f"image/{ext}"
# tenant_id / workspace_id 尽量从已有 FileMetadata 推断,否则用占位值
placeholder = _uuid.UUID(int=0)
tenant_id = placeholder
workspace_id = placeholder
file_key = generate_file_key(tenant_id, workspace_id, file_id, file_ext)
storage_svc = FileStorageService()
await storage_svc.storage.upload(file_key, img_bytes, content_type)