feat(workflow, skill): add multimodal image support to workflows and skill prompt generation
This commit is contained in:
@@ -1089,7 +1089,7 @@ class DraftRunService:
|
||||
|
||||
except Exception as e:
|
||||
# 对于多 Agent 应用,没有直接的 AgentConfig 是正常的
|
||||
logger.debug("获取配置快照失败(可能是多 Agent 应用)", extra={"error": str(e)})
|
||||
logger.debug("获取配置快照失败(可能是多 Agent 应用)", exc_info=True, extra={"error": str(e)})
|
||||
return {}
|
||||
|
||||
def _replace_variables(
|
||||
|
||||
@@ -23,7 +23,7 @@ logger = get_business_logger()
|
||||
|
||||
class ImageFormatStrategy(Protocol):
|
||||
"""图片格式策略接口"""
|
||||
|
||||
|
||||
async def format_image(self, url: str) -> Dict[str, Any]:
|
||||
"""将图片 URL 转换为特定 provider 的格式"""
|
||||
...
|
||||
@@ -31,7 +31,7 @@ class ImageFormatStrategy(Protocol):
|
||||
|
||||
class DashScopeImageStrategy:
|
||||
"""通义千问图片格式策略"""
|
||||
|
||||
|
||||
async def format_image(self, url: str) -> Dict[str, Any]:
|
||||
"""通义千问格式: {"type": "image", "image": "url"}"""
|
||||
return {
|
||||
@@ -42,7 +42,7 @@ class DashScopeImageStrategy:
|
||||
|
||||
class BedrockImageStrategy:
|
||||
"""Bedrock/Anthropic 图片格式策略"""
|
||||
|
||||
|
||||
async def format_image(self, url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Bedrock/Anthropic 格式: base64 编码
|
||||
@@ -51,17 +51,17 @@ class BedrockImageStrategy:
|
||||
import httpx
|
||||
import base64
|
||||
from mimetypes import guess_type
|
||||
|
||||
|
||||
logger.info(f"下载并编码图片: {url}")
|
||||
|
||||
|
||||
# 下载图片
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
# 获取图片数据
|
||||
image_data = response.content
|
||||
|
||||
|
||||
# 确定 media type
|
||||
content_type = response.headers.get("content-type")
|
||||
if content_type and content_type.startswith("image/"):
|
||||
@@ -69,12 +69,12 @@ class BedrockImageStrategy:
|
||||
else:
|
||||
guessed_type, _ = guess_type(url)
|
||||
media_type = guessed_type if guessed_type and guessed_type.startswith("image/") else "image/jpeg"
|
||||
|
||||
|
||||
# 转换为 base64
|
||||
base64_data = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
|
||||
logger.info(f"图片编码完成: media_type={media_type}, size={len(base64_data)}")
|
||||
|
||||
|
||||
return {
|
||||
"type": "image",
|
||||
"source": {
|
||||
@@ -87,7 +87,7 @@ class BedrockImageStrategy:
|
||||
|
||||
class OpenAIImageStrategy:
|
||||
"""OpenAI 图片格式策略"""
|
||||
|
||||
|
||||
async def format_image(self, url: str) -> Dict[str, Any]:
|
||||
"""OpenAI 格式: {"type": "image_url", "image_url": {"url": "..."}}"""
|
||||
return {
|
||||
@@ -109,7 +109,7 @@ PROVIDER_STRATEGIES = {
|
||||
|
||||
class MultimodalService:
|
||||
"""多模态文件处理服务"""
|
||||
|
||||
|
||||
def __init__(self, db: Session, provider: str = "dashscope"):
|
||||
"""
|
||||
初始化多模态服务
|
||||
@@ -120,10 +120,10 @@ class MultimodalService:
|
||||
"""
|
||||
self.db = db
|
||||
self.provider = provider.lower()
|
||||
|
||||
|
||||
async def process_files(
|
||||
self,
|
||||
files: Optional[List[FileInput]]
|
||||
self,
|
||||
files: Optional[List[FileInput]]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
处理文件列表,返回 LLM 可用的格式
|
||||
@@ -136,7 +136,7 @@ class MultimodalService:
|
||||
"""
|
||||
if not files:
|
||||
return []
|
||||
|
||||
|
||||
result = []
|
||||
for idx, file in enumerate(files):
|
||||
try:
|
||||
@@ -168,10 +168,10 @@ class MultimodalService:
|
||||
"type": "text",
|
||||
"text": f"[文件处理失败: {str(e)}]"
|
||||
})
|
||||
|
||||
|
||||
logger.info(f"成功处理 {len(result)}/{len(files)} 个文件,provider={self.provider}")
|
||||
return result
|
||||
|
||||
|
||||
async def _process_image(self, file: FileInput) -> Dict[str, Any]:
|
||||
"""
|
||||
处理图片文件
|
||||
@@ -184,14 +184,10 @@ class MultimodalService:
|
||||
- Anthropic/Bedrock: {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}}
|
||||
- 通义千问: {"type": "image", "image": "url"}
|
||||
"""
|
||||
if file.transfer_method == TransferMethod.REMOTE_URL:
|
||||
url = file.url
|
||||
else:
|
||||
# 本地文件,获取访问 URL
|
||||
url = await self._get_file_url(file.upload_file_id)
|
||||
|
||||
url = await self.get_file_url(file)
|
||||
|
||||
logger.debug(f"处理图片: {url}, provider={self.provider}")
|
||||
|
||||
|
||||
# 根据 provider 返回不同格式
|
||||
if self.provider in ["bedrock", "anthropic"]:
|
||||
# Anthropic/Bedrock 只支持 base64 格式,需要下载并转换
|
||||
@@ -223,7 +219,7 @@ class MultimodalService:
|
||||
"type": "image",
|
||||
"image": url
|
||||
}
|
||||
|
||||
|
||||
async def _download_and_encode_image(self, url: str) -> tuple[str, str]:
|
||||
"""
|
||||
下载图片并转换为 base64
|
||||
@@ -237,15 +233,15 @@ class MultimodalService:
|
||||
import httpx
|
||||
import base64
|
||||
from mimetypes import guess_type
|
||||
|
||||
|
||||
# 下载图片
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
# 获取图片数据
|
||||
image_data = response.content
|
||||
|
||||
|
||||
# 确定 media type
|
||||
content_type = response.headers.get("content-type")
|
||||
if content_type and content_type.startswith("image/"):
|
||||
@@ -254,14 +250,14 @@ class MultimodalService:
|
||||
# 从 URL 推断
|
||||
guessed_type, _ = guess_type(url)
|
||||
media_type = guessed_type if guessed_type and guessed_type.startswith("image/") else "image/jpeg"
|
||||
|
||||
|
||||
# 转换为 base64
|
||||
base64_data = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
|
||||
logger.debug(f"图片编码完成: media_type={media_type}, size={len(base64_data)}")
|
||||
|
||||
|
||||
return base64_data, media_type
|
||||
|
||||
|
||||
async def _process_document(self, file: FileInput) -> Dict[str, Any]:
|
||||
"""
|
||||
处理文档文件(PDF、Word 等)
|
||||
@@ -284,14 +280,14 @@ class MultimodalService:
|
||||
generic_file = self.db.query(GenericFile).filter(
|
||||
GenericFile.id == file.upload_file_id
|
||||
).first()
|
||||
|
||||
|
||||
file_name = generic_file.file_name if generic_file else "unknown"
|
||||
|
||||
|
||||
return {
|
||||
"type": "text",
|
||||
"text": f"<document name=\"{file_name}\">\n{text}\n</document>"
|
||||
}
|
||||
|
||||
|
||||
async def _process_audio(self, file: FileInput) -> Dict[str, Any]:
|
||||
"""
|
||||
处理音频文件
|
||||
@@ -307,7 +303,7 @@ class MultimodalService:
|
||||
"type": "text",
|
||||
"text": "[音频文件,暂不支持处理]"
|
||||
}
|
||||
|
||||
|
||||
async def _process_video(self, file: FileInput) -> Dict[str, Any]:
|
||||
"""
|
||||
处理视频文件
|
||||
@@ -323,13 +319,13 @@ class MultimodalService:
|
||||
"type": "text",
|
||||
"text": "[视频文件,暂不支持处理]"
|
||||
}
|
||||
|
||||
async def _get_file_url(self, file_id: uuid.UUID) -> str:
|
||||
|
||||
async def get_file_url(self, file: FileInput) -> str:
|
||||
"""
|
||||
获取文件的访问 URL
|
||||
|
||||
Args:
|
||||
file_id: 文件ID
|
||||
file: FileInput describing the file; either a remote URL (returned as-is) or an uploaded file referenced by upload_file_id
|
||||
|
||||
Returns:
|
||||
str: 文件访问 URL
|
||||
@@ -337,26 +333,31 @@ class MultimodalService:
|
||||
Raises:
|
||||
BusinessException: 文件不存在
|
||||
"""
|
||||
generic_file = self.db.query(GenericFile).filter(
|
||||
GenericFile.id == file_id,
|
||||
GenericFile.status == "active"
|
||||
).first()
|
||||
|
||||
if not generic_file:
|
||||
raise BusinessException(
|
||||
f"文件不存在或已删除: {file_id}",
|
||||
BizCode.NOT_FOUND
|
||||
)
|
||||
|
||||
# 如果有 access_url,直接返回
|
||||
if generic_file.access_url:
|
||||
return generic_file.access_url
|
||||
|
||||
# 否则,根据 storage_path 生成 URL
|
||||
# TODO: 根据实际存储方式生成 URL(本地存储、OSS 等)
|
||||
# 这里暂时返回一个占位 URL
|
||||
return f"/api/files/{file_id}/download"
|
||||
|
||||
if file.transfer_method == TransferMethod.REMOTE_URL:
|
||||
return file.url
|
||||
else:
|
||||
# 本地文件,获取访问 URL
|
||||
file_id = file.upload_file_id
|
||||
generic_file = self.db.query(GenericFile).filter(
|
||||
GenericFile.id == file.upload_file_id,
|
||||
GenericFile.status == "active"
|
||||
).first()
|
||||
|
||||
if not generic_file:
|
||||
raise BusinessException(
|
||||
f"文件不存在或已删除: {file.upload_file_id}",
|
||||
BizCode.NOT_FOUND
|
||||
)
|
||||
|
||||
# 如果有 access_url,直接返回
|
||||
if generic_file.access_url:
|
||||
return generic_file.access_url
|
||||
|
||||
# 否则,根据 storage_path 生成 URL
|
||||
# TODO: 根据实际存储方式生成 URL(本地存储、OSS 等)
|
||||
# 这里暂时返回一个占位 URL
|
||||
return f"/api/files/{file_id}/download"
|
||||
|
||||
async def _extract_document_text(self, file_id: uuid.UUID) -> str:
|
||||
"""
|
||||
提取文档文本内容
|
||||
@@ -371,20 +372,20 @@ class MultimodalService:
|
||||
GenericFile.id == file_id,
|
||||
GenericFile.status == "active"
|
||||
).first()
|
||||
|
||||
|
||||
if not generic_file:
|
||||
raise BusinessException(
|
||||
f"文件不存在或已删除: {file_id}",
|
||||
BizCode.NOT_FOUND
|
||||
)
|
||||
|
||||
|
||||
# TODO: 根据文件类型提取文本
|
||||
# - PDF: 使用 PyPDF2 或 pdfplumber
|
||||
# - Word: 使用 python-docx
|
||||
# - TXT/MD: 直接读取
|
||||
|
||||
|
||||
file_ext = generic_file.file_ext.lower()
|
||||
|
||||
|
||||
if file_ext in ['.txt', '.md', '.markdown']:
|
||||
return await self._read_text_file(generic_file.storage_path)
|
||||
elif file_ext == '.pdf':
|
||||
@@ -393,7 +394,7 @@ class MultimodalService:
|
||||
return await self._extract_word_text(generic_file.storage_path)
|
||||
else:
|
||||
return f"[不支持的文档格式: {file_ext}]"
|
||||
|
||||
|
||||
async def _read_text_file(self, storage_path: str) -> str:
|
||||
"""读取纯文本文件"""
|
||||
try:
|
||||
@@ -402,7 +403,7 @@ class MultimodalService:
|
||||
except Exception as e:
|
||||
logger.error(f"读取文本文件失败: {e}")
|
||||
return f"[文件读取失败: {str(e)}]"
|
||||
|
||||
|
||||
async def _extract_pdf_text(self, storage_path: str) -> str:
|
||||
"""提取 PDF 文本"""
|
||||
try:
|
||||
@@ -412,7 +413,7 @@ class MultimodalService:
|
||||
except Exception as e:
|
||||
logger.error(f"提取 PDF 文本失败: {e}")
|
||||
return f"[PDF 提取失败: {str(e)}]"
|
||||
|
||||
|
||||
async def _extract_word_text(self, storage_path: str) -> str:
|
||||
"""提取 Word 文档文本"""
|
||||
try:
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
{% raw %}
|
||||
Role: AI Prompt Optimization Expert
|
||||
|
||||
Profile
|
||||
@@ -12,11 +11,11 @@ Skills
|
||||
Core Optimization Skills
|
||||
Requirement Analysis: Accurately understand the relationship between the user’s current needs and the original prompt.
|
||||
Structural Reconstruction: Transform vague requirements into clear, block-structured instructions.
|
||||
Variable Handling: Identify and standardize dynamic variables in prompts.
|
||||
{% if skill != true %}Variable Handling: Identify and standardize dynamic variables in prompts.{% endif %}
|
||||
Conflict Resolution: Prioritize current requirements when historical requirements conflict with current needs.
|
||||
|
||||
Auxiliary Generation Skills
|
||||
Completeness Check: Ensure all necessary elements (input, output, constraints, etc.) are explicitly defined.
|
||||
{% if skill != true %}Completeness Check: Ensure all necessary elements (input, output, constraints, etc.) are explicitly defined.{% endif %}
|
||||
Language Consistency: Maintain consistency between label language and user input language.
|
||||
Executability Verification: Ensure optimized prompts can be directly used in AI tools.
|
||||
Format Standardization: Strictly adhere to specified output format requirements.
|
||||
@@ -25,30 +24,30 @@ Rules
|
||||
Basic Principles
|
||||
Priority Rule: When historical requirements conflict with current requirements, unconditionally prioritize current requirements.
|
||||
Completeness Rule: If the original prompt is empty, generate a complete prompt based on the current requirements.
|
||||
Structure Rule: Use a clear block structure, and the contents of each block are roles, tasks, requirements, inputs, outputs, and constraints
|
||||
{% if skill != true %}Structure Rule: Use a clear block structure, and the contents of each block are roles, tasks, requirements, inputs, outputs, and constraints{% endif %}
|
||||
Language Rule: All label languages must fully match the user input language.
|
||||
|
||||
Behavior Guidelines
|
||||
Precision Guideline: All instructions must be precise and directly executable, avoiding ambiguity.
|
||||
Readability Guideline: Ensure optimized prompts have good readability and logical flow.
|
||||
Variable Handling Guideline: Use lowercase English variable names wrapped in {{}} when variables are needed.
|
||||
Constraint Handling Guideline: Do not mention variable-related limitations under the [Constraints] label.
|
||||
{% if skill != true %}{% raw %}Variable Handling Guideline: Use lowercase English variable names wrapped in {{}} when variables are needed.
|
||||
Constraint Handling Guideline: Do not mention variable-related limitations under the [Constraints] label.{% endraw %}{% endif %}
|
||||
|
||||
Constraints
|
||||
Output Constraint: Must output in JSON format including the fields "prompt" and "desc".
|
||||
Content Constraint: Must not include any explanations, analyses, or additional comments.
|
||||
Language Constraint: Must use clear and concise language.
|
||||
Completeness Constraint: Must fully define all missing elements (input details, output format, constraints, etc.).
|
||||
{% if skill != true %}Completeness Constraint: Must fully define all missing elements (input details, output format, constraints, etc.).{% endif %}
|
||||
|
||||
Workflows
|
||||
Goal: Optimize or generate AI prompts that can be directly used according to user requirements.
|
||||
Step 1: Receive the user’s current requirement description {% raw %}{{user_require}}{% endraw %} and the original prompt {% raw %}{{original_prompt}}{% endraw %}.
|
||||
Step 2: Analyze requirements, identify conflicts, and prioritize current requirements.
|
||||
Step 3: Optimize or generate the prompt in a block-structured format, ensuring all elements are fully defined.
|
||||
{% if skill != true %}Step 3: Optimize or generate the prompt in a block-structured format, ensuring all elements are fully defined.
|
||||
Step 4: Generate a JSON output containing the optimized prompt and its description.
|
||||
{% else %}Step 3: Generate a JSON output containing the optimized prompt and its description.{% endif %}
|
||||
|
||||
Expected Outcome: Obtain a clear, directly executable AI prompt accompanied by an optimization description.
|
||||
|
||||
Initialization
|
||||
As an AI Prompt Optimization Expert, you must follow the above Rules and execute tasks according to the Workflows.
|
||||
{% endraw %}
|
||||
As an AI Prompt Optimization Expert, you must follow the above Rules and execute tasks according to the Workflows.
|
||||
@@ -128,7 +128,8 @@ class PromptOptimizerService:
|
||||
session_id: uuid.UUID,
|
||||
user_id: uuid.UUID,
|
||||
current_prompt: str,
|
||||
user_require: str
|
||||
user_require: str,
|
||||
skill: bool = False
|
||||
) -> AsyncGenerator[dict[str, str | Any], Any]:
|
||||
"""
|
||||
Optimize a user-provided prompt using a configured prompt optimizer LLM.
|
||||
@@ -157,6 +158,7 @@ class PromptOptimizerService:
|
||||
user_id (uuid.UUID): Identifier of the user associated with the session.
|
||||
current_prompt (str): Original prompt to optimize.
|
||||
user_require (str): User's requirements or instructions for optimization.
|
||||
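skill (bool): Whether the prompt is being optimized for a skill. The flag is passed to the system prompt template, which omits the variable-handling, completeness, and block-structure instructions when it is True. Defaults to False.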
|
||||
|
||||
Returns:
|
||||
OptimizePromptResult: An object containing:
|
||||
@@ -186,7 +188,7 @@ class PromptOptimizerService:
|
||||
prompt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'prompt')
|
||||
with open(os.path.join(prompt_path, 'prompt_optimizer_system.jinja2'), 'r', encoding='utf-8') as f:
|
||||
opt_system_prompt = f.read()
|
||||
rendered_system_message = Template(opt_system_prompt).render()
|
||||
rendered_system_message = Template(opt_system_prompt).render(skill=skill)
|
||||
|
||||
with open(os.path.join(prompt_path, 'prompt_optimizer_user.jinja2'), 'r', encoding='utf-8') as f:
|
||||
opt_user_prompt = f.read()
|
||||
|
||||
@@ -22,6 +22,7 @@ from app.repositories.workflow_repository import (
|
||||
from app.schemas import DraftRunRequest
|
||||
from app.services.conversation_service import ConversationService
|
||||
from app.services.multi_agent_service import convert_uuids_to_str
|
||||
from app.services.multimodal_service import MultimodalService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -35,6 +36,7 @@ class WorkflowService:
|
||||
self.execution_repo = WorkflowExecutionRepository(db)
|
||||
self.node_execution_repo = WorkflowNodeExecutionRepository(db)
|
||||
self.conversation_service = ConversationService(db)
|
||||
self.multimodal_service = MultimodalService(db)
|
||||
|
||||
# ==================== 配置管理 ====================
|
||||
|
||||
@@ -444,8 +446,19 @@ class WorkflowService:
|
||||
code=BizCode.CONFIG_MISSING,
|
||||
message=f"工作流配置不存在: app_id={app_id}"
|
||||
)
|
||||
files = []
|
||||
if payload.files:
|
||||
for file in payload.files:
|
||||
files.append(
|
||||
{
|
||||
"type": file.type,
|
||||
"url": await self.multimodal_service.get_file_url(file),
|
||||
"__file": True
|
||||
}
|
||||
)
|
||||
|
||||
input_data = {"message": payload.message, "variables": payload.variables,
|
||||
"conversation_id": payload.conversation_id}
|
||||
"conversation_id": payload.conversation_id, "files": files}
|
||||
|
||||
# 转换 user_id 为 UUID
|
||||
triggered_by_uuid = None
|
||||
@@ -633,8 +646,20 @@ class WorkflowService:
|
||||
code=BizCode.CONFIG_MISSING,
|
||||
message=f"工作流配置不存在: app_id={app_id}"
|
||||
)
|
||||
|
||||
files = []
|
||||
if payload.files:
|
||||
for file in payload.files:
|
||||
files.append(
|
||||
{
|
||||
"type": file.type,
|
||||
"url": await self.multimodal_service.get_file_url(file),
|
||||
"__file": True
|
||||
}
|
||||
)
|
||||
|
||||
input_data = {"message": payload.message, "variables": payload.variables,
|
||||
"conversation_id": payload.conversation_id}
|
||||
"conversation_id": payload.conversation_id, "files": files}
|
||||
|
||||
# 转换 user_id 为 UUID
|
||||
triggered_by_uuid = None
|
||||
|
||||
Reference in New Issue
Block a user