From 590ec3a446d957f17e9d9e05189f2a155dd378ab Mon Sep 17 00:00:00 2001
From: Timebomb2018 <18868801967@163.com>
Date: Thu, 5 Mar 2026 09:55:54 +0800
Subject: [PATCH] feat(model and app): 1. Add support for vision models and
 multimodal models; 2. Applications and workflows now accept multimodal file
 inputs such as images, documents, audio, and video.
---
api/app/controllers/app_controller.py | 3 +-
api/app/controllers/model_controller.py | 4 +-
api/app/controllers/ontology_controller.py | 17 +-
api/app/core/agent/langchain_agent.py | 153 ++++---
api/app/core/models/base.py | 30 +-
.../core/models/scripts/bedrock_models.yaml | 38 +-
.../core/models/scripts/dashscope_models.yaml | 172 ++++++-
api/app/core/models/scripts/loader.py | 45 +-
.../core/models/scripts/openai_models.yaml | 64 ++-
api/app/models/models_model.py | 13 +-
api/app/schemas/app_schema.py | 16 +-
api/app/schemas/model_schema.py | 18 +
api/app/services/app_chat_service.py | 6 +-
api/app/services/app_service.py | 2 +-
.../services/audio_transcription_service.py | 101 +++++
.../services/collaborative_orchestrator.py | 2 +
api/app/services/draft_run_service.py | 11 +-
api/app/services/handoffs_service.py | 1 +
api/app/services/llm_router.py | 1 +
api/app/services/master_agent_router.py | 1 +
api/app/services/model_service.py | 58 ++-
api/app/services/multi_agent_orchestrator.py | 2 +
api/app/services/multi_agent_service.py | 2 +-
api/app/services/multimodal_service.py | 426 +++++++++++++-----
api/app/services/prompt_optimizer_service.py | 3 +-
api/app/services/shared_chat_service.py | 2 +
26 files changed, 958 insertions(+), 233 deletions(-)
create mode 100644 api/app/services/audio_transcription_service.py
diff --git a/api/app/controllers/app_controller.py b/api/app/controllers/app_controller.py
index e2849ad6..653f616c 100644
--- a/api/app/controllers/app_controller.py
+++ b/api/app/controllers/app_controller.py
@@ -835,7 +835,8 @@ async def draft_run_compare(
web_search=True,
memory=True,
parallel=payload.parallel,
- timeout=payload.timeout or 60
+ timeout=payload.timeout or 60,
+ files=payload.files
)
logger.info(
diff --git a/api/app/controllers/model_controller.py b/api/app/controllers/model_controller.py
index bb1ba526..0de3d4fe 100644
--- a/api/app/controllers/model_controller.py
+++ b/api/app/controllers/model_controller.py
@@ -469,7 +469,9 @@ async def create_model_api_key_by_provider(
config=api_key_data.config,
is_active=api_key_data.is_active,
priority=api_key_data.priority,
- model_config_ids=model_config_ids
+ model_config_ids=model_config_ids,
+ capability=api_key_data.capability,
+ is_omni=api_key_data.is_omni
)
created_keys, failed_models = await ModelApiKeyService.create_api_key_by_provider(db=db, data=create_data)
diff --git a/api/app/controllers/ontology_controller.py b/api/app/controllers/ontology_controller.py
index e4a87141..42d4bee0 100644
--- a/api/app/controllers/ontology_controller.py
+++ b/api/app/controllers/ontology_controller.py
@@ -124,15 +124,23 @@ def _get_ontology_service(
)
# 通过 Repository 获取可用的 API Key(负载均衡逻辑由 Repository 处理)
- from app.repositories.model_repository import ModelApiKeyRepository
- api_keys = ModelApiKeyRepository.get_by_model_config(db, model_config.id)
- if not api_keys:
+ # from app.repositories.model_repository import ModelApiKeyRepository
+ from app.services.model_service import ModelApiKeyService
+ api_key_config = ModelApiKeyService.get_available_api_key(db, model_config.id)
+ if not api_key_config:
logger.error(f"Model {llm_id} has no active API key")
raise HTTPException(
status_code=400,
detail="指定的LLM模型没有可用的API密钥"
)
- api_key_config = api_keys[0]
+ # api_keys = ModelApiKeyRepository.get_by_model_config(db, model_config.id)
+ # if not api_keys:
+ # logger.error(f"Model {llm_id} has no active API key")
+ # raise HTTPException(
+ # status_code=400,
+ # detail="指定的LLM模型没有可用的API密钥"
+ # )
+ # api_key_config = api_keys[0]
is_composite = getattr(model_config, 'is_composite', False)
logger.info(
@@ -154,6 +162,7 @@ def _get_ontology_service(
provider=actual_provider,
api_key=api_key_config.api_key,
base_url=api_key_config.api_base,
+ is_omni=api_key_config.is_omni,
max_retries=3,
timeout=60.0
)
diff --git a/api/app/core/agent/langchain_agent.py b/api/app/core/agent/langchain_agent.py
index fae20ea2..88b6371c 100644
--- a/api/app/core/agent/langchain_agent.py
+++ b/api/app/core/agent/langchain_agent.py
@@ -11,35 +11,37 @@ LangChain Agent 封装
import time
from typing import Any, AsyncGenerator, Dict, List, Optional, Sequence
-from app.core.memory.agent.langgraph_graph.write_graph import write_long_term
+from app.core.memory.agent.langgraph_graph.write_graph import write_long_term
from app.db import get_db
from app.core.logging_config import get_business_logger
from app.core.models import RedBearLLM, RedBearModelConfig
-from app.models.models_model import ModelType
+from app.models.models_model import ModelType, ModelProvider
from app.services.memory_agent_service import (
get_end_user_connected_config,
)
from langchain.agents import create_agent
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
from langchain_core.tools import BaseTool
+
logger = get_business_logger()
class LangChainAgent:
def __init__(
- self,
- model_name: str,
- api_key: str,
- provider: str = "openai",
- api_base: Optional[str] = None,
- temperature: float = 0.7,
- max_tokens: int = 2000,
- system_prompt: Optional[str] = None,
- tools: Optional[Sequence[BaseTool]] = None,
- streaming: bool = False,
- max_iterations: Optional[int] = None, # 最大迭代次数(None 表示自动计算)
- max_tool_consecutive_calls: int = 3 # 单个工具最大连续调用次数
+ self,
+ model_name: str,
+ api_key: str,
+ provider: str = "openai",
+ api_base: Optional[str] = None,
+ is_omni: bool = False,
+ temperature: float = 0.7,
+ max_tokens: int = 2000,
+ system_prompt: Optional[str] = None,
+ tools: Optional[Sequence[BaseTool]] = None,
+ streaming: bool = False,
+ max_iterations: Optional[int] = None, # 最大迭代次数(None 表示自动计算)
+ max_tool_consecutive_calls: int = 3 # 单个工具最大连续调用次数
):
"""初始化 LangChain Agent
@@ -60,12 +62,13 @@ class LangChainAgent:
self.provider = provider
self.tools = tools or []
self.streaming = streaming
+ self.is_omni = is_omni
self.max_tool_consecutive_calls = max_tool_consecutive_calls
-
+
# 工具调用计数器:记录每个工具的连续调用次数
self.tool_call_counter: Dict[str, int] = {}
self.last_tool_called: Optional[str] = None
-
+
# 根据工具数量动态调整最大迭代次数
# 基础值 + 每个工具额外的调用机会
if max_iterations is None:
@@ -73,9 +76,9 @@ class LangChainAgent:
self.max_iterations = 5 + len(self.tools) * 2
else:
self.max_iterations = max_iterations
-
+
self.system_prompt = system_prompt or "你是一个专业的AI助手"
-
+
logger.debug(
f"Agent 迭代次数配置: max_iterations={self.max_iterations}, "
f"tool_count={len(self.tools)}, "
@@ -89,6 +92,7 @@ class LangChainAgent:
provider=provider,
api_key=api_key,
base_url=api_base,
+ is_omni=is_omni,
extra_params={
"temperature": temperature,
"max_tokens": max_tokens,
@@ -143,21 +147,22 @@ class LangChainAgent:
"""
from langchain_core.tools import StructuredTool
from functools import wraps
-
+
wrapped_tools = []
-
+
for original_tool in tools:
tool_name = original_tool.name
original_func = original_tool.func if hasattr(original_tool, 'func') else None
-
+
if not original_func:
# 如果无法获取原始函数,直接使用原工具
wrapped_tools.append(original_tool)
continue
-
+
# 创建包装函数
def make_wrapped_func(tool_name, original_func):
"""创建包装函数的工厂函数,避免闭包问题"""
+
@wraps(original_func)
def wrapped_func(*args, **kwargs):
"""包装后的工具函数,跟踪连续调用次数"""
@@ -168,13 +173,13 @@ class LangChainAgent:
# 切换到新工具,重置计数器
self.tool_call_counter[tool_name] = 1
self.last_tool_called = tool_name
-
+
current_count = self.tool_call_counter[tool_name]
-
+
logger.debug(
f"工具调用: {tool_name}, 连续调用次数: {current_count}/{self.max_tool_consecutive_calls}"
)
-
+
# 检查是否超过最大连续调用次数
if current_count > self.max_tool_consecutive_calls:
logger.warning(
@@ -185,12 +190,12 @@ class LangChainAgent:
f"工具 '{tool_name}' 已连续调用 {self.max_tool_consecutive_calls} 次,"
f"未找到有效结果。请尝试其他方法或直接回答用户的问题。"
)
-
+
# 调用原始工具函数
return original_func(*args, **kwargs)
-
+
return wrapped_func
-
+
# 使用 StructuredTool 创建新工具
wrapped_tool = StructuredTool(
name=original_tool.name,
@@ -198,17 +203,17 @@ class LangChainAgent:
func=make_wrapped_func(tool_name, original_func),
args_schema=original_tool.args_schema if hasattr(original_tool, 'args_schema') else None
)
-
+
wrapped_tools.append(wrapped_tool)
-
+
return wrapped_tools
def _prepare_messages(
- self,
- message: str,
- history: Optional[List[Dict[str, str]]] = None,
- context: Optional[str] = None,
- files: Optional[List[Dict[str, Any]]] = None
+ self,
+ message: str,
+ history: Optional[List[Dict[str, str]]] = None,
+ context: Optional[str] = None,
+ files: Optional[List[Dict[str, Any]]] = None
) -> List[BaseMessage]:
"""准备消息列表
@@ -248,7 +253,7 @@ class LangChainAgent:
messages.append(HumanMessage(content=user_content))
return messages
-
+
def _build_multimodal_content(self, text: str, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
构建多模态消息内容
@@ -261,23 +266,26 @@ class LangChainAgent:
List[Dict]: 消息内容列表
"""
# 根据 provider 使用不同的文本格式
- if self.provider.lower() in ["bedrock", "anthropic"]:
- # Anthropic/Bedrock: {"type": "text", "text": "..."}
- content_parts = [{"type": "text", "text": text}]
- else:
- # 通义千问等: {"text": "..."}
- content_parts = [{"text": text}]
-
+ # if (self.provider.lower() in [ModelProvider.BEDROCK, ModelProvider.OPENAI, ModelProvider.XINFERENCE,
+ # ModelProvider.GPUSTACK] or (
+ # self.provider.lower() == ModelProvider.DASHSCOPE and self.is_omni)):
+ # # Anthropic/Bedrock/Xinference/Gpustack/Openai: {"type": "text", "text": "..."}
+ # content_parts = [{"type": "text", "text": text}]
+ # else:
+ # # 通义千问等: {"text": "..."}
+ # content_parts = [{"type": "text", "text": text}]
+ content_parts = [{"type": "text", "text": text}]
+
# 添加文件内容
# MultimodalService 已经根据 provider 返回了正确格式,直接使用
content_parts.extend(files)
-
+
logger.debug(
f"构建多模态消息: provider={self.provider}, "
f"parts={len(content_parts)}, "
f"files={len(files)}"
)
-
+
return content_parts
async def chat(
@@ -302,7 +310,7 @@ class LangChainAgent:
Returns:
Dict: 包含 content 和元数据的字典
"""
- message_chat= message
+ message_chat = message
start_time = time.time()
actual_config_id = config_id
# If config_id is None, try to get from end_user's connected config
@@ -322,8 +330,8 @@ class LangChainAgent:
except Exception as e:
logger.warning(f"Failed to get db session: {e}")
actual_end_user_id = end_user_id if end_user_id is not None else "unknown"
- logger.info(f'写入类型{storage_type,str(end_user_id), message, str(user_rag_memory_id)}')
- print(f'写入类型{storage_type,str(end_user_id), message, str(user_rag_memory_id)}')
+ logger.info(f'写入类型{storage_type, str(end_user_id), message, str(user_rag_memory_id)}')
+ print(f'写入类型{storage_type, str(end_user_id), message, str(user_rag_memory_id)}')
try:
# 准备消息列表(支持多模态)
messages = self._prepare_messages(message, history, context, files)
@@ -367,14 +375,14 @@ class LangChainAgent:
# 获取最后的 AI 消息
output_messages = result.get("messages", [])
content = ""
-
+
logger.debug(f"输出消息数量: {len(output_messages)}")
total_tokens = 0
for msg in reversed(output_messages):
if isinstance(msg, AIMessage):
logger.debug(f"找到 AI 消息,content 类型: {type(msg.content)}")
logger.debug(f"AI 消息内容: {msg.content}")
-
+
# 处理多模态响应:content 可能是字符串或列表
if isinstance(msg.content, str):
content = msg.content
@@ -407,12 +415,13 @@ class LangChainAgent:
response_meta = msg.response_metadata if hasattr(msg, 'response_metadata') else None
total_tokens = response_meta.get("token_usage", {}).get("total_tokens", 0) if response_meta else 0
break
-
+
logger.info(f"最终提取的内容长度: {len(content)}")
elapsed_time = time.time() - start_time
if memory_flag:
- await write_long_term(storage_type, end_user_id, message_chat, content, user_rag_memory_id, actual_config_id)
+ await write_long_term(storage_type, end_user_id, message_chat, content, user_rag_memory_id,
+ actual_config_id)
response = {
"content": content,
"model": self.model_name,
@@ -439,16 +448,16 @@ class LangChainAgent:
raise
async def chat_stream(
- self,
- message: str,
- history: Optional[List[Dict[str, str]]] = None,
- context: Optional[str] = None,
- end_user_id:Optional[str] = None,
- config_id: Optional[str] = None,
- storage_type:Optional[str] = None,
- user_rag_memory_id:Optional[str] = None,
- memory_flag: Optional[bool] = True,
- files: Optional[List[Dict[str, Any]]] = None # 新增:多模态文件
+ self,
+ message: str,
+ history: Optional[List[Dict[str, str]]] = None,
+ context: Optional[str] = None,
+ end_user_id: Optional[str] = None,
+ config_id: Optional[str] = None,
+ storage_type: Optional[str] = None,
+ user_rag_memory_id: Optional[str] = None,
+ memory_flag: Optional[bool] = True,
+ files: Optional[List[Dict[str, Any]]] = None # 新增:多模态文件
) -> AsyncGenerator[str, None]:
"""执行流式对话
@@ -482,7 +491,6 @@ class LangChainAgent:
except Exception as e:
logger.warning(f"Failed to get db session: {e}")
-
# 注意:不在这里写入用户消息,等 AI 回复后一起写入
try:
# 准备消息列表(支持多模态)
@@ -500,13 +508,13 @@ class LangChainAgent:
full_content = ''
try:
async for event in self.agent.astream_events(
- {"messages": messages},
- version="v2",
- config={"recursion_limit": self.max_iterations}
+ {"messages": messages},
+ version="v2",
+ config={"recursion_limit": self.max_iterations}
):
chunk_count += 1
kind = event.get("event")
-
+
# 处理所有可能的流式事件
if kind == "on_chat_model_stream":
# LLM 流式输出
@@ -540,7 +548,7 @@ class LangChainAgent:
full_content += item
yield item
yielded_content = True
-
+
elif kind == "on_llm_stream":
# 另一种 LLM 流式事件
chunk = event.get("data", {}).get("chunk")
@@ -577,13 +585,13 @@ class LangChainAgent:
full_content += chunk
yield chunk
yielded_content = True
-
+
# 记录工具调用(可选)
elif kind == "on_tool_start":
logger.debug(f"工具调用开始: {event.get('name')}")
elif kind == "on_tool_end":
logger.debug(f"工具调用结束: {event.get('name')}")
-
+
logger.debug(f"Agent 流式完成,共 {chunk_count} 个事件")
# 统计token消耗
output_messages = event.get("data", {}).get("output", {}).get("messages", [])
@@ -595,7 +603,8 @@ class LangChainAgent:
yield total_tokens
break
if memory_flag:
- await write_long_term(storage_type, end_user_id, message_chat, full_content, user_rag_memory_id, actual_config_id)
+ await write_long_term(storage_type, end_user_id, message_chat, full_content, user_rag_memory_id,
+ actual_config_id)
except Exception as e:
logger.error(f"Agent astream_events 失败: {str(e)}", exc_info=True)
raise
@@ -609,5 +618,3 @@ class LangChainAgent:
logger.info("=" * 80)
logger.info("chat_stream 方法执行结束")
logger.info("=" * 80)
-
-
diff --git a/api/app/core/models/base.py b/api/app/core/models/base.py
index f5f49af0..5d4dbd10 100644
--- a/api/app/core/models/base.py
+++ b/api/app/core/models/base.py
@@ -27,6 +27,7 @@ class RedBearModelConfig(BaseModel):
provider: str
api_key: str
base_url: Optional[str] = None
+ is_omni: bool = False # 是否为 Omni 模型
# 请求超时时间(秒)- 默认120秒以支持复杂的LLM调用,可通过环境变量 LLM_TIMEOUT 配置
timeout: float = Field(default_factory=lambda: float(os.getenv("LLM_TIMEOUT", "120.0")))
# 最大重试次数 - 默认2次以避免过长等待,可通过环境变量 LLM_MAX_RETRIES 配置
@@ -45,7 +46,28 @@ class RedBearModelFactory:
# 打印供应商信息用于调试
from app.core.logging_config import get_business_logger
logger = get_business_logger()
- logger.debug(f"获取模型参数 - Provider: {provider}, Model: {config.model_name}")
+ logger.debug(f"获取模型参数 - Provider: {provider}, Model: {config.model_name}, is_omni: {config.is_omni}")
+
+ # dashscope 的 omni 模型使用 OpenAI 兼容模式
+ if provider == ModelProvider.DASHSCOPE and config.is_omni:
+ import httpx
+ if not config.base_url:
+ config.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ timeout_config = httpx.Timeout(
+ timeout=config.timeout,
+ connect=60.0,
+ read=config.timeout,
+ write=60.0,
+ pool=10.0,
+ )
+ return {
+ "model": config.model_name,
+ "base_url": config.base_url,
+ "api_key": config.api_key,
+ "timeout": timeout_config,
+ "max_retries": config.max_retries,
+ **config.extra_params
+ }
if provider in [ModelProvider.OPENAI, ModelProvider.XINFERENCE, ModelProvider.GPUSTACK, ModelProvider.OLLAMA]:
# 使用 httpx.Timeout 对象来设置详细的超时配置
@@ -135,6 +157,12 @@ class RedBearModelFactory:
def get_provider_llm_class(config:RedBearModelConfig, type: ModelType=ModelType.LLM) -> type[BaseLLM]:
"""根据模型提供商获取对应的模型类"""
provider = config.provider.lower()
+
+ # dashscope 的 omni 模型使用 OpenAI 兼容模式
+ if provider == ModelProvider.DASHSCOPE and config.is_omni:
+ from langchain_openai import ChatOpenAI
+ return ChatOpenAI
+
if provider in [ModelProvider.OPENAI, ModelProvider.XINFERENCE, ModelProvider.GPUSTACK] :
if type == ModelType.LLM:
from langchain_openai import OpenAI
diff --git a/api/app/core/models/scripts/bedrock_models.yaml b/api/app/core/models/scripts/bedrock_models.yaml
index e5b91d1c..2c0ab757 100644
--- a/api/app/core/models/scripts/bedrock_models.yaml
+++ b/api/app/core/models/scripts/bedrock_models.yaml
@@ -6,6 +6,8 @@ models:
description: AI21 Labs大语言模型,completion生成模式,256000上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
logo: bedrock
@@ -15,6 +17,9 @@ models:
description: Amazon Nova大语言模型,支持智能体思考、工具调用、流式工具调用、视觉能力,300000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -28,6 +33,9 @@ models:
description: Anthropic Claude大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用、文档处理,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -42,6 +50,8 @@ models:
description: Cohere大语言模型,支持智能体思考、工具调用、流式工具调用,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -54,6 +64,9 @@ models:
description: DeepSeek大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用,32768上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -67,6 +80,8 @@ models:
description: Meta Llama大语言模型,支持智能体思考、工具调用,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -78,6 +93,8 @@ models:
description: Mistral AI大语言模型,支持智能体思考、工具调用,32000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -89,6 +106,8 @@ models:
description: OpenAI大语言模型,支持智能体思考、工具调用、流式工具调用,32768上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -101,6 +120,8 @@ models:
description: Qwen大语言模型,支持智能体思考、工具调用、流式工具调用,32768上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -113,6 +134,8 @@ models:
description: amazon.rerank-v1:0重排序模型,5120上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 重排序模型
logo: bedrock
@@ -122,6 +145,8 @@ models:
description: cohere.rerank-v3-5:0重排序模型,5120上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 重排序模型
logo: bedrock
@@ -131,6 +156,9 @@ models:
description: amazon.nova-2-multimodal-embeddings-v1:0文本嵌入模型,支持视觉能力,8192上下文窗口
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 文本嵌入模型
- vision
@@ -141,6 +169,8 @@ models:
description: amazon.titan-embed-text-v1文本嵌入模型,8192上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本嵌入模型
logo: bedrock
@@ -150,6 +180,8 @@ models:
description: amazon.titan-embed-text-v2:0文本嵌入模型,8192上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本嵌入模型
logo: bedrock
@@ -159,6 +191,8 @@ models:
description: Cohere Embed 3 English文本嵌入模型,512上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本嵌入模型
logo: bedrock
@@ -168,6 +202,8 @@ models:
description: Cohere Embed 3 Multilingual文本嵌入模型,512上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本嵌入模型
- logo: bedrock
+ logo: bedrock
\ No newline at end of file
diff --git a/api/app/core/models/scripts/dashscope_models.yaml b/api/app/core/models/scripts/dashscope_models.yaml
index af1c3619..89a16966 100644
--- a/api/app/core/models/scripts/dashscope_models.yaml
+++ b/api/app/core/models/scripts/dashscope_models.yaml
@@ -6,6 +6,8 @@ models:
description: DeepSeek-R1-Distill-Qwen-14B大语言模型,支持智能体思考,32000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -16,6 +18,8 @@ models:
description: DeepSeek-R1-Distill-Qwen-32B大语言模型,支持智能体思考,32000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -26,6 +30,8 @@ models:
description: DeepSeek-R1大语言模型,支持智能体思考,131072超大上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -36,6 +42,8 @@ models:
description: DeepSeek-V3.1大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -46,6 +54,8 @@ models:
description: DeepSeek-V3.2-exp实验版大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -56,6 +66,8 @@ models:
description: DeepSeek-V3.2大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -66,6 +78,8 @@ models:
description: DeepSeek-V3大语言模型,支持智能体思考,64000上下文窗口,对话模式,支持文本与JSON格式输出
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -76,6 +90,8 @@ models:
description: farui-plus大语言模型,支持多工具调用、智能体思考、流式工具调用,12288上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -88,6 +104,8 @@ models:
description: GLM-4.7大语言模型,支持多工具调用、智能体思考、流式工具调用,202752超大上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -100,6 +118,9 @@ models:
description: qvq-max-latest大语言模型,支持视觉、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- vision
@@ -112,6 +133,9 @@ models:
description: qvq-max大语言模型,支持视觉、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- vision
@@ -124,6 +148,8 @@ models:
description: qwen-coder-turbo-0919代码专用大语言模型,支持智能体思考,131072上下文窗口,对话模式,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 代码模型
@@ -135,6 +161,8 @@ models:
description: qwen-max-latest大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -147,6 +175,8 @@ models:
description: qwen-max-longcontext长上下文大语言模型,支持多工具调用、智能体思考、流式工具调用,32000上下文窗口,对话模式,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -159,6 +189,8 @@ models:
description: qwen-max大语言模型,支持多工具调用、智能体思考、流式工具调用,32768上下文窗口,对话模式,支持联网搜索
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -171,6 +203,8 @@ models:
description: qwen-mt-plus多语言翻译大语言模型,支持智能体思考,16384上下文窗口,对话模式,支持多语种互译与领域翻译适配
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 翻译模型
@@ -182,6 +216,8 @@ models:
description: qwen-mt-turbo轻量化多语言翻译大语言模型,支持智能体思考,16384上下文窗口,对话模式,支持多语种互译与领域翻译适配
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 翻译模型
@@ -193,6 +229,8 @@ models:
description: qwen-plus-0112大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -205,6 +243,8 @@ models:
description: qwen-plus-0125大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -217,6 +257,8 @@ models:
description: qwen-plus-0723大语言模型,支持多工具调用、智能体思考、流式工具调用,32000上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -229,6 +271,8 @@ models:
description: qwen-plus-0806大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -241,6 +285,8 @@ models:
description: qwen-plus-0919大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -253,6 +299,8 @@ models:
description: qwen-plus-1125大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -265,6 +313,8 @@ models:
description: qwen-plus-1127大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -277,6 +327,8 @@ models:
description: qwen-plus-1220大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -289,6 +341,10 @@ models:
description: qwen-vl-max多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -302,6 +358,10 @@ models:
description: qwen-vl-plus-0809多模态大模型,支持视觉理解、智能体思考、视频理解,32768上下文窗口,对话模式,已废弃
is_deprecated: true
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -315,6 +375,10 @@ models:
description: qwen-vl-plus-2025-01-02多模态大模型,支持视觉理解、智能体思考、视频理解,32768上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -328,6 +392,10 @@ models:
description: qwen-vl-plus-2025-01-25多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -341,6 +409,10 @@ models:
description: qwen-vl-plus-latest多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -354,6 +426,10 @@ models:
description: qwen-vl-plus多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -367,6 +443,8 @@ models:
description: qwen2.5-0.5b-instruct大语言模型,支持多工具调用、智能体思考、流式工具调用,32768上下文窗口,对话模式,未废弃
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -379,6 +457,8 @@ models:
description: qwen3-14b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -391,6 +471,8 @@ models:
description: qwen3-235b-a22b-instruct-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -403,6 +485,8 @@ models:
description: qwen3-235b-a22b-thinking-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -415,6 +499,8 @@ models:
description: qwen3-235b-a22b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -427,6 +513,8 @@ models:
description: qwen3-30b-a3b-instruct-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -439,6 +527,8 @@ models:
description: qwen3-30b-a3b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -451,6 +541,8 @@ models:
description: qwen3-32b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -463,6 +555,8 @@ models:
description: qwen3-4b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -475,6 +569,8 @@ models:
description: qwen3-8b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -487,6 +583,8 @@ models:
description: qwen3-coder-30b-a3b-instruct大语言模型,支持智能体思考,262144上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 代码模型
@@ -498,6 +596,8 @@ models:
description: qwen3-coder-480b-a35b-instruct大语言模型,支持智能体思考,262144上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 代码模型
@@ -509,6 +609,8 @@ models:
description: qwen3-coder-plus-2025-09-23大语言模型,支持智能体思考,1000000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 代码模型
@@ -520,6 +622,8 @@ models:
description: qwen3-coder-plus大语言模型,支持智能体思考,1000000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- 代码模型
@@ -531,6 +635,8 @@ models:
description: qwen3-max-2025-09-23大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -544,6 +650,8 @@ models:
description: qwen3-max-2026-01-23大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -557,6 +665,8 @@ models:
description: qwen3-max-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -569,6 +679,8 @@ models:
description: qwen3-max大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -582,6 +694,8 @@ models:
description: qwen3-next-80b-a3b-instruct大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -594,6 +708,8 @@ models:
description: qwen3-next-80b-a3b-thinking大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -606,6 +722,11 @@ models:
description: qwen3-omni-flash-2025-12-01多模态大语言模型,支持视觉、智能体思考、视频、音频能力,65536上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ - audio
+ is_omni: true
tags:
- 大语言模型
- 多模态模型
@@ -620,6 +741,10 @@ models:
description: qwen3-vl-235b-a22b-instruct多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -635,6 +760,10 @@ models:
description: qwen3-vl-235b-a22b-thinking多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -650,6 +779,10 @@ models:
description: qwen3-vl-30b-a3b-instruct多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -665,6 +798,10 @@ models:
description: qwen3-vl-30b-a3b-thinking多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -680,6 +817,10 @@ models:
description: qwen3-vl-flash多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -695,6 +836,10 @@ models:
description: qwen3-vl-plus-2025-09-23多模态大语言模型,支持视觉、智能体思考、视频能力,262144上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -708,6 +853,10 @@ models:
description: qwen3-vl-plus多模态大语言模型,支持视觉、智能体思考、视频能力,262144上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - video
+ is_omni: false
tags:
- 大语言模型
- 多模态模型
@@ -721,6 +870,8 @@ models:
description: qwq-32b大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -732,6 +883,8 @@ models:
description: qwq-plus-0305大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -743,6 +896,8 @@ models:
description: qwq-plus大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -754,6 +909,8 @@ models:
description: gte-rerank-v2重排序模型,4000上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 重排序模型
logo: dashscope
@@ -763,6 +920,8 @@ models:
description: gte-rerank重排序模型,4000上下文窗口
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 重排序模型
logo: dashscope
@@ -772,6 +931,9 @@ models:
description: multimodal-embedding-v1多模态嵌入模型,支持视觉能力,8192上下文窗口,最大分块数10
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 嵌入模型
- 多模态模型
@@ -783,6 +945,8 @@ models:
description: text-embedding-v1文本嵌入模型,2048上下文窗口,最大分块数25
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 嵌入模型
- 文本嵌入
@@ -793,6 +957,8 @@ models:
description: text-embedding-v2文本嵌入模型,2048上下文窗口,最大分块数25
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 嵌入模型
- 文本嵌入
@@ -803,6 +969,8 @@ models:
description: text-embedding-v3文本嵌入模型,8192上下文窗口,最大分块数10
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 嵌入模型
- 文本嵌入
@@ -813,7 +981,9 @@ models:
description: text-embedding-v4文本嵌入模型,8192上下文窗口,最大分块数10
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 嵌入模型
- 文本嵌入
- logo: dashscope
+ logo: dashscope
\ No newline at end of file
diff --git a/api/app/core/models/scripts/loader.py b/api/app/core/models/scripts/loader.py
index a14d3268..e4462efa 100644
--- a/api/app/core/models/scripts/loader.py
+++ b/api/app/core/models/scripts/loader.py
@@ -6,7 +6,7 @@ from typing import Callable
import yaml
from sqlalchemy.orm import Session
-from app.models.models_model import ModelBase, ModelProvider
+from app.models.models_model import ModelBase, ModelProvider, ModelConfig
def _load_yaml_config(provider: ModelProvider) -> list[dict]:
@@ -55,6 +55,15 @@ def load_models(db: Session, providers: list[str] = None, silent: bool = False)
print(f"\n正在加载 {provider.value} 的 {len(models)} 个模型...")
for model_data in models:
+ config_sync_fields = {
+ "logo": None,
+ "capability": None,
+ "is_omni": None,
+ "name": None,
+ "provider": None,
+ "type": None,
+ "description": None
+ }
try:
# 检查模型是否已存在
existing = db.query(ModelBase).filter(
@@ -66,6 +75,40 @@ def load_models(db: Session, providers: list[str] = None, silent: bool = False)
# 更新现有模型配置
for key, value in model_data.items():
setattr(existing, key, value)
+
+ # 更新绑定了该 model_id 的 ModelConfig 和 ModelApiKey
+ sync_fields = [k for k in config_sync_fields.keys() if k in model_data]
+ if sync_fields:
+ # 批量更新 ModelConfig
+ update_kwargs = {k: model_data[k] for k in sync_fields}
+ db.query(ModelConfig).filter(ModelConfig.model_id == existing.id).update(
+ update_kwargs,
+ synchronize_session=False
+ )
+
+ # 更新 ModelApiKey 的 capability 和 is_omni
+ if 'capability' in model_data or 'is_omni' in model_data:
+ from app.models.models_model import ModelApiKey, model_config_api_key_association
+ api_key_update = {}
+ if 'capability' in model_data:
+ api_key_update['capability'] = model_data['capability']
+ if 'is_omni' in model_data:
+ api_key_update['is_omni'] = model_data['is_omni']
+
+ if api_key_update:
+ # 查找所有关联的 API Key
+ api_key_ids = db.query(model_config_api_key_association.c.api_key_id).join(
+ ModelConfig,
+ ModelConfig.id == model_config_api_key_association.c.model_config_id
+ ).filter(ModelConfig.model_id == existing.id).distinct().all()
+
+ if api_key_ids:
+ api_key_ids = [aid[0] for aid in api_key_ids]
+ db.query(ModelApiKey).filter(ModelApiKey.id.in_(api_key_ids)).update(
+ api_key_update,
+ synchronize_session=False
+ )
+
db.commit()
if not silent:
print(f"更新成功: {model_data['name']}")
diff --git a/api/app/core/models/scripts/openai_models.yaml b/api/app/core/models/scripts/openai_models.yaml
index 68c63ee2..7f6d3a51 100644
--- a/api/app/core/models/scripts/openai_models.yaml
+++ b/api/app/core/models/scripts/openai_models.yaml
@@ -6,12 +6,19 @@ models:
description: chatgpt-4o-latest大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ - audio
+ - video
+ is_omni: true
tags:
- 大语言模型
- multi-tool-call
- agent-thought
- stream-tool-call
- vision
+ - audio
+ - video
logo: openai
- name: gpt-3.5-turbo-0125
type: llm
@@ -19,6 +26,8 @@ models:
description: gpt-3.5-turbo-0125大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -31,6 +40,8 @@ models:
description: gpt-3.5-turbo-1106大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -43,6 +54,8 @@ models:
description: gpt-3.5-turbo-16k大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -55,6 +68,8 @@ models:
description: gpt-3.5-turbo-instruct大语言模型,4096上下文窗口,文本补全模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
logo: openai
@@ -64,6 +79,8 @@ models:
description: gpt-3.5-turbo大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -76,6 +93,8 @@ models:
description: gpt-4-0125-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -88,6 +107,8 @@ models:
description: gpt-4-1106-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -100,6 +121,9 @@ models:
description: gpt-4-turbo-2024-04-09大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -113,6 +137,8 @@ models:
description: gpt-4-turbo-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -125,6 +151,9 @@ models:
description: gpt-4-turbo大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -138,6 +167,8 @@ models:
description: o1-preview大语言模型,支持智能体思考,128000上下文窗口,对话模式,已废弃
is_deprecated: true
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -148,6 +179,9 @@ models:
description: o1大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- multi-tool-call
@@ -162,6 +196,9 @@ models:
description: o3-2025-04-16大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -176,6 +213,8 @@ models:
description: o3-mini-2025-01-31大语言模型,支持智能体思考、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -189,6 +228,8 @@ models:
description: o3-mini大语言模型,支持智能体思考、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -202,6 +243,9 @@ models:
description: o3-pro-2025-06-10大语言模型,支持智能体思考、工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -215,6 +259,9 @@ models:
description: o3-pro大语言模型,支持智能体思考、工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -228,6 +275,9 @@ models:
description: o3大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -242,6 +292,9 @@ models:
description: o4-mini-2025-04-16大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -256,6 +309,9 @@ models:
description: o4-mini大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式
is_deprecated: false
is_official: true
+ capability:
+ - vision
+ is_omni: false
tags:
- 大语言模型
- agent-thought
@@ -270,6 +326,8 @@ models:
description: text-embedding-3-large文本向量模型,8191上下文窗口,最大分块数32
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本向量模型
logo: openai
@@ -279,6 +337,8 @@ models:
description: text-embedding-3-small文本向量模型,8191上下文窗口,最大分块数32
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本向量模型
logo: openai
@@ -288,6 +348,8 @@ models:
description: text-embedding-ada-002文本向量模型,8097上下文窗口,最大分块数32
is_deprecated: false
is_official: true
+ capability: []
+ is_omni: false
tags:
- 文本向量模型
- logo: openai
+ logo: openai
\ No newline at end of file
diff --git a/api/app/models/models_model.py b/api/app/models/models_model.py
index 3e378f17..23fafcef 100644
--- a/api/app/models/models_model.py
+++ b/api/app/models/models_model.py
@@ -2,7 +2,7 @@ import datetime
import uuid
from enum import StrEnum
-from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey, Enum as SQLEnum, UniqueConstraint, Integer, ARRAY, Table
+from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey, Enum as SQLEnum, UniqueConstraint, Integer, ARRAY, Table, text
from sqlalchemy.dialects.postgresql import UUID, JSON
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
@@ -78,6 +78,9 @@ class ModelConfig(BaseModel):
description = Column(String, comment="模型描述")
# 模型配置参数
+ capability = Column(ARRAY(String), default=list, nullable=False, server_default=text("'{}'::varchar[]"),
+ comment="模型能力列表(如['vision', 'audio', 'video'])")
+ is_omni = Column(Boolean, default=False, nullable=False, server_default="false", comment="是否为Omni模型(使用特殊API调用)")
config = Column(JSON, comment="模型配置参数")
# - temperature : 控制生成文本的随机性。值越高,输出越随机、越有创造性;值越低,输出越确定、越保守。
# - top_p : 一种替代 temperature 的采样方法,控制模型从概率最高的词中选择的范围。
@@ -118,6 +121,11 @@ class ModelApiKey(BaseModel):
api_key = Column(String, nullable=False, comment="API密钥")
api_base = Column(String, comment="API基础URL")
+ # 模型能力参数
+ capability = Column(ARRAY(String), default=list, nullable=False, server_default=text("'{}'::varchar[]"),
+ comment="模型能力列表(如['vision', 'audio', 'video'])")
+ is_omni = Column(Boolean, default=False, nullable=False, server_default="false", comment="是否为Omni模型(使用特殊API调用)")
+
# 配置参数
config = Column(JSON, comment="API Key特定配置")
@@ -155,6 +163,9 @@ class ModelBase(Base):
tags = Column(ARRAY(String), default=list, nullable=False, comment="模型标签(如['聊天', '创作'])")
add_count = Column(Integer, default=0, nullable=False, comment="模型被用户添加的次数")
created_at = Column(DateTime, default=datetime.datetime.now, comment="创建时间", server_default=func.now())
+ capability = Column(ARRAY(String), default=list, nullable=False, server_default=text("'{}'::varchar[]"),
+ comment="模型能力列表(如['vision', 'audio', 'video'])")
+ is_omni = Column(Boolean, default=False, nullable=False, server_default="false", comment="是否为Omni模型(使用特殊API调用)")
# 关联关系
configs = relationship("ModelConfig", back_populates="model_base", cascade="all, delete-orphan")
diff --git a/api/app/schemas/app_schema.py b/api/app/schemas/app_schema.py
index 07875e13..f073a200 100644
--- a/api/app/schemas/app_schema.py
+++ b/api/app/schemas/app_schema.py
@@ -21,8 +21,14 @@ class FileType(StrEnum):
def trans(cls, value: str) -> 'FileType':
if value.startswith("image"):
return cls.IMAGE
- # TODO: other file type support
- raise RuntimeError("Unsupport file type")
+ elif value.startswith("document"):
+ return cls.DOCUMENT
+ elif value.startswith("audio"):
+ return cls.AUDIO
+ elif value.startswith("video"):
+ return cls.VIDEO
+ else:
+ raise RuntimeError("Unsupport file type")
class TransferMethod(str, Enum):
@@ -37,6 +43,19 @@ class FileInput(BaseModel):
     transfer_method: TransferMethod = Field(..., description="传输方式: local_file/remote_url")
     upload_file_id: Optional[uuid.UUID] = Field(None, description="已上传文件ID(local_file时必填)")
     url: Optional[str] = Field(None, description="远程URL(remote_url时必填)")
+    file_type: Optional[str] = Field(None, description="具体文件格式(如image/jpg、audio/wav、document/docx、video/mp4)")
+
+    def __init__(self, **data):
+        """Build the input, remembering the caller's detailed type string.
+
+        ``type`` goes through a ``mode="before"`` field validator, so its raw
+        value is copied into ``file_type`` before validation can rewrite it.
+        An explicitly supplied ``file_type`` is kept, so re-creating an
+        instance from dumped data does not lose the concrete format.
+        """
+        if "type" in data and data.get("file_type") is None:
+            data["file_type"] = data["type"]
+        super().__init__(**data)
@field_validator("type", mode="before")
@classmethod
diff --git a/api/app/schemas/model_schema.py b/api/app/schemas/model_schema.py
index 0c0bbeed..f25d9408 100644
--- a/api/app/schemas/model_schema.py
+++ b/api/app/schemas/model_schema.py
@@ -21,6 +21,8 @@ class ModelConfigBase(BaseModel):
is_active: bool = Field(True, description="是否激活")
is_public: bool = Field(False, description="是否公开")
load_balance_strategy: Optional[str] = Field(LoadBalanceStrategy.NONE.value, description="负载均衡策略")
+ capability: List[str] = Field(default_factory=list, description="模型能力列表")
+ is_omni: bool = Field(False, description="是否为Omni模型")
class ApiKeyCreateNested(BaseModel):
@@ -30,6 +32,8 @@ class ApiKeyCreateNested(BaseModel):
provider: Optional[str] = Field(None, description="API Key提供商")
api_key: str = Field(..., description="API密钥", max_length=500)
api_base: Optional[str] = Field(None, description="API基础URL", max_length=500)
+ capability: Optional[List[str]] = Field(None, description="模型能力列表")
+ is_omni: Optional[bool] = Field(None, description="是否为Omni模型")
config: Optional[Dict[str, Any]] = Field({}, description="API Key特定配置")
priority: str = Field("1", description="优先级", max_length=10)
@@ -63,6 +67,8 @@ class ModelConfigUpdate(BaseModel):
config: Optional[Dict[str, Any]] = Field(None, description="模型配置参数")
is_active: Optional[bool] = Field(None, description="是否激活")
is_public: Optional[bool] = Field(None, description="是否公开")
+ capability: Optional[List[str]] = Field(None, description="模型能力列表")
+ is_omni: Optional[bool] = Field(None, description="是否为Omni模型")
class ModelConfig(ModelConfigBase):
@@ -95,6 +101,8 @@ class ModelApiKeyCreateByProvider(BaseModel):
api_key: str = Field(..., description="API密钥", max_length=500)
api_base: Optional[str] = Field(None, description="API基础URL", max_length=500)
description: Optional[str] = Field(None, description="备注")
+ capability: Optional[List[str]] = Field(None, description="模型能力列表")
+ is_omni: Optional[bool] = Field(None, description="是否为Omni模型")
config: Optional[Dict[str, Any]] = Field({}, description="API Key特定配置")
is_active: bool = Field(True, description="是否激活")
priority: str = Field("1", description="优先级", max_length=10)
@@ -108,6 +116,8 @@ class ModelApiKeyBase(BaseModel):
provider: ModelProvider = Field(..., description="API Key提供商")
api_key: str = Field(..., description="API密钥", max_length=500)
api_base: Optional[str] = Field(None, description="API基础URL", max_length=500)
+ capability: List[str] = Field(default_factory=list, description="模型能力列表")
+ is_omni: bool = Field(False, description="是否为Omni模型")
config: Optional[Dict[str, Any]] = Field({}, description="API Key特定配置")
is_active: bool = Field(True, description="是否激活")
priority: str = Field("1", description="优先级", max_length=10)
@@ -124,6 +134,8 @@ class ModelApiKeyUpdate(BaseModel):
provider: Optional[ModelProvider] = Field(None, description="API Key提供商")
api_key: Optional[str] = Field(None, description="API密钥", max_length=500)
api_base: Optional[str] = Field(None, description="API基础URL", max_length=500)
+ capability: Optional[List[str]] = Field(None, description="模型能力列表")
+ is_omni: Optional[bool] = Field(None, description="是否为Omni模型")
config: Optional[Dict[str, Any]] = Field(None, description="API Key特定配置")
is_active: Optional[bool] = Field(None, description="是否激活")
priority: Optional[str] = Field(None, description="优先级", max_length=10)
@@ -270,6 +282,8 @@ class ModelBaseCreate(BaseModel):
description: Optional[str] = Field(None, description="模型描述")
is_official: bool = Field(True, description="是否供应商官方模型")
tags: List[str] = Field(default_factory=list, description="模型标签")
+ capability: List[str] = Field(default_factory=list, description="模型能力列表(如['vision', 'audio', 'video'])")
+ is_omni: bool = Field(False, description="是否为Omni模型")
class ModelBaseUpdate(BaseModel):
@@ -282,6 +296,8 @@ class ModelBaseUpdate(BaseModel):
is_deprecated: Optional[bool] = Field(None, description="是否弃用")
is_official: Optional[bool] = Field(None, description="是否供应商官方模型")
tags: Optional[List[str]] = Field(None, description="模型标签")
+ capability: Optional[List[str]] = Field(None, description="模型能力列表")
+ is_omni: Optional[bool] = Field(None, description="是否为Omni模型")
class ModelBase(BaseModel):
@@ -298,6 +314,8 @@ class ModelBase(BaseModel):
is_official: bool
tags: List[str]
add_count: int
+ capability: List[str] = []
+ is_omni: bool = False
class ModelBaseQuery(BaseModel):
diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py
index 9723121d..e6ac227b 100644
--- a/api/app/services/app_chat_service.py
+++ b/api/app/services/app_chat_service.py
@@ -157,6 +157,7 @@ class AppChatService:
api_key=api_key_obj.api_key,
provider=api_key_obj.provider,
api_base=api_key_obj.api_base,
+ is_omni=api_key_obj.is_omni,
temperature=model_parameters.get("temperature", 0.7),
max_tokens=model_parameters.get("max_tokens", 2000),
system_prompt=system_prompt,
@@ -180,7 +181,7 @@ class AppChatService:
# 处理多模态文件
processed_files = None
if files:
- multimodal_service = MultimodalService(self.db)
+ multimodal_service = MultimodalService(self.db, api_key_obj.provider, is_omni=api_key_obj.is_omni)
processed_files = await multimodal_service.process_files(files)
logger.info(f"处理了 {len(processed_files)} 个文件")
@@ -343,6 +344,7 @@ class AppChatService:
api_key=api_key_obj.api_key,
provider=api_key_obj.provider,
api_base=api_key_obj.api_base,
+ is_omni=api_key_obj.is_omni,
temperature=model_parameters.get("temperature", 0.7),
max_tokens=model_parameters.get("max_tokens", 2000),
system_prompt=system_prompt,
@@ -366,7 +368,7 @@ class AppChatService:
# 处理多模态文件
processed_files = None
if files:
- multimodal_service = MultimodalService(self.db)
+ multimodal_service = MultimodalService(self.db, api_key_obj.provider, is_omni=api_key_obj.is_omni)
processed_files = await multimodal_service.process_files(files)
logger.info(f"处理了 {len(processed_files)} 个文件")
diff --git a/api/app/services/app_service.py b/api/app/services/app_service.py
index 6e6e0ecb..c5919af9 100644
--- a/api/app/services/app_service.py
+++ b/api/app/services/app_service.py
@@ -232,7 +232,7 @@ class AppService:
# 检查主 Agent 的模型配置
multi_agent_config.default_model_config_id = master_agent_release.default_model_config_id
- model_api_key = ModelApiKeyService.get_a_api_key(self.db, multi_agent_config.default_model_config_id)
+ model_api_key = ModelApiKeyService.get_available_api_key(self.db, multi_agent_config.default_model_config_id)
if not model_api_key:
raise ResourceNotFoundException("模型配置", str(multi_agent_config.default_model_config_id))
diff --git a/api/app/services/audio_transcription_service.py b/api/app/services/audio_transcription_service.py
new file mode 100644
index 00000000..11d13f38
--- /dev/null
+++ b/api/app/services/audio_transcription_service.py
@@ -0,0 +1,166 @@
+"""
+Audio-to-text transcription service.
+
+Supported providers:
+- DashScope (Alibaba Cloud Paraformer recorded-file recognition)
+- OpenAI Whisper
+"""
+import asyncio
+import mimetypes
+import posixpath
+from urllib.parse import unquote, urlparse
+
+import httpx
+
+from app.core.logging_config import get_business_logger
+
+logger = get_business_logger()
+
+
+class AudioTranscriptionService:
+    """Best-effort audio transcription through a provider's speech-to-text API.
+
+    The public methods catch every ``Exception``: on failure they log the error
+    and return a bracketed "[音频转文本失败...]" placeholder string instead.
+    """
+
+    # DashScope file transcription is an asynchronous task API: submit the job,
+    # poll the task endpoint until it finishes, then download the result JSON.
+    DASHSCOPE_SUBMIT_URL = "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription"
+    DASHSCOPE_TASK_URL = "https://dashscope.aliyuncs.com/api/v1/tasks/{task_id}"
+    DASHSCOPE_POLL_INTERVAL = 1.0  # seconds between task status checks
+    DASHSCOPE_MAX_POLLS = 120  # stop waiting after roughly two minutes
+
+    @staticmethod
+    async def transcribe_dashscope(audio_url: str, api_key: str) -> str:
+        """Transcribe an audio file with DashScope's paraformer-v2 model.
+
+        Args:
+            audio_url: URL of the audio file (must be reachable by DashScope).
+            api_key: DashScope API key.
+
+        Returns:
+            The transcribed text, or a "[音频转文本失败...]" placeholder on failure.
+        """
+        service = AudioTranscriptionService
+        auth_headers = {"Authorization": f"Bearer {api_key}"}
+        try:
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                response = await client.post(
+                    service.DASHSCOPE_SUBMIT_URL,
+                    headers={
+                        **auth_headers,
+                        "Content-Type": "application/json",
+                        "X-DashScope-Async": "enable",
+                    },
+                    json={
+                        "model": "paraformer-v2",
+                        "input": {
+                            "file_urls": [audio_url]
+                        },
+                        "parameters": {
+                            "language_hints": ["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]
+                        }
+                    }
+                )
+                response.raise_for_status()
+                output = response.json().get("output") or {}
+
+                # With X-DashScope-Async enabled the submit call answers with a
+                # task id rather than the transcription, so wait for the task.
+                task_id = output.get("task_id")
+                if task_id and not output.get("results"):
+                    output = await service._wait_for_dashscope_task(client, task_id, auth_headers)
+
+                results = output.get("results") or []
+                text = await service._read_dashscope_result(client, results[0]) if results else None
+                if text is None:
+                    logger.error(f"DashScope 音频转文本失败: task_status={output.get('task_status')}")
+                    return "[音频转文本失败]"
+
+                logger.info(f"音频转文本成功: {len(text)} 字符")
+                return text
+
+        except Exception as e:
+            logger.error(f"DashScope 音频转文本失败: {e}")
+            return f"[音频转文本失败: {str(e)}]"
+
+    @staticmethod
+    async def _wait_for_dashscope_task(client: httpx.AsyncClient, task_id: str, headers: dict) -> dict:
+        """Poll a DashScope task until it leaves PENDING/RUNNING; return its ``output``.
+
+        Raises:
+            TimeoutError: if the task is still unfinished after the poll budget.
+        """
+        service = AudioTranscriptionService
+        task_url = service.DASHSCOPE_TASK_URL.format(task_id=task_id)
+        for _ in range(service.DASHSCOPE_MAX_POLLS):
+            response = await client.get(task_url, headers=headers)
+            response.raise_for_status()
+            output = response.json().get("output") or {}
+            if output.get("task_status") not in ("PENDING", "RUNNING"):
+                return output
+            await asyncio.sleep(service.DASHSCOPE_POLL_INTERVAL)
+        raise TimeoutError(f"DashScope task {task_id} did not finish in time")
+
+    @staticmethod
+    async def _read_dashscope_result(client: httpx.AsyncClient, result: dict) -> str | None:
+        """Extract the text of one per-file result entry, or None if it has none."""
+        # Keep honouring an inline text field in case the payload carries one.
+        if "transcription_text" in result:
+            return result.get("transcription_text") or ""
+
+        # Finished tasks point at a JSON document that holds the transcripts.
+        transcription_url = result.get("transcription_url")
+        if not transcription_url:
+            return None
+        response = await client.get(transcription_url)
+        response.raise_for_status()
+        transcripts = response.json().get("transcripts") or []
+        return "\n".join(item["text"] for item in transcripts if item.get("text"))
+
+    @staticmethod
+    async def transcribe_openai(audio_url: str, api_key: str) -> str:
+        """Transcribe an audio file with OpenAI Whisper (``whisper-1``).
+
+        Args:
+            audio_url: URL the audio file is downloaded from.
+            api_key: OpenAI API key.
+
+        Returns:
+            The transcribed text, or a "[音频转文本失败...]" placeholder on failure.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                # Download the audio file.
+                audio_response = await client.get(audio_url)
+                audio_response.raise_for_status()
+                audio_data = audio_response.content
+
+                # Upload under the file's real name so a WAV/M4A/OGG input is not
+                # mislabelled as MP3; fall back to the old default without an extension.
+                filename = posixpath.basename(unquote(urlparse(audio_url).path))
+                if not posixpath.splitext(filename)[1]:
+                    filename = "audio.mp3"
+                content_type = mimetypes.guess_type(filename)[0] or "audio/mpeg"
+
+                # Call the Whisper API.
+                files = {"file": (filename, audio_data, content_type)}
+                data = {"model": "whisper-1"}
+
+                response = await client.post(
+                    "https://api.openai.com/v1/audio/transcriptions",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files=files,
+                    data=data
+                )
+                response.raise_for_status()
+                result = response.json()
+
+                text = result.get("text", "")
+                logger.info(f"音频转文本成功: {len(text)} 字符")
+                return text
+
+        except Exception as e:
+            logger.error(f"OpenAI Whisper 音频转文本失败: {e}")
+            return f"[音频转文本失败: {str(e)}]"
diff --git a/api/app/services/collaborative_orchestrator.py b/api/app/services/collaborative_orchestrator.py
index 00a731de..68181cd1 100644
--- a/api/app/services/collaborative_orchestrator.py
+++ b/api/app/services/collaborative_orchestrator.py
@@ -445,6 +445,7 @@ class CollaborativeOrchestrator:
"provider": api_key_config.provider,
"api_key": api_key_config.api_key,
"api_base": api_key_config.api_base,
+ "is_omni": api_key_config.is_omni,
"model_parameters": config_data.get("model_parameters", {}),
"api_key_id": api_key_config.id
}
@@ -511,6 +512,7 @@ class CollaborativeOrchestrator:
provider=agent_config["provider"],
api_key=agent_config["api_key"],
base_url=agent_config.get("api_base"),
+ is_omni=agent_config.get("is_omni", False),
extra_params=extra_params
)
diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py
index 8977710b..693f1a26 100644
--- a/api/app/services/draft_run_service.py
+++ b/api/app/services/draft_run_service.py
@@ -415,6 +415,7 @@ class DraftRunService:
api_key=api_key_config["api_key"],
provider=api_key_config.get("provider", "openai"),
api_base=api_key_config.get("api_base"),
+ is_omni=api_key_config.get("is_omni", False),
temperature=effective_params.get("temperature", 0.7),
max_tokens=effective_params.get("max_tokens", 2000),
system_prompt=system_prompt,
@@ -442,7 +443,7 @@ class DraftRunService:
if files:
# 获取 provider 信息
provider = api_key_config.get("provider", "openai")
- multimodal_service = MultimodalService(self.db, provider=provider)
+ multimodal_service = MultimodalService(self.db, provider=provider, is_omni=api_key_config.get("is_omni", False))
processed_files = await multimodal_service.process_files(files)
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
@@ -683,6 +684,7 @@ class DraftRunService:
api_key=api_key_config["api_key"],
provider=api_key_config.get("provider", "openai"),
api_base=api_key_config.get("api_base"),
+ is_omni=api_key_config.get("is_omni", False),
temperature=effective_params.get("temperature", 0.7),
max_tokens=effective_params.get("max_tokens", 2000),
system_prompt=system_prompt,
@@ -711,7 +713,7 @@ class DraftRunService:
if files:
# 获取 provider 信息
provider = api_key_config.get("provider", "openai")
- multimodal_service = MultimodalService(self.db, provider=provider)
+ multimodal_service = MultimodalService(self.db, provider=provider, is_omni=api_key_config.get("is_omni", False))
processed_files = await multimodal_service.process_files(files)
logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}")
@@ -809,7 +811,7 @@ class DraftRunService:
"""
return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"
- async def _get_api_key(self, model_config_id: uuid.UUID) -> Dict[str, str]:
+ async def _get_api_key(self, model_config_id: uuid.UUID) -> Dict:
"""获取模型的 API Key
Args:
@@ -846,7 +848,8 @@ class DraftRunService:
"provider": api_key.provider,
"api_key": api_key.api_key,
"api_base": api_key.api_base,
- "api_key_id": api_key.id
+ "api_key_id": api_key.id,
+ "is_omni": api_key.is_omni
}
async def _ensure_conversation(
diff --git a/api/app/services/handoffs_service.py b/api/app/services/handoffs_service.py
index e490eea4..8418fe31 100644
--- a/api/app/services/handoffs_service.py
+++ b/api/app/services/handoffs_service.py
@@ -544,6 +544,7 @@ def convert_multi_agent_config_to_handoffs(
provider=model_api_key.provider,
api_key=model_api_key.api_key,
base_url=model_api_key.api_base,
+ is_omni=model_api_key.is_omni,
extra_params={
"temperature": 0.7,
"max_tokens": 2000,
diff --git a/api/app/services/llm_router.py b/api/app/services/llm_router.py
index e56ad5aa..02895d6b 100644
--- a/api/app/services/llm_router.py
+++ b/api/app/services/llm_router.py
@@ -414,6 +414,7 @@ class LLMRouter:
provider=api_key_config.provider,
api_key=api_key_config.api_key,
base_url=api_key_config.api_base,
+ is_omni=api_key_config.is_omni,
temperature=0.3,
max_tokens=500
)
diff --git a/api/app/services/master_agent_router.py b/api/app/services/master_agent_router.py
index 3cf3ecc3..b0f43b51 100644
--- a/api/app/services/master_agent_router.py
+++ b/api/app/services/master_agent_router.py
@@ -392,6 +392,7 @@ class MasterAgentRouter:
provider=api_key_config.provider,
api_key=api_key_config.api_key,
base_url=api_key_config.api_base,
+ is_omni=api_key_config.is_omni,
extra_params = extra_params
)
diff --git a/api/app/services/model_service.py b/api/app/services/model_service.py
index aa8cfbac..2337427a 100644
--- a/api/app/services/model_service.py
+++ b/api/app/services/model_service.py
@@ -90,7 +90,8 @@ class ModelConfigService:
api_key: str,
api_base: Optional[str] = None,
model_type: str = "llm",
- test_message: str = "Hello"
+ test_message: str = "Hello",
+ is_omni: bool = False
) -> Dict[str, Any]:
"""验证模型配置是否有效
@@ -102,6 +103,7 @@ class ModelConfigService:
api_base: API基础URL
model_type: 模型类型 (llm/chat/embedding/rerank)
test_message: 测试消息
+ is_omni: 是否为Omni模型
Returns:
Dict: 验证结果
@@ -114,14 +116,27 @@ class ModelConfigService:
try:
start_time = time.time()
- model_config = RedBearModelConfig(
- model_name=model_name,
- provider=provider,
- api_key=api_key,
- base_url=api_base,
- temperature=0.7,
- max_tokens=100
- )
+ # dashscope 的 omni 模型需要使用 compatible-mode
+ if provider.lower() == ModelProvider.DASHSCOPE and is_omni:
+ if not api_base:
+ api_base = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ model_config = RedBearModelConfig(
+ model_name=model_name,
+ provider=ModelProvider.OPENAI,
+ api_key=api_key,
+ base_url=api_base,
+ temperature=0.7,
+ max_tokens=100
+ )
+ else:
+ model_config = RedBearModelConfig(
+ model_name=model_name,
+ provider=provider,
+ api_key=api_key,
+ base_url=api_base,
+ temperature=0.7,
+ max_tokens=100
+ )
# 根据模型类型选择不同的验证方式
model_type_lower = model_type.lower()
@@ -257,8 +272,9 @@ class ModelConfigService:
provider=model_data.provider,
api_key=api_key_data.api_key,
api_base=api_key_data.api_base,
- model_type=model_data.type, # 传递模型类型
- test_message="Hello"
+ model_type=model_data.type,
+ test_message="Hello",
+ is_omni=model_data.is_omni
)
if not validation_result["valid"]:
raise BusinessException(
@@ -279,6 +295,9 @@ class ModelConfigService:
for api_key_data in api_key_datas:
api_key_data.model_name = model_data.name
api_key_data.provider = model_data.provider
+ # 同步capability和is_omni
+ api_key_data.capability = model_data.capability
+ api_key_data.is_omni = model_data.is_omni
api_key_create_schema = ModelApiKeyCreate(
model_config_ids=[model.id],
**api_key_data.model_dump()
@@ -497,6 +516,8 @@ class ModelApiKeyService:
existing_key.config = data.config
existing_key.priority = data.priority
existing_key.model_name = model_name
+ existing_key.capability = data.capability
+ existing_key.is_omni = data.is_omni
# 检查是否已关联该模型配置
if model_config not in existing_key.model_configs:
@@ -513,7 +534,8 @@ class ModelApiKeyService:
api_key=data.api_key,
api_base=data.api_base,
model_type=model_config.type,
- test_message="Hello"
+ test_message="Hello",
+ is_omni=data.is_omni
)
if not validation_result["valid"]:
# 记录验证失败的模型,但不抛出异常
@@ -528,6 +550,8 @@ class ModelApiKeyService:
provider=data.provider,
api_key=data.api_key,
api_base=data.api_base,
+ capability=data.capability if data.capability is not None else model_config.capability,
+ is_omni=data.is_omni if data.is_omni is not None else model_config.is_omni,
config=data.config,
is_active=data.is_active,
priority=data.priority
@@ -572,6 +596,8 @@ class ModelApiKeyService:
existing_key.config = api_key_data.config
existing_key.priority = api_key_data.priority
existing_key.model_name = api_key_data.model_name
+ existing_key.capability = api_key_data.capability
+ existing_key.is_omni = api_key_data.is_omni
# 检查是否已关联该模型配置
if model_config not in existing_key.model_configs:
@@ -589,7 +615,8 @@ class ModelApiKeyService:
api_key=api_key_data.api_key,
api_base=api_key_data.api_base,
model_type=model_config.type,
- test_message="Hello"
+ test_message="Hello",
+ is_omni=model_config.is_omni
)
if not validation_result["valid"]:
raise BusinessException(
@@ -620,7 +647,8 @@ class ModelApiKeyService:
api_key=api_key_data.api_key or existing_api_key.api_key,
api_base=api_key_data.api_base or existing_api_key.api_base,
model_type=model_config.type,
- test_message="Hello"
+ test_message="Hello",
+ is_omni=model_config.is_omni
)
if not validation_result["valid"]:
raise BusinessException(
@@ -755,6 +783,8 @@ class ModelBaseService:
"type": model_base.type,
"logo": model_base.logo,
"description": model_base.description,
+ "capability": model_base.capability,
+ "is_omni": model_base.is_omni,
"is_composite": False
}
model_config = ModelConfigRepository.create(db, model_config_data)
diff --git a/api/app/services/multi_agent_orchestrator.py b/api/app/services/multi_agent_orchestrator.py
index d1aa46d1..650f639b 100644
--- a/api/app/services/multi_agent_orchestrator.py
+++ b/api/app/services/multi_agent_orchestrator.py
@@ -2593,6 +2593,7 @@ class MultiAgentOrchestrator:
provider=api_key_config.provider,
api_key=api_key_config.api_key,
base_url=api_key_config.api_base,
+ is_omni=api_key_config.is_omni,
temperature=0.7, # 整合任务使用中等温度
max_tokens=2000
)
@@ -2758,6 +2759,7 @@ class MultiAgentOrchestrator:
provider=api_key_config.provider,
api_key=api_key_config.api_key,
base_url=api_key_config.api_base,
+ is_omni=api_key_config.is_omni,
temperature=0.7,
max_tokens=2000,
extra_params={"streaming": True} # 启用流式输出
diff --git a/api/app/services/multi_agent_service.py b/api/app/services/multi_agent_service.py
index c52814ed..751099d5 100644
--- a/api/app/services/multi_agent_service.py
+++ b/api/app/services/multi_agent_service.py
@@ -267,7 +267,7 @@ class MultiAgentService:
# 2. 验证模型配置(如果提供了)
if data.default_model_config_id:
- model_api_key = ModelApiKeyService.get_a_api_key(self.db, data.default_model_config_id)
+ model_api_key = ModelApiKeyService.get_available_api_key(self.db, data.default_model_config_id)
if not model_api_key:
raise ResourceNotFoundException("模型配置", str(data.default_model_config_id))
diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py
index bfb23a56..9b06c287 100644
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -9,47 +9,100 @@
- OpenAI: 支持 URL 和 base64 格式
"""
import uuid
-from typing import List, Dict, Any, Optional, Protocol
+import httpx
+import base64
+from typing import List, Dict, Any, Optional
+from abc import ABC, abstractmethod
from sqlalchemy.orm import Session
+from docx import Document
+import io
+import PyPDF2
from app.core.logging_config import get_business_logger
from app.core.exceptions import BusinessException
from app.core.error_codes import BizCode
from app.schemas.app_schema import FileInput, FileType, TransferMethod
-from app.models.generic_file_model import GenericFile
+from app.models.file_metadata_model import FileMetadata
+from app.core.config import settings
+from app.services.audio_transcription_service import AudioTranscriptionService
logger = get_business_logger()
-class ImageFormatStrategy(Protocol):
- """图片格式策略接口"""
+class MultimodalFormatStrategy(ABC):
+    """Abstract base class: turns one attachment into a provider-specific message content part."""
+
+    @abstractmethod
+    async def format_image(self, url: str) -> Dict[str, Any]:
+        """Return the content part for the image at ``url``."""
+        pass
+
+    @abstractmethod
+    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+        """Return the content part for a document, given its file name and extracted text."""
+        pass
+
+    @abstractmethod
+    async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]:
+        """Return the content part for an audio file; ``transcription`` is optional speech-to-text output."""
+        pass
+
+    @abstractmethod
+    async def format_video(self, url: str) -> Dict[str, Any]:
+        """Return the content part for the video at ``url``."""
+        pass
+
+
+class DashScopeFormatStrategy(MultimodalFormatStrategy):
+ """通义千问策略"""
async def format_image(self, url: str) -> Dict[str, Any]:
- """将图片 URL 转换为特定 provider 的格式"""
- ...
-
-
-class DashScopeImageStrategy:
- """通义千问图片格式策略"""
-
- async def format_image(self, url: str) -> Dict[str, Any]:
- """通义千问格式: {"type": "image", "image": "url"}"""
+ """通义千问图片格式:{"type": "image", "image": "url"}"""
return {
"type": "image",
"image": url
}
+    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+        """Return the extracted document text as a text part (``file_name`` is not used here)."""
+        return {
+            "type": "text",
+            "text": f"\n{text}\n"
+        }
-class BedrockImageStrategy:
- """Bedrock/Anthropic 图片格式策略"""
+    async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Build the DashScope content part for an audio file.
+        - With a transcription: return it as a text part.
+        - Without one: return the native part {"type": "audio", "audio": url}; ``file_type`` is unused.
+        """
+        if transcription:
+            return {
+                "type": "text",
+                "text": f"[音频转录]\n{transcription}"
+            }
+        # No transcription available: pass the URL through in the native audio format
+        return {
+            "type": "audio",
+            "audio": url
+        }
+
+    async def format_video(self, url: str) -> Dict[str, Any]:
+        """Return the video as a native part: {"type": "video", "video": url}."""
+        return {
+            "type": "video",
+            "video": url
+        }
+
+
+class BedrockFormatStrategy(MultimodalFormatStrategy):
+ """Bedrock/Anthropic 策略"""
async def format_image(self, url: str) -> Dict[str, Any]:
"""
Bedrock/Anthropic 格式: base64 编码
{"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}}
"""
- import httpx
- import base64
from mimetypes import guess_type
logger.info(f"下载并编码图片: {url}")
@@ -84,9 +137,46 @@ class BedrockImageStrategy:
}
}
+    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+        """Return the extracted text as an Anthropic plain-text document block (``file_name`` is unused)."""
+        # Anthropic's "base64" document source is meant for PDF payloads; plain
+        # text is sent with source type "text" and the raw string as data, so
+        # the text is no longer base64-encoded here.
-class OpenAIImageStrategy:
-    """OpenAI 图片格式策略"""
+        return {
+            "type": "document",
+            "source": {
+                "type": "text",
+                "media_type": "text/plain",
+                "data": text
+            }
+        }
+
+    async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Build the Bedrock/Anthropic content part for an audio file.
+        Always a text part: the transcription when given, otherwise a fixed notice (``file_type`` and ``url`` are unused).
+        """
+        if transcription:
+            return {
+                "type": "text",
+                "text": f"[音频转录]\n{transcription}"
+            }
+        return {
+            "type": "text",
+            "text": "[音频文件:Bedrock 不支持原生音频,请启用音频转文本功能]"
+        }
+
+    async def format_video(self, url: str) -> Dict[str, Any]:
+        """Return a text placeholder naming the video URL; this strategy emits no native video part."""
+        return {
+            "type": "text",
+            "text": f"[视频文件: {url}]"  # kept non-empty: an empty text part carries nothing and may be rejected by the API
+        }
+
+
+class OpenAIFormatStrategy(MultimodalFormatStrategy):
+ """OpenAI 策略"""
async def format_image(self, url: str) -> Dict[str, Any]:
"""OpenAI 格式: {"type": "image_url", "image_url": {"url": "..."}}"""
@@ -97,29 +187,97 @@ class OpenAIImageStrategy:
}
}
+    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+        """Return the extracted document text as a text part (``file_name`` is not used here)."""
+        return {
+            "type": "text",
+            "text": f"\n{text}\n"
+        }
+
+    async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Build the OpenAI-style content part for an audio file.
+        - With a transcription: return it as a text part.
+        - Otherwise download ``url`` and return a base64 ``input_audio`` part; on any failure return a text part describing the error.
+        """
+        if transcription:
+            return {
+                "type": "text",
+                "text": f"[音频转录]\n{transcription}"
+            }
+
+        # The input_audio part carries the audio inline, so the file is downloaded and base64-encoded
+        try:
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(url)
+                response.raise_for_status()
+                audio_data = response.content
+            base64_audio = base64.b64encode(audio_data).decode('utf-8')
+            # 1. Prefer the subtype of the caller-supplied MIME type
+            file_ext = file_type.split('/')[-1] if file_type and '/' in file_type else None
+            # 2. Otherwise use the subtype of the response Content-Type header
+            if not file_ext:
+                ct = response.headers.get("content-type", "")
+                file_ext = ct.split('/')[-1].split(';')[0].strip() if '/' in ct else None
+            # 3. Otherwise use the extension of the URL's last path segment (dots in the host or directories do not count)
+            if not file_ext:
+                tail = url.split('?')[0].rsplit('/', 1)[-1]
+                file_ext = tail.rsplit('.', 1)[-1].lower() if '.' in tail else None
+            # 4. Fall back to wav when nothing usable was found
+            file_ext = file_ext or "wav"
+
+            return {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": f"data:;base64,{base64_audio}",  # NOTE(review): data-URI prefix kept as-is; confirm the target endpoint expects it
+                    "format": file_ext
+                }
+            }
+        except Exception as e:
+            logger.error(f"下载音频失败: {e}")
+            return {
+                "type": "text",
+                "text": f"[音频处理失败: {str(e)}]"
+            }
+
+    async def format_video(self, url: str) -> Dict[str, Any]:
+        """Return the video as {"type": "video_url", "video_url": {"url": url}}; the URL is passed through unchanged."""
+        return {
+            "type": "video_url",
+            "video_url": {
+                "url": url
+            }
+        }
+
# Provider 到策略的映射
PROVIDER_STRATEGIES = {
- "dashscope": DashScopeImageStrategy,
- "bedrock": BedrockImageStrategy,
- "anthropic": BedrockImageStrategy,
- "openai": OpenAIImageStrategy,
+ "dashscope": DashScopeFormatStrategy,
+ "bedrock": BedrockFormatStrategy,
+ "anthropic": BedrockFormatStrategy,
+ "openai": OpenAIFormatStrategy,
}
class MultimodalService:
"""多模态文件处理服务"""
-    def __init__(self, db: Session, provider: str = "dashscope"):
+    def __init__(self, db: Session, provider: str = "dashscope", api_key: Optional[str] = None, enable_audio_transcription: bool = False, is_omni: bool = False):
         """
         初始化多模态服务
         Args:
             db: 数据库会话
-            provider: 模型提供商(dashscope, bedrock, anthropic 等)
+            provider: Model provider (dashscope, bedrock, anthropic, openai, ...); stored lower-cased
+            api_key: API key, used for audio transcription
+            enable_audio_transcription: Whether audio files are transcribed to text before formatting
+            is_omni: Whether the model is an Omni model (dashscope omni models use the OpenAI-compatible format)
         """
         self.db = db
         self.provider = provider.lower()
+        self.api_key = api_key
+        self.enable_audio_transcription = enable_audio_transcription
+        self.is_omni = is_omni
async def process_files(
self,
@@ -137,20 +295,32 @@ class MultimodalService:
if not files:
return []
+ # 获取对应的策略
+ # dashscope 的 omni 模型使用 OpenAI 兼容格式
+ if self.provider == "dashscope" and self.is_omni:
+ strategy_class = OpenAIFormatStrategy
+ else:
+ strategy_class = PROVIDER_STRATEGIES.get(self.provider)
+ if not strategy_class:
+ logger.warning(f"未找到 provider '{self.provider}' 的策略,使用默认策略")
+ strategy_class = DashScopeFormatStrategy
+
+ strategy = strategy_class()
+
result = []
for idx, file in enumerate(files):
try:
if file.type == FileType.IMAGE:
- content = await self._process_image(file)
+ content = await self._process_image(file, strategy)
result.append(content)
elif file.type == FileType.DOCUMENT:
- content = await self._process_document(file)
+ content = await self._process_document(file, strategy)
result.append(content)
elif file.type == FileType.AUDIO:
- content = await self._process_audio(file)
+ content = await self._process_audio(file, strategy)
result.append(content)
elif file.type == FileType.VIDEO:
- content = await self._process_video(file)
+ content = await self._process_video(file, strategy)
result.append(content)
else:
logger.warning(f"不支持的文件类型: {file.type}")
@@ -172,55 +342,29 @@ class MultimodalService:
logger.info(f"成功处理 {len(result)}/{len(files)} 个文件,provider={self.provider}")
return result
-    async def _process_image(self, file: FileInput) -> Dict[str, Any]:
+    async def _process_image(self, file: FileInput, strategy) -> Dict[str, Any]:
         """
         处理图片文件
         Args:
             file: 图片文件输入
+            strategy: Formatting strategy whose ``format_image`` builds the content part
         Returns:
-            Dict: 根据 provider 返回不同格式
-                - Anthropic/Bedrock: {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}}
-                - 通义千问: {"type": "image", "image": "url"}
+            Dict: Image content part from the strategy, or a text part describing the error if anything raises
         """
-        url = await self.get_file_url(file)
-
-        logger.debug(f"处理图片: {url}, provider={self.provider}")
-
-        # 根据 provider 返回不同格式
-        if self.provider in ["bedrock", "anthropic"]:
-            # Anthropic/Bedrock 只支持 base64 格式,需要下载并转换
-            try:
-                logger.info(f"开始下载并编码图片: {url}")
-                base64_data, media_type = await self._download_and_encode_image(url)
-                result = {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": media_type,
-                        "data": base64_data[:100] + "..."  # 只记录前100个字符
-                    }
-                }
-                logger.info(f"图片编码完成: media_type={media_type}, data_length={len(base64_data)}")
-                # 返回完整数据
-                result["source"]["data"] = base64_data
-                return result
-            except Exception as e:
-                logger.error(f"下载并编码图片失败: {e}", exc_info=True)
-                # 返回错误提示
-                return {
-                    "type": "text",
-                    "text": f"[图片加载失败: {str(e)}]"
-                }
-        else:
-            # 通义千问等其他格式支持 URL
+        try:
+            url = await self.get_file_url(file)
+            return await strategy.format_image(url)
+        except Exception as e:
+            logger.error(f"处理图片失败: {e}", exc_info=True)
             return {
-                "type": "image",
-                "image": url
+                "type": "text",
+                "text": f"[图片处理失败: {str(e)}]"
             }
- async def _download_and_encode_image(self, url: str) -> tuple[str, str]:
+ @staticmethod
+ async def _download_and_encode_image(url: str) -> tuple[str, str]:
"""
下载图片并转换为 base64
@@ -230,8 +374,6 @@ class MultimodalService:
Returns:
tuple: (base64_data, media_type)
"""
- import httpx
- import base64
from mimetypes import guess_type
# 下载图片
@@ -258,15 +400,16 @@ class MultimodalService:
return base64_data, media_type
- async def _process_document(self, file: FileInput) -> Dict[str, Any]:
+ async def _process_document(self, file: FileInput, strategy) -> Dict[str, Any]:
"""
处理文档文件(PDF、Word 等)
Args:
file: 文档文件输入
+ strategy: 格式化策略
Returns:
- Dict: text 格式的内容(包含提取的文本)
+ Dict: 根据 provider 返回不同格式的文档内容
"""
if file.transfer_method == TransferMethod.REMOTE_URL:
# 远程文档暂不支持提取
@@ -277,48 +420,68 @@ class MultimodalService:
else:
# 本地文件,提取文本内容
text = await self._extract_document_text(file.upload_file_id)
- generic_file = self.db.query(GenericFile).filter(
- GenericFile.id == file.upload_file_id
+ file_metadata = self.db.query(FileMetadata).filter(
+ FileMetadata.id == file.upload_file_id
).first()
- file_name = generic_file.file_name if generic_file else "unknown"
+ file_name = file_metadata.file_name if file_metadata else "unknown"
- return {
- "type": "text",
- "text": f"\n{text}\n"
- }
+ # 使用策略格式化文档
+ return await strategy.format_document(file_name, text)
-    async def _process_audio(self, file: FileInput) -> Dict[str, Any]:
+    async def _process_audio(self, file: FileInput, strategy) -> Dict[str, Any]:
         """
         处理音频文件
         Args:
             file: 音频文件输入
+            strategy: Formatting strategy whose ``format_audio`` builds the content part
         Returns:
-            Dict: 音频内容(暂时返回占位符)
+            Dict: Audio content part from the strategy, or a text part describing the error if anything raises
         """
-        # TODO: 实现音频转文字功能
-        return {
-            "type": "text",
-            "text": "[音频文件,暂不支持处理]"
-        }
+        try:
+            url = await self.get_file_url(file)
 
-    async def _process_video(self, file: FileInput) -> Dict[str, Any]:
+            # Transcribe only when transcription is enabled and an API key is available
+            transcription = None
+            if self.enable_audio_transcription and self.api_key:
+                logger.info(f"开始音频转文本: {url}")
+                if self.provider == "dashscope":
+                    transcription = await AudioTranscriptionService.transcribe_dashscope(url, self.api_key)
+                elif self.provider == "openai":
+                    transcription = await AudioTranscriptionService.transcribe_openai(url, self.api_key)
+                else:
+                    logger.warning(f"Provider {self.provider} 不支持音频转文本")
+
+            return await strategy.format_audio(file.file_type, url, transcription)
+        except Exception as e:
+            logger.error(f"处理音频失败: {e}", exc_info=True)
+            return {
+                "type": "text",
+                "text": f"[音频处理失败: {str(e)}]"
+            }
+
+    async def _process_video(self, file: FileInput, strategy) -> Dict[str, Any]:
         """
         处理视频文件
         Args:
             file: 视频文件输入
+            strategy: Formatting strategy whose ``format_video`` builds the content part
         Returns:
-            Dict: 视频内容(暂时返回占位符)
+            Dict: Video content part from the strategy, or a text part describing the error if anything raises
         """
-        # TODO: 实现视频处理功能
-        return {
-            "type": "text",
-            "text": "[视频文件,暂不支持处理]"
-        }
+        try:
+            url = await self.get_file_url(file)
+            return await strategy.format_video(url)
+        except Exception as e:
+            logger.error(f"处理视频失败: {e}", exc_info=True)
+            return {
+                "type": "text",
+                "text": f"[视频处理失败: {str(e)}]"
+            }
async def get_file_url(self, file: FileInput) -> str:
"""
@@ -336,26 +499,22 @@ class MultimodalService:
if file.transfer_method == TransferMethod.REMOTE_URL:
return file.url
else:
- # 本地文件,通过 file_storage 系统获取永久访问 URL
- from app.models.file_metadata_model import FileMetadata
- from app.core.config import settings
-
file_id = file.upload_file_id
print("="*50)
print("file_id",file_id)
-
+
# 查询 FileMetadata
file_metadata = self.db.query(FileMetadata).filter(
FileMetadata.id == file_id,
FileMetadata.status == "completed"
).first()
-
+
if not file_metadata:
raise BusinessException(
f"文件不存在或已删除: {file_id}",
BizCode.NOT_FOUND
)
-
+
# 返回永久URL
server_url = settings.FILE_LOCAL_SERVER_URL
return f"{server_url}/storage/permanent/{file_id}"
@@ -370,58 +529,79 @@ class MultimodalService:
Returns:
str: 提取的文本内容
"""
- generic_file = self.db.query(GenericFile).filter(
- GenericFile.id == file_id,
- GenericFile.status == "active"
+ file_metadata = self.db.query(FileMetadata).filter(
+ FileMetadata.id == file_id,
+ FileMetadata.status == "completed"
).first()
- if not generic_file:
+ if not file_metadata:
raise BusinessException(
f"文件不存在或已删除: {file_id}",
BizCode.NOT_FOUND
)
- # TODO: 根据文件类型提取文本
- # - PDF: 使用 PyPDF2 或 pdfplumber
- # - Word: 使用 python-docx
- # - TXT/MD: 直接读取
-
- file_ext = generic_file.file_ext.lower()
+ file_ext = file_metadata.file_ext.lower()
+ server_url = settings.FILE_LOCAL_SERVER_URL
+ file_url = f"{server_url}/storage/permanent/{file_id}"
if file_ext in ['.txt', '.md', '.markdown']:
- return await self._read_text_file(generic_file.storage_path)
+ return await self._read_text_file(file_url)
elif file_ext == '.pdf':
- return await self._extract_pdf_text(generic_file.storage_path)
+ return await self._extract_pdf_text(file_url)
elif file_ext in ['.doc', '.docx']:
- return await self._extract_word_text(generic_file.storage_path)
+ return await self._extract_word_text(file_url)
else:
return f"[不支持的文档格式: {file_ext}]"
-    async def _read_text_file(self, storage_path: str) -> str:
+    @staticmethod
+    async def _read_text_file(file_url: str) -> str:
         """读取纯文本文件"""
         try:
-            with open(storage_path, 'r', encoding='utf-8') as f:
-                return f.read()
+            # Fetch the file over HTTP (30 s timeout) and return the response body as text
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(file_url)
+                response.raise_for_status()
+                return response.text
         except Exception as e:
             logger.error(f"读取文本文件失败: {e}")
             return f"[文件读取失败: {str(e)}]"
-    async def _extract_pdf_text(self, storage_path: str) -> str:
+    @staticmethod
+    async def _extract_pdf_text(file_url: str) -> str:
         """提取 PDF 文本"""
         try:
-            # TODO: 实现 PDF 文本提取
-            # import PyPDF2 或 pdfplumber
-            return "[PDF 文本提取功能待实现]"
+            # Download the PDF bytes over HTTP (30 s timeout)
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(file_url)
+                response.raise_for_status()
+                pdf_data = response.content
+
+            # Parse the bytes in memory and join per-page text with newlines; NOTE(review): parsing is synchronous inside this coroutine
+            text_parts = []
+            pdf_file = io.BytesIO(pdf_data)
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            for page in pdf_reader.pages:
+                text_parts.append(page.extract_text())
+            return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"提取 PDF 文本失败: {e}")
             return f"[PDF 提取失败: {str(e)}]"
-    async def _extract_word_text(self, storage_path: str) -> str:
+    @staticmethod
+    async def _extract_word_text(file_url: str) -> str:
         """提取 Word 文档文本"""
         try:
-            # TODO: 实现 Word 文本提取
-            # import docx
-            return "[Word 文本提取功能待实现]"
+            # Download the Word file bytes over HTTP (30 s timeout)
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(file_url)
+                response.raise_for_status()
+                word_data = response.content
+
+            # Parse the bytes in memory; only the text of doc.paragraphs is collected, joined with newlines
+            word_file = io.BytesIO(word_data)
+            doc = Document(word_file)
+            text_parts = [paragraph.text for paragraph in doc.paragraphs]
+            return '\n'.join(text_parts)
         except Exception as e:
             logger.error(f"提取 Word 文本失败: {e}")
             return f"[Word 提取失败: {str(e)}]"
diff --git a/api/app/services/prompt_optimizer_service.py b/api/app/services/prompt_optimizer_service.py
index 99edcc0e..184220a8 100644
--- a/api/app/services/prompt_optimizer_service.py
+++ b/api/app/services/prompt_optimizer_service.py
@@ -184,7 +184,8 @@ class PromptOptimizerService:
model_name=api_config.model_name,
provider=api_config.provider,
api_key=api_config.api_key,
- base_url=api_config.api_base
+ base_url=api_config.api_base,
+ is_omni=api_config.is_omni
), type=ModelType(model_config.type))
try:
prompt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'prompt')
diff --git a/api/app/services/shared_chat_service.py b/api/app/services/shared_chat_service.py
index 89d3f3d6..0d659832 100644
--- a/api/app/services/shared_chat_service.py
+++ b/api/app/services/shared_chat_service.py
@@ -247,6 +247,7 @@ class SharedChatService:
api_key=api_key_obj.api_key,
provider=api_key_obj.provider,
api_base=api_key_obj.api_base,
+ is_omni=api_key_obj.is_omni,
temperature=model_parameters.get("temperature", 0.7),
max_tokens=model_parameters.get("max_tokens", 2000),
system_prompt=system_prompt,
@@ -454,6 +455,7 @@ class SharedChatService:
api_key=api_key_obj.api_key,
provider=api_key_obj.provider,
api_base=api_key_obj.api_base,
+ is_omni=api_key_obj.is_omni,
temperature=model_parameters.get("temperature", 0.7),
max_tokens=model_parameters.get("max_tokens", 2000),
system_prompt=system_prompt,