diff --git a/api/app/controllers/app_controller.py b/api/app/controllers/app_controller.py index e2849ad6..653f616c 100644 --- a/api/app/controllers/app_controller.py +++ b/api/app/controllers/app_controller.py @@ -835,7 +835,8 @@ async def draft_run_compare( web_search=True, memory=True, parallel=payload.parallel, - timeout=payload.timeout or 60 + timeout=payload.timeout or 60, + files=payload.files ) logger.info( diff --git a/api/app/controllers/model_controller.py b/api/app/controllers/model_controller.py index bb1ba526..0de3d4fe 100644 --- a/api/app/controllers/model_controller.py +++ b/api/app/controllers/model_controller.py @@ -469,7 +469,9 @@ async def create_model_api_key_by_provider( config=api_key_data.config, is_active=api_key_data.is_active, priority=api_key_data.priority, - model_config_ids=model_config_ids + model_config_ids=model_config_ids, + capability=api_key_data.capability, + is_omni=api_key_data.is_omni ) created_keys, failed_models = await ModelApiKeyService.create_api_key_by_provider(db=db, data=create_data) diff --git a/api/app/controllers/ontology_controller.py b/api/app/controllers/ontology_controller.py index e4a87141..42d4bee0 100644 --- a/api/app/controllers/ontology_controller.py +++ b/api/app/controllers/ontology_controller.py @@ -124,15 +124,23 @@ def _get_ontology_service( ) # 通过 Repository 获取可用的 API Key(负载均衡逻辑由 Repository 处理) - from app.repositories.model_repository import ModelApiKeyRepository - api_keys = ModelApiKeyRepository.get_by_model_config(db, model_config.id) - if not api_keys: + # from app.repositories.model_repository import ModelApiKeyRepository + from app.services.model_service import ModelApiKeyService + api_key_config = ModelApiKeyService.get_available_api_key(db, model_config.id) + if not api_key_config: logger.error(f"Model {llm_id} has no active API key") raise HTTPException( status_code=400, detail="指定的LLM模型没有可用的API密钥" ) - api_key_config = api_keys[0] + # api_keys = ModelApiKeyRepository.get_by_model_config(db, 
model_config.id) + # if not api_keys: + # logger.error(f"Model {llm_id} has no active API key") + # raise HTTPException( + # status_code=400, + # detail="指定的LLM模型没有可用的API密钥" + # ) + # api_key_config = api_keys[0] is_composite = getattr(model_config, 'is_composite', False) logger.info( @@ -154,6 +162,7 @@ def _get_ontology_service( provider=actual_provider, api_key=api_key_config.api_key, base_url=api_key_config.api_base, + is_omni=api_key_config.is_omni, max_retries=3, timeout=60.0 ) diff --git a/api/app/core/agent/langchain_agent.py b/api/app/core/agent/langchain_agent.py index fae20ea2..88b6371c 100644 --- a/api/app/core/agent/langchain_agent.py +++ b/api/app/core/agent/langchain_agent.py @@ -11,35 +11,37 @@ LangChain Agent 封装 import time from typing import Any, AsyncGenerator, Dict, List, Optional, Sequence -from app.core.memory.agent.langgraph_graph.write_graph import write_long_term +from app.core.memory.agent.langgraph_graph.write_graph import write_long_term from app.db import get_db from app.core.logging_config import get_business_logger from app.core.models import RedBearLLM, RedBearModelConfig -from app.models.models_model import ModelType +from app.models.models_model import ModelType, ModelProvider from app.services.memory_agent_service import ( get_end_user_connected_config, ) from langchain.agents import create_agent from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage from langchain_core.tools import BaseTool + logger = get_business_logger() class LangChainAgent: def __init__( - self, - model_name: str, - api_key: str, - provider: str = "openai", - api_base: Optional[str] = None, - temperature: float = 0.7, - max_tokens: int = 2000, - system_prompt: Optional[str] = None, - tools: Optional[Sequence[BaseTool]] = None, - streaming: bool = False, - max_iterations: Optional[int] = None, # 最大迭代次数(None 表示自动计算) - max_tool_consecutive_calls: int = 3 # 单个工具最大连续调用次数 + self, + model_name: str, + api_key: str, + provider: str = 
"openai", + api_base: Optional[str] = None, + is_omni: bool = False, + temperature: float = 0.7, + max_tokens: int = 2000, + system_prompt: Optional[str] = None, + tools: Optional[Sequence[BaseTool]] = None, + streaming: bool = False, + max_iterations: Optional[int] = None, # 最大迭代次数(None 表示自动计算) + max_tool_consecutive_calls: int = 3 # 单个工具最大连续调用次数 ): """初始化 LangChain Agent @@ -60,12 +62,13 @@ class LangChainAgent: self.provider = provider self.tools = tools or [] self.streaming = streaming + self.is_omni = is_omni self.max_tool_consecutive_calls = max_tool_consecutive_calls - + # 工具调用计数器:记录每个工具的连续调用次数 self.tool_call_counter: Dict[str, int] = {} self.last_tool_called: Optional[str] = None - + # 根据工具数量动态调整最大迭代次数 # 基础值 + 每个工具额外的调用机会 if max_iterations is None: @@ -73,9 +76,9 @@ class LangChainAgent: self.max_iterations = 5 + len(self.tools) * 2 else: self.max_iterations = max_iterations - + self.system_prompt = system_prompt or "你是一个专业的AI助手" - + logger.debug( f"Agent 迭代次数配置: max_iterations={self.max_iterations}, " f"tool_count={len(self.tools)}, " @@ -89,6 +92,7 @@ class LangChainAgent: provider=provider, api_key=api_key, base_url=api_base, + is_omni=is_omni, extra_params={ "temperature": temperature, "max_tokens": max_tokens, @@ -143,21 +147,22 @@ class LangChainAgent: """ from langchain_core.tools import StructuredTool from functools import wraps - + wrapped_tools = [] - + for original_tool in tools: tool_name = original_tool.name original_func = original_tool.func if hasattr(original_tool, 'func') else None - + if not original_func: # 如果无法获取原始函数,直接使用原工具 wrapped_tools.append(original_tool) continue - + # 创建包装函数 def make_wrapped_func(tool_name, original_func): """创建包装函数的工厂函数,避免闭包问题""" + @wraps(original_func) def wrapped_func(*args, **kwargs): """包装后的工具函数,跟踪连续调用次数""" @@ -168,13 +173,13 @@ class LangChainAgent: # 切换到新工具,重置计数器 self.tool_call_counter[tool_name] = 1 self.last_tool_called = tool_name - + current_count = self.tool_call_counter[tool_name] - + logger.debug( 
f"工具调用: {tool_name}, 连续调用次数: {current_count}/{self.max_tool_consecutive_calls}" ) - + # 检查是否超过最大连续调用次数 if current_count > self.max_tool_consecutive_calls: logger.warning( @@ -185,12 +190,12 @@ class LangChainAgent: f"工具 '{tool_name}' 已连续调用 {self.max_tool_consecutive_calls} 次," f"未找到有效结果。请尝试其他方法或直接回答用户的问题。" ) - + # 调用原始工具函数 return original_func(*args, **kwargs) - + return wrapped_func - + # 使用 StructuredTool 创建新工具 wrapped_tool = StructuredTool( name=original_tool.name, @@ -198,17 +203,17 @@ class LangChainAgent: func=make_wrapped_func(tool_name, original_func), args_schema=original_tool.args_schema if hasattr(original_tool, 'args_schema') else None ) - + wrapped_tools.append(wrapped_tool) - + return wrapped_tools def _prepare_messages( - self, - message: str, - history: Optional[List[Dict[str, str]]] = None, - context: Optional[str] = None, - files: Optional[List[Dict[str, Any]]] = None + self, + message: str, + history: Optional[List[Dict[str, str]]] = None, + context: Optional[str] = None, + files: Optional[List[Dict[str, Any]]] = None ) -> List[BaseMessage]: """准备消息列表 @@ -248,7 +253,7 @@ class LangChainAgent: messages.append(HumanMessage(content=user_content)) return messages - + def _build_multimodal_content(self, text: str, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ 构建多模态消息内容 @@ -261,23 +266,26 @@ class LangChainAgent: List[Dict]: 消息内容列表 """ # 根据 provider 使用不同的文本格式 - if self.provider.lower() in ["bedrock", "anthropic"]: - # Anthropic/Bedrock: {"type": "text", "text": "..."} - content_parts = [{"type": "text", "text": text}] - else: - # 通义千问等: {"text": "..."} - content_parts = [{"text": text}] - + # if (self.provider.lower() in [ModelProvider.BEDROCK, ModelProvider.OPENAI, ModelProvider.XINFERENCE, + # ModelProvider.GPUSTACK] or ( + # self.provider.lower() == ModelProvider.DASHSCOPE and self.is_omni)): + # # Anthropic/Bedrock/Xinference/Gpustack/Openai: {"type": "text", "text": "..."} + # content_parts = [{"type": "text", "text": text}] + # else: 
+ # # 通义千问等: {"text": "..."} + # content_parts = [{"type": "text", "text": text}] + content_parts = [{"type": "text", "text": text}] + # 添加文件内容 # MultimodalService 已经根据 provider 返回了正确格式,直接使用 content_parts.extend(files) - + logger.debug( f"构建多模态消息: provider={self.provider}, " f"parts={len(content_parts)}, " f"files={len(files)}" ) - + return content_parts async def chat( @@ -302,7 +310,7 @@ class LangChainAgent: Returns: Dict: 包含 content 和元数据的字典 """ - message_chat= message + message_chat = message start_time = time.time() actual_config_id = config_id # If config_id is None, try to get from end_user's connected config @@ -322,8 +330,8 @@ class LangChainAgent: except Exception as e: logger.warning(f"Failed to get db session: {e}") actual_end_user_id = end_user_id if end_user_id is not None else "unknown" - logger.info(f'写入类型{storage_type,str(end_user_id), message, str(user_rag_memory_id)}') - print(f'写入类型{storage_type,str(end_user_id), message, str(user_rag_memory_id)}') + logger.info(f'写入类型{storage_type, str(end_user_id), message, str(user_rag_memory_id)}') + print(f'写入类型{storage_type, str(end_user_id), message, str(user_rag_memory_id)}') try: # 准备消息列表(支持多模态) messages = self._prepare_messages(message, history, context, files) @@ -367,14 +375,14 @@ class LangChainAgent: # 获取最后的 AI 消息 output_messages = result.get("messages", []) content = "" - + logger.debug(f"输出消息数量: {len(output_messages)}") total_tokens = 0 for msg in reversed(output_messages): if isinstance(msg, AIMessage): logger.debug(f"找到 AI 消息,content 类型: {type(msg.content)}") logger.debug(f"AI 消息内容: {msg.content}") - + # 处理多模态响应:content 可能是字符串或列表 if isinstance(msg.content, str): content = msg.content @@ -407,12 +415,13 @@ class LangChainAgent: response_meta = msg.response_metadata if hasattr(msg, 'response_metadata') else None total_tokens = response_meta.get("token_usage", {}).get("total_tokens", 0) if response_meta else 0 break - + logger.info(f"最终提取的内容长度: {len(content)}") elapsed_time = time.time() - 
start_time if memory_flag: - await write_long_term(storage_type, end_user_id, message_chat, content, user_rag_memory_id, actual_config_id) + await write_long_term(storage_type, end_user_id, message_chat, content, user_rag_memory_id, + actual_config_id) response = { "content": content, "model": self.model_name, @@ -439,16 +448,16 @@ class LangChainAgent: raise async def chat_stream( - self, - message: str, - history: Optional[List[Dict[str, str]]] = None, - context: Optional[str] = None, - end_user_id:Optional[str] = None, - config_id: Optional[str] = None, - storage_type:Optional[str] = None, - user_rag_memory_id:Optional[str] = None, - memory_flag: Optional[bool] = True, - files: Optional[List[Dict[str, Any]]] = None # 新增:多模态文件 + self, + message: str, + history: Optional[List[Dict[str, str]]] = None, + context: Optional[str] = None, + end_user_id: Optional[str] = None, + config_id: Optional[str] = None, + storage_type: Optional[str] = None, + user_rag_memory_id: Optional[str] = None, + memory_flag: Optional[bool] = True, + files: Optional[List[Dict[str, Any]]] = None # 新增:多模态文件 ) -> AsyncGenerator[str, None]: """执行流式对话 @@ -482,7 +491,6 @@ class LangChainAgent: except Exception as e: logger.warning(f"Failed to get db session: {e}") - # 注意:不在这里写入用户消息,等 AI 回复后一起写入 try: # 准备消息列表(支持多模态) @@ -500,13 +508,13 @@ class LangChainAgent: full_content = '' try: async for event in self.agent.astream_events( - {"messages": messages}, - version="v2", - config={"recursion_limit": self.max_iterations} + {"messages": messages}, + version="v2", + config={"recursion_limit": self.max_iterations} ): chunk_count += 1 kind = event.get("event") - + # 处理所有可能的流式事件 if kind == "on_chat_model_stream": # LLM 流式输出 @@ -540,7 +548,7 @@ class LangChainAgent: full_content += item yield item yielded_content = True - + elif kind == "on_llm_stream": # 另一种 LLM 流式事件 chunk = event.get("data", {}).get("chunk") @@ -577,13 +585,13 @@ class LangChainAgent: full_content += chunk yield chunk yielded_content = 
True - + # 记录工具调用(可选) elif kind == "on_tool_start": logger.debug(f"工具调用开始: {event.get('name')}") elif kind == "on_tool_end": logger.debug(f"工具调用结束: {event.get('name')}") - + logger.debug(f"Agent 流式完成,共 {chunk_count} 个事件") # 统计token消耗 output_messages = event.get("data", {}).get("output", {}).get("messages", []) @@ -595,7 +603,8 @@ class LangChainAgent: yield total_tokens break if memory_flag: - await write_long_term(storage_type, end_user_id, message_chat, full_content, user_rag_memory_id, actual_config_id) + await write_long_term(storage_type, end_user_id, message_chat, full_content, user_rag_memory_id, + actual_config_id) except Exception as e: logger.error(f"Agent astream_events 失败: {str(e)}", exc_info=True) raise @@ -609,5 +618,3 @@ class LangChainAgent: logger.info("=" * 80) logger.info("chat_stream 方法执行结束") logger.info("=" * 80) - - diff --git a/api/app/core/models/base.py b/api/app/core/models/base.py index f5f49af0..5d4dbd10 100644 --- a/api/app/core/models/base.py +++ b/api/app/core/models/base.py @@ -27,6 +27,7 @@ class RedBearModelConfig(BaseModel): provider: str api_key: str base_url: Optional[str] = None + is_omni: bool = False # 是否为 Omni 模型 # 请求超时时间(秒)- 默认120秒以支持复杂的LLM调用,可通过环境变量 LLM_TIMEOUT 配置 timeout: float = Field(default_factory=lambda: float(os.getenv("LLM_TIMEOUT", "120.0"))) # 最大重试次数 - 默认2次以避免过长等待,可通过环境变量 LLM_MAX_RETRIES 配置 @@ -45,7 +46,28 @@ class RedBearModelFactory: # 打印供应商信息用于调试 from app.core.logging_config import get_business_logger logger = get_business_logger() - logger.debug(f"获取模型参数 - Provider: {provider}, Model: {config.model_name}") + logger.debug(f"获取模型参数 - Provider: {provider}, Model: {config.model_name}, is_omni: {config.is_omni}") + + # dashscope 的 omni 模型使用 OpenAI 兼容模式 + if provider == ModelProvider.DASHSCOPE and config.is_omni: + import httpx + if not config.base_url: + config.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + timeout_config = httpx.Timeout( + timeout=config.timeout, + connect=60.0, + 
read=config.timeout, + write=60.0, + pool=10.0, + ) + return { + "model": config.model_name, + "base_url": config.base_url, + "api_key": config.api_key, + "timeout": timeout_config, + "max_retries": config.max_retries, + **config.extra_params + } if provider in [ModelProvider.OPENAI, ModelProvider.XINFERENCE, ModelProvider.GPUSTACK, ModelProvider.OLLAMA]: # 使用 httpx.Timeout 对象来设置详细的超时配置 @@ -135,6 +157,12 @@ class RedBearModelFactory: def get_provider_llm_class(config:RedBearModelConfig, type: ModelType=ModelType.LLM) -> type[BaseLLM]: """根据模型提供商获取对应的模型类""" provider = config.provider.lower() + + # dashscope 的 omni 模型使用 OpenAI 兼容模式 + if provider == ModelProvider.DASHSCOPE and config.is_omni: + from langchain_openai import ChatOpenAI + return ChatOpenAI + if provider in [ModelProvider.OPENAI, ModelProvider.XINFERENCE, ModelProvider.GPUSTACK] : if type == ModelType.LLM: from langchain_openai import OpenAI diff --git a/api/app/core/models/scripts/bedrock_models.yaml b/api/app/core/models/scripts/bedrock_models.yaml index e5b91d1c..2c0ab757 100644 --- a/api/app/core/models/scripts/bedrock_models.yaml +++ b/api/app/core/models/scripts/bedrock_models.yaml @@ -6,6 +6,8 @@ models: description: AI21 Labs大语言模型,completion生成模式,256000上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 logo: bedrock @@ -15,6 +17,9 @@ models: description: Amazon Nova大语言模型,支持智能体思考、工具调用、流式工具调用、视觉能力,300000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -28,6 +33,9 @@ models: description: Anthropic Claude大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用、文档处理,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -42,6 +50,8 @@ models: description: Cohere大语言模型,支持智能体思考、工具调用、流式工具调用,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -54,6 +64,9 @@ models: 
description: DeepSeek大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用,32768上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -67,6 +80,8 @@ models: description: Meta Llama大语言模型,支持智能体思考、工具调用,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -78,6 +93,8 @@ models: description: Mistral AI大语言模型,支持智能体思考、工具调用,32000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -89,6 +106,8 @@ models: description: OpenAI大语言模型,支持智能体思考、工具调用、流式工具调用,32768上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -101,6 +120,8 @@ models: description: Qwen大语言模型,支持智能体思考、工具调用、流式工具调用,32768上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -113,6 +134,8 @@ models: description: amazon.rerank-v1:0重排序模型,5120上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 重排序模型 logo: bedrock @@ -122,6 +145,8 @@ models: description: cohere.rerank-v3-5:0重排序模型,5120上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 重排序模型 logo: bedrock @@ -131,6 +156,9 @@ models: description: amazon.nova-2-multimodal-embeddings-v1:0文本嵌入模型,支持视觉能力,8192上下文窗口 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 文本嵌入模型 - vision @@ -141,6 +169,8 @@ models: description: amazon.titan-embed-text-v1文本嵌入模型,8192上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本嵌入模型 logo: bedrock @@ -150,6 +180,8 @@ models: description: amazon.titan-embed-text-v2:0文本嵌入模型,8192上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本嵌入模型 logo: bedrock @@ -159,6 +191,8 @@ models: description: Cohere Embed 3 English文本嵌入模型,512上下文窗口 is_deprecated: false is_official: true + 
capability: [] + is_omni: false tags: - 文本嵌入模型 logo: bedrock @@ -168,6 +202,8 @@ models: description: Cohere Embed 3 Multilingual文本嵌入模型,512上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本嵌入模型 - logo: bedrock + logo: bedrock \ No newline at end of file diff --git a/api/app/core/models/scripts/dashscope_models.yaml b/api/app/core/models/scripts/dashscope_models.yaml index af1c3619..89a16966 100644 --- a/api/app/core/models/scripts/dashscope_models.yaml +++ b/api/app/core/models/scripts/dashscope_models.yaml @@ -6,6 +6,8 @@ models: description: DeepSeek-R1-Distill-Qwen-14B大语言模型,支持智能体思考,32000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -16,6 +18,8 @@ models: description: DeepSeek-R1-Distill-Qwen-32B大语言模型,支持智能体思考,32000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -26,6 +30,8 @@ models: description: DeepSeek-R1大语言模型,支持智能体思考,131072超大上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -36,6 +42,8 @@ models: description: DeepSeek-V3.1大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -46,6 +54,8 @@ models: description: DeepSeek-V3.2-exp实验版大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -56,6 +66,8 @@ models: description: DeepSeek-V3.2大语言模型,支持智能体思考,131072超大上下文窗口,对话模式,支持丰富生成参数调节 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -66,6 +78,8 @@ models: description: DeepSeek-V3大语言模型,支持智能体思考,64000上下文窗口,对话模式,支持文本与JSON格式输出 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -76,6 +90,8 @@ models: description: 
farui-plus大语言模型,支持多工具调用、智能体思考、流式工具调用,12288上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -88,6 +104,8 @@ models: description: GLM-4.7大语言模型,支持多工具调用、智能体思考、流式工具调用,202752超大上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -100,6 +118,9 @@ models: description: qvq-max-latest大语言模型,支持视觉、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - vision @@ -112,6 +133,9 @@ models: description: qvq-max大语言模型,支持视觉、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - vision @@ -124,6 +148,8 @@ models: description: qwen-coder-turbo-0919代码专用大语言模型,支持智能体思考,131072上下文窗口,对话模式,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 代码模型 @@ -135,6 +161,8 @@ models: description: qwen-max-latest大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -147,6 +175,8 @@ models: description: qwen-max-longcontext长上下文大语言模型,支持多工具调用、智能体思考、流式工具调用,32000上下文窗口,对话模式,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -159,6 +189,8 @@ models: description: qwen-max大语言模型,支持多工具调用、智能体思考、流式工具调用,32768上下文窗口,对话模式,支持联网搜索 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -171,6 +203,8 @@ models: description: qwen-mt-plus多语言翻译大语言模型,支持智能体思考,16384上下文窗口,对话模式,支持多语种互译与领域翻译适配 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 翻译模型 @@ -182,6 +216,8 @@ models: description: qwen-mt-turbo轻量化多语言翻译大语言模型,支持智能体思考,16384上下文窗口,对话模式,支持多语种互译与领域翻译适配 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 翻译模型 @@ -193,6 +229,8 @@ 
models: description: qwen-plus-0112大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -205,6 +243,8 @@ models: description: qwen-plus-0125大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -217,6 +257,8 @@ models: description: qwen-plus-0723大语言模型,支持多工具调用、智能体思考、流式工具调用,32000上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -229,6 +271,8 @@ models: description: qwen-plus-0806大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -241,6 +285,8 @@ models: description: qwen-plus-0919大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -253,6 +299,8 @@ models: description: qwen-plus-1125大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -265,6 +313,8 @@ models: description: qwen-plus-1127大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,支持联网搜索,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -277,6 +327,8 @@ models: description: qwen-plus-1220大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -289,6 +341,10 @@ models: description: qwen-vl-max多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -302,6 +358,10 @@ models: description: qwen-vl-plus-0809多模态大模型,支持视觉理解、智能体思考、视频理解,32768上下文窗口,对话模式,已废弃 
is_deprecated: true is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -315,6 +375,10 @@ models: description: qwen-vl-plus-2025-01-02多模态大模型,支持视觉理解、智能体思考、视频理解,32768上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -328,6 +392,10 @@ models: description: qwen-vl-plus-2025-01-25多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -341,6 +409,10 @@ models: description: qwen-vl-plus-latest多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -354,6 +426,10 @@ models: description: qwen-vl-plus多模态大模型,支持视觉理解、智能体思考、视频理解,131072上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -367,6 +443,8 @@ models: description: qwen2.5-0.5b-instruct大语言模型,支持多工具调用、智能体思考、流式工具调用,32768上下文窗口,对话模式,未废弃 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -379,6 +457,8 @@ models: description: qwen3-14b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -391,6 +471,8 @@ models: description: qwen3-235b-a22b-instruct-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -403,6 +485,8 @@ models: description: qwen3-235b-a22b-thinking-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -415,6 +499,8 @@ models: description: qwen3-235b-a22b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] 
+ is_omni: false tags: - 大语言模型 - multi-tool-call @@ -427,6 +513,8 @@ models: description: qwen3-30b-a3b-instruct-2507大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -439,6 +527,8 @@ models: description: qwen3-30b-a3b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -451,6 +541,8 @@ models: description: qwen3-32b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -463,6 +555,8 @@ models: description: qwen3-4b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -475,6 +569,8 @@ models: description: qwen3-8b大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -487,6 +583,8 @@ models: description: qwen3-coder-30b-a3b-instruct大语言模型,支持智能体思考,262144上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 代码模型 @@ -498,6 +596,8 @@ models: description: qwen3-coder-480b-a35b-instruct大语言模型,支持智能体思考,262144上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 代码模型 @@ -509,6 +609,8 @@ models: description: qwen3-coder-plus-2025-09-23大语言模型,支持智能体思考,1000000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 代码模型 @@ -520,6 +622,8 @@ models: description: qwen3-coder-plus大语言模型,支持智能体思考,1000000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - 代码模型 @@ -531,6 +635,8 @@ models: description: qwen3-max-2025-09-23大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索 is_deprecated: false is_official: true + capability: [] 
+ is_omni: false tags: - 大语言模型 - multi-tool-call @@ -544,6 +650,8 @@ models: description: qwen3-max-2026-01-23大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -557,6 +665,8 @@ models: description: qwen3-max-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -569,6 +679,8 @@ models: description: qwen3-max大语言模型,支持多工具调用、智能体思考、流式工具调用,262144上下文窗口,对话模式,支持联网搜索 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -582,6 +694,8 @@ models: description: qwen3-next-80b-a3b-instruct大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -594,6 +708,8 @@ models: description: qwen3-next-80b-a3b-thinking大语言模型,支持多工具调用、智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -606,6 +722,11 @@ models: description: qwen3-omni-flash-2025-12-01多模态大语言模型,支持视觉、智能体思考、视频、音频能力,65536上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + - audio + is_omni: true tags: - 大语言模型 - 多模态模型 @@ -620,6 +741,10 @@ models: description: qwen3-vl-235b-a22b-instruct多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -635,6 +760,10 @@ models: description: qwen3-vl-235b-a22b-thinking多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -650,6 +779,10 @@ models: description: qwen3-vl-30b-a3b-instruct多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + 
- vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -665,6 +798,10 @@ models: description: qwen3-vl-30b-a3b-thinking多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -680,6 +817,10 @@ models: description: qwen3-vl-flash多模态大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉、视频能力,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -695,6 +836,10 @@ models: description: qwen3-vl-plus-2025-09-23多模态大语言模型,支持视觉、智能体思考、视频能力,262144上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -708,6 +853,10 @@ models: description: qwen3-vl-plus多模态大语言模型,支持视觉、智能体思考、视频能力,262144上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - video + is_omni: false tags: - 大语言模型 - 多模态模型 @@ -721,6 +870,8 @@ models: description: qwq-32b大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -732,6 +883,8 @@ models: description: qwq-plus-0305大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -743,6 +896,8 @@ models: description: qwq-plus大语言模型,支持智能体思考、流式工具调用,131072上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -754,6 +909,8 @@ models: description: gte-rerank-v2重排序模型,4000上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 重排序模型 logo: dashscope @@ -763,6 +920,8 @@ models: description: gte-rerank重排序模型,4000上下文窗口 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 重排序模型 logo: dashscope @@ -772,6 +931,9 @@ models: description: multimodal-embedding-v1多模态嵌入模型,支持视觉能力,8192上下文窗口,最大分块数10 is_deprecated: false is_official: 
true + capability: + - vision + is_omni: false tags: - 嵌入模型 - 多模态模型 @@ -783,6 +945,8 @@ models: description: text-embedding-v1文本嵌入模型,2048上下文窗口,最大分块数25 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 嵌入模型 - 文本嵌入 @@ -793,6 +957,8 @@ models: description: text-embedding-v2文本嵌入模型,2048上下文窗口,最大分块数25 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 嵌入模型 - 文本嵌入 @@ -803,6 +969,8 @@ models: description: text-embedding-v3文本嵌入模型,8192上下文窗口,最大分块数10 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 嵌入模型 - 文本嵌入 @@ -813,7 +981,9 @@ models: description: text-embedding-v4文本嵌入模型,8192上下文窗口,最大分块数10 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 嵌入模型 - 文本嵌入 - logo: dashscope + logo: dashscope \ No newline at end of file diff --git a/api/app/core/models/scripts/loader.py b/api/app/core/models/scripts/loader.py index a14d3268..e4462efa 100644 --- a/api/app/core/models/scripts/loader.py +++ b/api/app/core/models/scripts/loader.py @@ -6,7 +6,7 @@ from typing import Callable import yaml from sqlalchemy.orm import Session -from app.models.models_model import ModelBase, ModelProvider +from app.models.models_model import ModelBase, ModelProvider, ModelConfig def _load_yaml_config(provider: ModelProvider) -> list[dict]: @@ -55,6 +55,15 @@ def load_models(db: Session, providers: list[str] = None, silent: bool = False) print(f"\n正在加载 {provider.value} 的 {len(models)} 个模型...") for model_data in models: + config_sync_fields = { + "logo": None, + "capability": None, + "is_omni": None, + "name": None, + "provider": None, + "type": None, + "description": None + } try: # 检查模型是否已存在 existing = db.query(ModelBase).filter( @@ -66,6 +75,40 @@ def load_models(db: Session, providers: list[str] = None, silent: bool = False) # 更新现有模型配置 for key, value in model_data.items(): setattr(existing, key, value) + + # 更新绑定了该 model_id 的 ModelConfig 和 ModelApiKey + sync_fields = [k for k in 
config_sync_fields.keys() if k in model_data] + if sync_fields: + # 批量更新 ModelConfig + update_kwargs = {k: model_data[k] for k in sync_fields} + db.query(ModelConfig).filter(ModelConfig.model_id == existing.id).update( + update_kwargs, + synchronize_session=False + ) + + # 更新 ModelApiKey 的 capability 和 is_omni + if 'capability' in model_data or 'is_omni' in model_data: + from app.models.models_model import ModelApiKey, model_config_api_key_association + api_key_update = {} + if 'capability' in model_data: + api_key_update['capability'] = model_data['capability'] + if 'is_omni' in model_data: + api_key_update['is_omni'] = model_data['is_omni'] + + if api_key_update: + # 查找所有关联的 API Key + api_key_ids = db.query(model_config_api_key_association.c.api_key_id).join( + ModelConfig, + ModelConfig.id == model_config_api_key_association.c.model_config_id + ).filter(ModelConfig.model_id == existing.id).distinct().all() + + if api_key_ids: + api_key_ids = [aid[0] for aid in api_key_ids] + db.query(ModelApiKey).filter(ModelApiKey.id.in_(api_key_ids)).update( + api_key_update, + synchronize_session=False + ) + db.commit() if not silent: print(f"更新成功: {model_data['name']}") diff --git a/api/app/core/models/scripts/openai_models.yaml b/api/app/core/models/scripts/openai_models.yaml index 68c63ee2..7f6d3a51 100644 --- a/api/app/core/models/scripts/openai_models.yaml +++ b/api/app/core/models/scripts/openai_models.yaml @@ -6,12 +6,19 @@ models: description: chatgpt-4o-latest大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + - audio + - video + is_omni: true tags: - 大语言模型 - multi-tool-call - agent-thought - stream-tool-call - vision + - audio + - video logo: openai - name: gpt-3.5-turbo-0125 type: llm @@ -19,6 +26,8 @@ models: description: gpt-3.5-turbo-0125大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -31,6 +40,8 @@ 
models: description: gpt-3.5-turbo-1106大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -43,6 +54,8 @@ models: description: gpt-3.5-turbo-16k大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -55,6 +68,8 @@ models: description: gpt-3.5-turbo-instruct大语言模型,4096上下文窗口,文本补全模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 logo: openai @@ -64,6 +79,8 @@ models: description: gpt-3.5-turbo大语言模型,支持多工具调用、智能体思考、流式工具调用,16385上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -76,6 +93,8 @@ models: description: gpt-4-0125-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -88,6 +107,8 @@ models: description: gpt-4-1106-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -100,6 +121,9 @@ models: description: gpt-4-turbo-2024-04-09大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -113,6 +137,8 @@ models: description: gpt-4-turbo-preview大语言模型,支持多工具调用、智能体思考、流式工具调用,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -125,6 +151,9 @@ models: description: gpt-4-turbo大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力,128000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -138,6 +167,8 @@ models: description: o1-preview大语言模型,支持智能体思考,128000上下文窗口,对话模式,已废弃 is_deprecated: true is_official: true + capability: [] + is_omni: false tags: 
- 大语言模型 - agent-thought @@ -148,6 +179,9 @@ models: description: o1大语言模型,支持多工具调用、智能体思考、流式工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - multi-tool-call @@ -162,6 +196,9 @@ models: description: o3-2025-04-16大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -176,6 +213,8 @@ models: description: o3-mini-2025-01-31大语言模型,支持智能体思考、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -189,6 +228,8 @@ models: description: o3-mini大语言模型,支持智能体思考、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 大语言模型 - agent-thought @@ -202,6 +243,9 @@ models: description: o3-pro-2025-06-10大语言模型,支持智能体思考、工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -215,6 +259,9 @@ models: description: o3-pro大语言模型,支持智能体思考、工具调用、视觉能力、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -228,6 +275,9 @@ models: description: o3大语言模型,支持智能体思考、视觉能力、工具调用、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -242,6 +292,9 @@ models: description: o4-mini-2025-04-16大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -256,6 +309,9 @@ models: description: o4-mini大语言模型,支持智能体思考、工具调用、视觉能力、流式工具调用、结构化输出,200000上下文窗口,对话模式 is_deprecated: false is_official: true + capability: + - vision + is_omni: false tags: - 大语言模型 - agent-thought @@ -270,6 +326,8 @@ models: description: 
text-embedding-3-large文本向量模型,8191上下文窗口,最大分块数32 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本向量模型 logo: openai @@ -279,6 +337,8 @@ models: description: text-embedding-3-small文本向量模型,8191上下文窗口,最大分块数32 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本向量模型 logo: openai @@ -288,6 +348,8 @@ models: description: text-embedding-ada-002文本向量模型,8097上下文窗口,最大分块数32 is_deprecated: false is_official: true + capability: [] + is_omni: false tags: - 文本向量模型 - logo: openai + logo: openai \ No newline at end of file diff --git a/api/app/models/models_model.py b/api/app/models/models_model.py index 3e378f17..23fafcef 100644 --- a/api/app/models/models_model.py +++ b/api/app/models/models_model.py @@ -2,7 +2,7 @@ import datetime import uuid from enum import StrEnum -from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey, Enum as SQLEnum, UniqueConstraint, Integer, ARRAY, Table +from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey, Enum as SQLEnum, UniqueConstraint, Integer, ARRAY, Table, text from sqlalchemy.dialects.postgresql import UUID, JSON from sqlalchemy.orm import relationship from sqlalchemy.sql import func @@ -78,6 +78,9 @@ class ModelConfig(BaseModel): description = Column(String, comment="模型描述") # 模型配置参数 + capability = Column(ARRAY(String), default=list, nullable=False, server_default=text("'{}'::varchar[]"), + comment="模型能力列表(如['vision', 'audio', 'video'])") + is_omni = Column(Boolean, default=False, nullable=False, server_default="false", comment="是否为Omni模型(使用特殊API调用)") config = Column(JSON, comment="模型配置参数") # - temperature : 控制生成文本的随机性。值越高,输出越随机、越有创造性;值越低,输出越确定、越保守。 # - top_p : 一种替代 temperature 的采样方法,控制模型从概率最高的词中选择的范围。 @@ -118,6 +121,11 @@ class ModelApiKey(BaseModel): api_key = Column(String, nullable=False, comment="API密钥") api_base = Column(String, comment="API基础URL") + # 模型能力参数 + capability = Column(ARRAY(String), default=list, nullable=False, 
def __init__(self, **data):
    """Build a FileInput, defaulting ``file_type`` to the incoming ``type``.

    ``file_type`` is documented on the field as the concrete file format
    (e.g. ``image/jpg``, ``audio/wav``). When the caller does not supply
    it, the value passed as ``type`` is copied into it before validation.

    An explicitly supplied ``file_type`` is kept as-is. The previous
    implementation always overwrote it, so rebuilding an instance from
    its own dumped fields replaced the stored format string with
    whatever ``type`` held at that point.

    Args:
        **data: Field values forwarded to the pydantic constructor.
    """
    # Only derive file_type when the caller gave none; never clobber it.
    if "type" in data and data.get("file_type") is None:
        data["file_type"] = data["type"]
    super().__init__(**data)
Optional[List[str]] = Field(None, description="模型能力列表") + is_omni: Optional[bool] = Field(None, description="是否为Omni模型") config: Optional[Dict[str, Any]] = Field({}, description="API Key特定配置") is_active: bool = Field(True, description="是否激活") priority: str = Field("1", description="优先级", max_length=10) @@ -108,6 +116,8 @@ class ModelApiKeyBase(BaseModel): provider: ModelProvider = Field(..., description="API Key提供商") api_key: str = Field(..., description="API密钥", max_length=500) api_base: Optional[str] = Field(None, description="API基础URL", max_length=500) + capability: List[str] = Field(default_factory=list, description="模型能力列表") + is_omni: bool = Field(False, description="是否为Omni模型") config: Optional[Dict[str, Any]] = Field({}, description="API Key特定配置") is_active: bool = Field(True, description="是否激活") priority: str = Field("1", description="优先级", max_length=10) @@ -124,6 +134,8 @@ class ModelApiKeyUpdate(BaseModel): provider: Optional[ModelProvider] = Field(None, description="API Key提供商") api_key: Optional[str] = Field(None, description="API密钥", max_length=500) api_base: Optional[str] = Field(None, description="API基础URL", max_length=500) + capability: Optional[List[str]] = Field(None, description="模型能力列表") + is_omni: Optional[bool] = Field(None, description="是否为Omni模型") config: Optional[Dict[str, Any]] = Field(None, description="API Key特定配置") is_active: Optional[bool] = Field(None, description="是否激活") priority: Optional[str] = Field(None, description="优先级", max_length=10) @@ -270,6 +282,8 @@ class ModelBaseCreate(BaseModel): description: Optional[str] = Field(None, description="模型描述") is_official: bool = Field(True, description="是否供应商官方模型") tags: List[str] = Field(default_factory=list, description="模型标签") + capability: List[str] = Field(default_factory=list, description="模型能力列表(如['vision', 'audio', 'video'])") + is_omni: bool = Field(False, description="是否为Omni模型") class ModelBaseUpdate(BaseModel): @@ -282,6 +296,8 @@ class ModelBaseUpdate(BaseModel): 
is_deprecated: Optional[bool] = Field(None, description="是否弃用") is_official: Optional[bool] = Field(None, description="是否供应商官方模型") tags: Optional[List[str]] = Field(None, description="模型标签") + capability: Optional[List[str]] = Field(None, description="模型能力列表") + is_omni: Optional[bool] = Field(None, description="是否为Omni模型") class ModelBase(BaseModel): @@ -298,6 +314,8 @@ class ModelBase(BaseModel): is_official: bool tags: List[str] add_count: int + capability: List[str] = [] + is_omni: bool = False class ModelBaseQuery(BaseModel): diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py index 9723121d..e6ac227b 100644 --- a/api/app/services/app_chat_service.py +++ b/api/app/services/app_chat_service.py @@ -157,6 +157,7 @@ class AppChatService: api_key=api_key_obj.api_key, provider=api_key_obj.provider, api_base=api_key_obj.api_base, + is_omni=api_key_obj.is_omni, temperature=model_parameters.get("temperature", 0.7), max_tokens=model_parameters.get("max_tokens", 2000), system_prompt=system_prompt, @@ -180,7 +181,7 @@ class AppChatService: # 处理多模态文件 processed_files = None if files: - multimodal_service = MultimodalService(self.db) + multimodal_service = MultimodalService(self.db, api_key_obj.provider, is_omni=api_key_obj.is_omni) processed_files = await multimodal_service.process_files(files) logger.info(f"处理了 {len(processed_files)} 个文件") @@ -343,6 +344,7 @@ class AppChatService: api_key=api_key_obj.api_key, provider=api_key_obj.provider, api_base=api_key_obj.api_base, + is_omni=api_key_obj.is_omni, temperature=model_parameters.get("temperature", 0.7), max_tokens=model_parameters.get("max_tokens", 2000), system_prompt=system_prompt, @@ -366,7 +368,7 @@ class AppChatService: # 处理多模态文件 processed_files = None if files: - multimodal_service = MultimodalService(self.db) + multimodal_service = MultimodalService(self.db, api_key_obj.provider, is_omni=api_key_obj.is_omni) processed_files = await multimodal_service.process_files(files) 
class AudioTranscriptionService:
    """Speech-to-text helpers for DashScope Paraformer and OpenAI Whisper.

    Both entry points are best-effort: they do not raise. On failure they
    log the error and return a bracketed placeholder string, so callers
    can always embed the return value in a prompt.
    """

    _DASHSCOPE_SUBMIT_URL = "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription"
    _DASHSCOPE_TASK_URL = "https://dashscope.aliyuncs.com/api/v1/tasks/{task_id}"
    _OPENAI_TRANSCRIBE_URL = "https://api.openai.com/v1/audio/transcriptions"
    _FAILURE_TEXT = "[音频转文本失败]"

    # Polling budget for the asynchronous DashScope task.
    _POLL_INTERVAL_SECONDS = 1.0
    _POLL_TIMEOUT_SECONDS = 120.0

    # Explicit map so the chosen extension does not depend on the host's
    # mimetypes database.
    _MIME_EXTENSIONS = {
        "audio/mpeg": ".mp3",
        "audio/mp3": ".mp3",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/wave": ".wav",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/ogg": ".ogg",
        "audio/flac": ".flac",
        "audio/webm": ".webm",
    }

    @staticmethod
    def _extract_transcript_text(payload: dict) -> str:
        """Join the ``text`` of every entry under ``transcripts`` with newlines."""
        transcripts = (payload or {}).get("transcripts") or []
        return "\n".join(item["text"] for item in transcripts if item.get("text"))

    @staticmethod
    def _guess_upload_file(audio_url: str, content_type: str = "") -> tuple[str, str]:
        """Pick an upload filename and MIME type for a downloaded audio file.

        The filename comes from the URL path. If that has no extension, one
        is derived from an audio/video Content-Type header. When nothing can
        be determined the result is ``("audio.mp3", "audio/mpeg")``, which is
        what was previously always sent.
        """
        import mimetypes
        import posixpath
        from urllib.parse import unquote, urlparse

        header_mime = (content_type or "").split(";", 1)[0].strip().lower()
        if not header_mime.startswith(("audio/", "video/")):
            # e.g. application/octet-stream says nothing about the format.
            header_mime = ""

        filename = posixpath.basename(unquote(urlparse(audio_url).path))
        if not posixpath.splitext(filename)[1]:
            extension = None
            if header_mime:
                extension = (
                    AudioTranscriptionService._MIME_EXTENSIONS.get(header_mime)
                    or mimetypes.guess_extension(header_mime)
                )
            filename = f"audio{extension or '.mp3'}"

        mime = header_mime or mimetypes.guess_type(filename)[0] or "audio/mpeg"
        return filename, mime

    @staticmethod
    async def transcribe_dashscope(audio_url: str, api_key: str) -> str:
        """Transcribe audio with the DashScope ``paraformer-v2`` model.

        The request is submitted with ``X-DashScope-Async: enable``, so the
        submit response carries a task id rather than the transcript. This
        method polls the task until it leaves the pending/running state,
        then downloads the transcript document referenced by the first
        result.

        Args:
            audio_url: URL of the audio file.
            api_key: DashScope API key.

        Returns:
            The transcribed text, or a bracketed failure placeholder.
        """
        import asyncio
        import time

        cls = AudioTranscriptionService
        auth_header = {"Authorization": f"Bearer {api_key}"}
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.post(
                    cls._DASHSCOPE_SUBMIT_URL,
                    headers={
                        **auth_header,
                        "Content-Type": "application/json",
                        "X-DashScope-Async": "enable",
                    },
                    json={
                        "model": "paraformer-v2",
                        "input": {
                            "file_urls": [audio_url]
                        },
                        "parameters": {
                            "language_hints": ["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]
                        }
                    }
                )
                response.raise_for_status()
                output = response.json().get("output") or {}

                # Async mode: wait for the task to finish before reading results.
                task_id = output.get("task_id")
                deadline = time.monotonic() + cls._POLL_TIMEOUT_SECONDS
                while task_id and output.get("task_status") in ("PENDING", "RUNNING"):
                    if time.monotonic() >= deadline:
                        logger.error(f"DashScope 音频转文本超时: task_id={task_id}")
                        return cls._FAILURE_TEXT
                    await asyncio.sleep(cls._POLL_INTERVAL_SECONDS)
                    status = await client.get(
                        cls._DASHSCOPE_TASK_URL.format(task_id=task_id),
                        headers=auth_header,
                    )
                    status.raise_for_status()
                    output = status.json().get("output") or {}

                results = output.get("results") or []
                if results:
                    first = results[0]
                    # Inline text is still honoured if the service returns it.
                    text = first.get("transcription_text")
                    if text is None and first.get("transcription_url"):
                        # No auth header here: the URL comes from the task
                        # result and is fetched as given.
                        transcript = await client.get(first["transcription_url"])
                        transcript.raise_for_status()
                        text = cls._extract_transcript_text(transcript.json())
                    if text is not None:
                        logger.info(f"音频转文本成功: {len(text)} 字符")
                        return text

            return cls._FAILURE_TEXT

        except Exception as e:
            # Deliberate best-effort boundary: report the failure as text.
            logger.error(f"DashScope 音频转文本失败: {e}")
            return f"[音频转文本失败: {str(e)}]"

    @staticmethod
    async def transcribe_openai(audio_url: str, api_key: str) -> str:
        """Transcribe audio with OpenAI ``whisper-1``.

        The audio is downloaded from ``audio_url`` and uploaded as multipart
        form data. The upload filename and MIME type follow the real file
        (URL path, then Content-Type header) instead of always claiming MP3.

        Args:
            audio_url: URL of the audio file.
            api_key: OpenAI API key.

        Returns:
            The transcribed text, or a bracketed failure placeholder.
        """
        cls = AudioTranscriptionService
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Download the audio file.
                audio_response = await client.get(audio_url)
                audio_response.raise_for_status()

                filename, mime = cls._guess_upload_file(
                    audio_url, audio_response.headers.get("content-type", "")
                )

                # Call the Whisper API.
                response = await client.post(
                    cls._OPENAI_TRANSCRIBE_URL,
                    headers={"Authorization": f"Bearer {api_key}"},
                    files={"file": (filename, audio_response.content, mime)},
                    data={"model": "whisper-1"}
                )
                response.raise_for_status()

                text = response.json().get("text", "")
                logger.info(f"音频转文本成功: {len(text)} 字符")
                return text

        except Exception as e:
            # Deliberate best-effort boundary: report the failure as text.
            logger.error(f"OpenAI Whisper 音频转文本失败: {e}")
            return f"[音频转文本失败: {str(e)}]"
a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py index 8977710b..693f1a26 100644 --- a/api/app/services/draft_run_service.py +++ b/api/app/services/draft_run_service.py @@ -415,6 +415,7 @@ class DraftRunService: api_key=api_key_config["api_key"], provider=api_key_config.get("provider", "openai"), api_base=api_key_config.get("api_base"), + is_omni=api_key_config.get("is_omni", False), temperature=effective_params.get("temperature", 0.7), max_tokens=effective_params.get("max_tokens", 2000), system_prompt=system_prompt, @@ -442,7 +443,7 @@ class DraftRunService: if files: # 获取 provider 信息 provider = api_key_config.get("provider", "openai") - multimodal_service = MultimodalService(self.db, provider=provider) + multimodal_service = MultimodalService(self.db, provider=provider, is_omni=api_key_config.get("is_omni", False)) processed_files = await multimodal_service.process_files(files) logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}") @@ -683,6 +684,7 @@ class DraftRunService: api_key=api_key_config["api_key"], provider=api_key_config.get("provider", "openai"), api_base=api_key_config.get("api_base"), + is_omni=api_key_config.get("is_omni", False), temperature=effective_params.get("temperature", 0.7), max_tokens=effective_params.get("max_tokens", 2000), system_prompt=system_prompt, @@ -711,7 +713,7 @@ class DraftRunService: if files: # 获取 provider 信息 provider = api_key_config.get("provider", "openai") - multimodal_service = MultimodalService(self.db, provider=provider) + multimodal_service = MultimodalService(self.db, provider=provider, is_omni=api_key_config.get("is_omni", False)) processed_files = await multimodal_service.process_files(files) logger.info(f"处理了 {len(processed_files)} 个文件,provider={provider}") @@ -809,7 +811,7 @@ class DraftRunService: """ return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n" - async def _get_api_key(self, model_config_id: uuid.UUID) -> Dict[str, str]: + async def 
_get_api_key(self, model_config_id: uuid.UUID) -> Dict: """获取模型的 API Key Args: @@ -846,7 +848,8 @@ class DraftRunService: "provider": api_key.provider, "api_key": api_key.api_key, "api_base": api_key.api_base, - "api_key_id": api_key.id + "api_key_id": api_key.id, + "is_omni": api_key.is_omni } async def _ensure_conversation( diff --git a/api/app/services/handoffs_service.py b/api/app/services/handoffs_service.py index e490eea4..8418fe31 100644 --- a/api/app/services/handoffs_service.py +++ b/api/app/services/handoffs_service.py @@ -544,6 +544,7 @@ def convert_multi_agent_config_to_handoffs( provider=model_api_key.provider, api_key=model_api_key.api_key, base_url=model_api_key.api_base, + is_omni=model_api_key.is_omni, extra_params={ "temperature": 0.7, "max_tokens": 2000, diff --git a/api/app/services/llm_router.py b/api/app/services/llm_router.py index e56ad5aa..02895d6b 100644 --- a/api/app/services/llm_router.py +++ b/api/app/services/llm_router.py @@ -414,6 +414,7 @@ class LLMRouter: provider=api_key_config.provider, api_key=api_key_config.api_key, base_url=api_key_config.api_base, + is_omni=api_key_config.is_omni, temperature=0.3, max_tokens=500 ) diff --git a/api/app/services/master_agent_router.py b/api/app/services/master_agent_router.py index 3cf3ecc3..b0f43b51 100644 --- a/api/app/services/master_agent_router.py +++ b/api/app/services/master_agent_router.py @@ -392,6 +392,7 @@ class MasterAgentRouter: provider=api_key_config.provider, api_key=api_key_config.api_key, base_url=api_key_config.api_base, + is_omni=api_key_config.is_omni, extra_params = extra_params ) diff --git a/api/app/services/model_service.py b/api/app/services/model_service.py index aa8cfbac..2337427a 100644 --- a/api/app/services/model_service.py +++ b/api/app/services/model_service.py @@ -90,7 +90,8 @@ class ModelConfigService: api_key: str, api_base: Optional[str] = None, model_type: str = "llm", - test_message: str = "Hello" + test_message: str = "Hello", + is_omni: bool = False ) -> 
Dict[str, Any]: """验证模型配置是否有效 @@ -102,6 +103,7 @@ class ModelConfigService: api_base: API基础URL model_type: 模型类型 (llm/chat/embedding/rerank) test_message: 测试消息 + is_omni: 是否为Omni模型 Returns: Dict: 验证结果 @@ -114,14 +116,27 @@ class ModelConfigService: try: start_time = time.time() - model_config = RedBearModelConfig( - model_name=model_name, - provider=provider, - api_key=api_key, - base_url=api_base, - temperature=0.7, - max_tokens=100 - ) + # dashscope 的 omni 模型需要使用 compatible-mode + if provider.lower() == ModelProvider.DASHSCOPE and is_omni: + if not api_base: + api_base = "https://dashscope.aliyuncs.com/compatible-mode/v1" + model_config = RedBearModelConfig( + model_name=model_name, + provider=ModelProvider.OPENAI, + api_key=api_key, + base_url=api_base, + temperature=0.7, + max_tokens=100 + ) + else: + model_config = RedBearModelConfig( + model_name=model_name, + provider=provider, + api_key=api_key, + base_url=api_base, + temperature=0.7, + max_tokens=100 + ) # 根据模型类型选择不同的验证方式 model_type_lower = model_type.lower() @@ -257,8 +272,9 @@ class ModelConfigService: provider=model_data.provider, api_key=api_key_data.api_key, api_base=api_key_data.api_base, - model_type=model_data.type, # 传递模型类型 - test_message="Hello" + model_type=model_data.type, + test_message="Hello", + is_omni=model_data.is_omni ) if not validation_result["valid"]: raise BusinessException( @@ -279,6 +295,9 @@ class ModelConfigService: for api_key_data in api_key_datas: api_key_data.model_name = model_data.name api_key_data.provider = model_data.provider + # 同步capability和is_omni + api_key_data.capability = model_data.capability + api_key_data.is_omni = model_data.is_omni api_key_create_schema = ModelApiKeyCreate( model_config_ids=[model.id], **api_key_data.model_dump() @@ -497,6 +516,8 @@ class ModelApiKeyService: existing_key.config = data.config existing_key.priority = data.priority existing_key.model_name = model_name + existing_key.capability = data.capability + existing_key.is_omni = 
data.is_omni # 检查是否已关联该模型配置 if model_config not in existing_key.model_configs: @@ -513,7 +534,8 @@ class ModelApiKeyService: api_key=data.api_key, api_base=data.api_base, model_type=model_config.type, - test_message="Hello" + test_message="Hello", + is_omni=data.is_omni ) if not validation_result["valid"]: # 记录验证失败的模型,但不抛出异常 @@ -528,6 +550,8 @@ class ModelApiKeyService: provider=data.provider, api_key=data.api_key, api_base=data.api_base, + capability=data.capability if data.capability is not None else model_config.capability, + is_omni=data.is_omni if data.is_omni is not None else model_config.is_omni, config=data.config, is_active=data.is_active, priority=data.priority @@ -572,6 +596,8 @@ class ModelApiKeyService: existing_key.config = api_key_data.config existing_key.priority = api_key_data.priority existing_key.model_name = api_key_data.model_name + existing_key.capability = api_key_data.capability + existing_key.is_omni = api_key_data.is_omni # 检查是否已关联该模型配置 if model_config not in existing_key.model_configs: @@ -589,7 +615,8 @@ class ModelApiKeyService: api_key=api_key_data.api_key, api_base=api_key_data.api_base, model_type=model_config.type, - test_message="Hello" + test_message="Hello", + is_omni=model_config.is_omni ) if not validation_result["valid"]: raise BusinessException( @@ -620,7 +647,8 @@ class ModelApiKeyService: api_key=api_key_data.api_key or existing_api_key.api_key, api_base=api_key_data.api_base or existing_api_key.api_base, model_type=model_config.type, - test_message="Hello" + test_message="Hello", + is_omni=model_config.is_omni ) if not validation_result["valid"]: raise BusinessException( @@ -755,6 +783,8 @@ class ModelBaseService: "type": model_base.type, "logo": model_base.logo, "description": model_base.description, + "capability": model_base.capability, + "is_omni": model_base.is_omni, "is_composite": False } model_config = ModelConfigRepository.create(db, model_config_data) diff --git a/api/app/services/multi_agent_orchestrator.py 
b/api/app/services/multi_agent_orchestrator.py index d1aa46d1..650f639b 100644 --- a/api/app/services/multi_agent_orchestrator.py +++ b/api/app/services/multi_agent_orchestrator.py @@ -2593,6 +2593,7 @@ class MultiAgentOrchestrator: provider=api_key_config.provider, api_key=api_key_config.api_key, base_url=api_key_config.api_base, + is_omni=api_key_config.is_omni, temperature=0.7, # 整合任务使用中等温度 max_tokens=2000 ) @@ -2758,6 +2759,7 @@ class MultiAgentOrchestrator: provider=api_key_config.provider, api_key=api_key_config.api_key, base_url=api_key_config.api_base, + is_omni=api_key_config.is_omni, temperature=0.7, max_tokens=2000, extra_params={"streaming": True} # 启用流式输出 diff --git a/api/app/services/multi_agent_service.py b/api/app/services/multi_agent_service.py index c52814ed..751099d5 100644 --- a/api/app/services/multi_agent_service.py +++ b/api/app/services/multi_agent_service.py @@ -267,7 +267,7 @@ class MultiAgentService: # 2. 验证模型配置(如果提供了) if data.default_model_config_id: - model_api_key = ModelApiKeyService.get_a_api_key(self.db, data.default_model_config_id) + model_api_key = ModelApiKeyService.get_available_api_key(self.db, data.default_model_config_id) if not model_api_key: raise ResourceNotFoundException("模型配置", str(data.default_model_config_id)) diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index bfb23a56..9b06c287 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -9,47 +9,100 @@ - OpenAI: 支持 URL 和 base64 格式 """ import uuid -from typing import List, Dict, Any, Optional, Protocol +import httpx +import base64 +from typing import List, Dict, Any, Optional +from abc import ABC, abstractmethod from sqlalchemy.orm import Session +from docx import Document +import io +import PyPDF2 from app.core.logging_config import get_business_logger from app.core.exceptions import BusinessException from app.core.error_codes import BizCode from app.schemas.app_schema import 
class MultimodalFormatStrategy(ABC):
    """Abstract base for provider-specific multimodal content formatting.

    Each method turns one attachment into a single content-part dict in
    the layout the target provider accepts. Concrete strategies must
    implement all four methods.
    """

    @abstractmethod
    async def format_image(self, url: str) -> Dict[str, Any]:
        """Return the content part for the image at ``url``."""

    @abstractmethod
    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
        """Return the content part for a document given its name and text."""

    @abstractmethod
    async def format_audio(
        self,
        file_type: str,
        url: str,
        transcription: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Return the content part for an audio file.

        Args:
            file_type: File format string for the audio (e.g. ``audio/wav``).
            url: Location of the audio file.
            transcription: Optional text transcript of the audio. It is
                declared here so the abstract signature matches the
                concrete strategies, which all accept it.
        """

    @abstractmethod
    async def format_video(self, url: str) -> Dict[str, Any]:
        """Return the content part for the video at ``url``."""
- - -class DashScopeImageStrategy: - """通义千问图片格式策略""" - - async def format_image(self, url: str) -> Dict[str, Any]: - """通义千问格式: {"type": "image", "image": "url"}""" + """通义千问图片格式:{"type": "image", "image": "url"}""" return { "type": "image", "image": url } + async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + """通义千问文档格式""" + return { + "type": "text", + "text": f"\n{text}\n" + } -class BedrockImageStrategy: - """Bedrock/Anthropic 图片格式策略""" + async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]: + """ + 通义千问音频格式 + - 原生支持: qwen-audio 系列 + - 其他模型: 需要转录为文本 + """ + if transcription: + return { + "type": "text", + "text": f"" + } + # 通义千问音频格式:{"type": "audio", "audio": "url"} + return { + "type": "audio", + "audio": url + } + + async def format_video(self, url: str) -> Dict[str, Any]: + """通义千问视频格式(qwen-vl 系列原生支持)""" + return { + "type": "video", + "video": url + } + + +class BedrockFormatStrategy(MultimodalFormatStrategy): + """Bedrock/Anthropic 策略""" async def format_image(self, url: str) -> Dict[str, Any]: """ Bedrock/Anthropic 格式: base64 编码 {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}} """ - import httpx - import base64 from mimetypes import guess_type logger.info(f"下载并编码图片: {url}") @@ -84,9 +137,46 @@ class BedrockImageStrategy: } } + async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + """Bedrock/Anthropic 文档格式(需要 base64 编码)""" + # Bedrock 文档需要 base64 编码 + text_bytes = text.encode('utf-8') + base64_text = base64.b64encode(text_bytes).decode('utf-8') -class OpenAIImageStrategy: - """OpenAI 图片格式策略""" + return { + "type": "document", + "source": { + "type": "base64", + "media_type": "text/plain", + "data": base64_text + } + } + + async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]: + """ + Bedrock/Anthropic 音频格式 + 不支持原生音频,必须转录为文本 + """ + if transcription: + 
return { + "type": "text", + "text": f"[音频转录]\n{transcription}" + } + return { + "type": "text", + "text": "[音频文件:Bedrock 不支持原生音频,请启用音频转文本功能]" + } + + async def format_video(self, url: str) -> Dict[str, Any]: + """Bedrock/Anthropic 视频格式""" + return { + "type": "text", + "text": f"" + } + + +class OpenAIFormatStrategy(MultimodalFormatStrategy): + """OpenAI 策略""" async def format_image(self, url: str) -> Dict[str, Any]: """OpenAI 格式: {"type": "image_url", "image_url": {"url": "..."}}""" @@ -97,29 +187,97 @@ class OpenAIImageStrategy: } } + async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + """OpenAI 文档格式""" + return { + "type": "text", + "text": f"\n{text}\n" + } + + async def format_audio(self, file_type: str, url: str, transcription: Optional[str] = None) -> Dict[str, Any]: + """ + OpenAI 音频格式 + - gpt-4o-audio 系列支持原生音频(需要 base64 编码) + - 其他模型使用转录文本 + """ + if transcription: + return { + "type": "text", + "text": f"" + } + + # OpenAI 音频需要 base64 编码 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(url) + response.raise_for_status() + audio_data = response.content + base64_audio = base64.b64encode(audio_data).decode('utf-8') + # 1. 优先从 file_type (MIME) 取扩展名 + file_ext = file_type.split('/')[-1] if file_type and '/' in file_type else None + # 2. 从响应头 content-type 取 + if not file_ext: + ct = response.headers.get("content-type", "") + file_ext = ct.split('/')[-1].split(';')[0].strip() if '/' in ct else None + # 3. 从 URL 路径取扩展名 + if not file_ext: + file_ext = url.split('?')[0].rsplit('.', 1)[-1].lower() or None + # 4. 
默认 wav + # supported_ext = {"wav", "mp3", "mp4", "ogg", "flac", "webm", "m4a", "wave", "x-m4a"} + file_ext = "wav" if not file_ext else file_ext + + return { + "type": "input_audio", + "input_audio": { + "data": f"data:;base64,{base64_audio}", + "format": file_ext + } + } + except Exception as e: + logger.error(f"下载音频失败: {e}") + return { + "type": "text", + "text": f"[音频处理失败: {str(e)}]" + } + + async def format_video(self, url: str) -> Dict[str, Any]: + """OpenAI 视频格式""" + return { + "type": "video_url", + "video_url": { + "url": url + } + } + # Provider 到策略的映射 PROVIDER_STRATEGIES = { - "dashscope": DashScopeImageStrategy, - "bedrock": BedrockImageStrategy, - "anthropic": BedrockImageStrategy, - "openai": OpenAIImageStrategy, + "dashscope": DashScopeFormatStrategy, + "bedrock": BedrockFormatStrategy, + "anthropic": BedrockFormatStrategy, + "openai": OpenAIFormatStrategy, } class MultimodalService: """多模态文件处理服务""" - def __init__(self, db: Session, provider: str = "dashscope"): + def __init__(self, db: Session, provider: str = "dashscope", api_key: Optional[str] = None, enable_audio_transcription: bool = False, is_omni: bool = False): """ 初始化多模态服务 Args: db: 数据库会话 - provider: 模型提供商(dashscope, bedrock, anthropic 等) + provider: 模型提供商(dashscope, bedrock, anthropic, openai 等) + api_key: API 密钥(用于音频转文本) + enable_audio_transcription: 是否启用音频转文本 + is_omni: 是否为 Omni 模型(dashscope 的 omni 模型需要使用 OpenAI 兼容格式) """ self.db = db self.provider = provider.lower() + self.api_key = api_key + self.enable_audio_transcription = enable_audio_transcription + self.is_omni = is_omni async def process_files( self, @@ -137,20 +295,32 @@ class MultimodalService: if not files: return [] + # 获取对应的策略 + # dashscope 的 omni 模型使用 OpenAI 兼容格式 + if self.provider == "dashscope" and self.is_omni: + strategy_class = OpenAIFormatStrategy + else: + strategy_class = PROVIDER_STRATEGIES.get(self.provider) + if not strategy_class: + logger.warning(f"未找到 provider '{self.provider}' 的策略,使用默认策略") + strategy_class = 
DashScopeFormatStrategy + + strategy = strategy_class() + result = [] for idx, file in enumerate(files): try: if file.type == FileType.IMAGE: - content = await self._process_image(file) + content = await self._process_image(file, strategy) result.append(content) elif file.type == FileType.DOCUMENT: - content = await self._process_document(file) + content = await self._process_document(file, strategy) result.append(content) elif file.type == FileType.AUDIO: - content = await self._process_audio(file) + content = await self._process_audio(file, strategy) result.append(content) elif file.type == FileType.VIDEO: - content = await self._process_video(file) + content = await self._process_video(file, strategy) result.append(content) else: logger.warning(f"不支持的文件类型: {file.type}") @@ -172,55 +342,29 @@ class MultimodalService: logger.info(f"成功处理 {len(result)}/{len(files)} 个文件,provider={self.provider}") return result - async def _process_image(self, file: FileInput) -> Dict[str, Any]: + async def _process_image(self, file: FileInput, strategy) -> Dict[str, Any]: """ 处理图片文件 Args: file: 图片文件输入 + strategy: 格式化策略 Returns: - Dict: 根据 provider 返回不同格式 - - Anthropic/Bedrock: {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}} - - 通义千问: {"type": "image", "image": "url"} + Dict: 根据 provider 返回不同格式的图片内容 """ - url = await self.get_file_url(file) - - logger.debug(f"处理图片: {url}, provider={self.provider}") - - # 根据 provider 返回不同格式 - if self.provider in ["bedrock", "anthropic"]: - # Anthropic/Bedrock 只支持 base64 格式,需要下载并转换 - try: - logger.info(f"开始下载并编码图片: {url}") - base64_data, media_type = await self._download_and_encode_image(url) - result = { - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": base64_data[:100] + "..." 
# 只记录前100个字符 - } - } - logger.info(f"图片编码完成: media_type={media_type}, data_length={len(base64_data)}") - # 返回完整数据 - result["source"]["data"] = base64_data - return result - except Exception as e: - logger.error(f"下载并编码图片失败: {e}", exc_info=True) - # 返回错误提示 - return { - "type": "text", - "text": f"[图片加载失败: {str(e)}]" - } - else: - # 通义千问等其他格式支持 URL + try: + url = await self.get_file_url(file) + return await strategy.format_image(url) + except Exception as e: + logger.error(f"处理图片失败: {e}", exc_info=True) return { - "type": "image", - "image": url + "type": "text", + "text": f"[图片处理失败: {str(e)}]" } - async def _download_and_encode_image(self, url: str) -> tuple[str, str]: + @staticmethod + async def _download_and_encode_image(url: str) -> tuple[str, str]: """ 下载图片并转换为 base64 @@ -230,8 +374,6 @@ class MultimodalService: Returns: tuple: (base64_data, media_type) """ - import httpx - import base64 from mimetypes import guess_type # 下载图片 @@ -258,15 +400,16 @@ class MultimodalService: return base64_data, media_type - async def _process_document(self, file: FileInput) -> Dict[str, Any]: + async def _process_document(self, file: FileInput, strategy) -> Dict[str, Any]: """ 处理文档文件(PDF、Word 等) Args: file: 文档文件输入 + strategy: 格式化策略 Returns: - Dict: text 格式的内容(包含提取的文本) + Dict: 根据 provider 返回不同格式的文档内容 """ if file.transfer_method == TransferMethod.REMOTE_URL: # 远程文档暂不支持提取 @@ -277,48 +420,68 @@ class MultimodalService: else: # 本地文件,提取文本内容 text = await self._extract_document_text(file.upload_file_id) - generic_file = self.db.query(GenericFile).filter( - GenericFile.id == file.upload_file_id + file_metadata = self.db.query(FileMetadata).filter( + FileMetadata.id == file.upload_file_id ).first() - file_name = generic_file.file_name if generic_file else "unknown" + file_name = file_metadata.file_name if file_metadata else "unknown" - return { - "type": "text", - "text": f"\n{text}\n" - } + # 使用策略格式化文档 + return await strategy.format_document(file_name, text) - async def 
_process_audio(self, file: FileInput) -> Dict[str, Any]: + async def _process_audio(self, file: FileInput, strategy) -> Dict[str, Any]: """ 处理音频文件 Args: file: 音频文件输入 + strategy: 格式化策略 Returns: - Dict: 音频内容(暂时返回占位符) + Dict: 根据 provider 返回不同格式的音频内容 """ - # TODO: 实现音频转文字功能 - return { - "type": "text", - "text": "[音频文件,暂不支持处理]" - } + try: + url = await self.get_file_url(file) - async def _process_video(self, file: FileInput) -> Dict[str, Any]: + # 如果启用音频转文本且有 API Key + transcription = None + if self.enable_audio_transcription and self.api_key: + logger.info(f"开始音频转文本: {url}") + if self.provider == "dashscope": + transcription = await AudioTranscriptionService.transcribe_dashscope(url, self.api_key) + elif self.provider == "openai": + transcription = await AudioTranscriptionService.transcribe_openai(url, self.api_key) + else: + logger.warning(f"Provider {self.provider} 不支持音频转文本") + + return await strategy.format_audio(file.file_type, url, transcription) + except Exception as e: + logger.error(f"处理音频失败: {e}", exc_info=True) + return { + "type": "text", + "text": f"[音频处理失败: {str(e)}]" + } + + async def _process_video(self, file: FileInput, strategy) -> Dict[str, Any]: """ 处理视频文件 Args: file: 视频文件输入 + strategy: 格式化策略 Returns: - Dict: 视频内容(暂时返回占位符) + Dict: 根据 provider 返回不同格式的视频内容 """ - # TODO: 实现视频处理功能 - return { - "type": "text", - "text": "[视频文件,暂不支持处理]" - } + try: + url = await self.get_file_url(file) + return await strategy.format_video(url) + except Exception as e: + logger.error(f"处理视频失败: {e}", exc_info=True) + return { + "type": "text", + "text": f"[视频处理失败: {str(e)}]" + } async def get_file_url(self, file: FileInput) -> str: """ @@ -336,26 +499,22 @@ class MultimodalService: if file.transfer_method == TransferMethod.REMOTE_URL: return file.url else: - # 本地文件,通过 file_storage 系统获取永久访问 URL - from app.models.file_metadata_model import FileMetadata - from app.core.config import settings - file_id = file.upload_file_id print("="*50) print("file_id",file_id) - + # 查询 
FileMetadata file_metadata = self.db.query(FileMetadata).filter( FileMetadata.id == file_id, FileMetadata.status == "completed" ).first() - + if not file_metadata: raise BusinessException( f"文件不存在或已删除: {file_id}", BizCode.NOT_FOUND ) - + # 返回永久URL server_url = settings.FILE_LOCAL_SERVER_URL return f"{server_url}/storage/permanent/{file_id}" @@ -370,58 +529,79 @@ class MultimodalService: Returns: str: 提取的文本内容 """ - generic_file = self.db.query(GenericFile).filter( - GenericFile.id == file_id, - GenericFile.status == "active" + file_metadata = self.db.query(FileMetadata).filter( + FileMetadata.id == file_id, + FileMetadata.status == "completed" ).first() - if not generic_file: + if not file_metadata: raise BusinessException( f"文件不存在或已删除: {file_id}", BizCode.NOT_FOUND ) - # TODO: 根据文件类型提取文本 - # - PDF: 使用 PyPDF2 或 pdfplumber - # - Word: 使用 python-docx - # - TXT/MD: 直接读取 - - file_ext = generic_file.file_ext.lower() + file_ext = file_metadata.file_ext.lower() + server_url = settings.FILE_LOCAL_SERVER_URL + file_url = f"{server_url}/storage/permanent/{file_id}" if file_ext in ['.txt', '.md', '.markdown']: - return await self._read_text_file(generic_file.storage_path) + return await self._read_text_file(file_url) elif file_ext == '.pdf': - return await self._extract_pdf_text(generic_file.storage_path) + return await self._extract_pdf_text(file_url) elif file_ext in ['.doc', '.docx']: - return await self._extract_word_text(generic_file.storage_path) + return await self._extract_word_text(file_url) else: return f"[不支持的文档格式: {file_ext}]" - async def _read_text_file(self, storage_path: str) -> str: + @staticmethod + async def _read_text_file(file_url: str) -> str: """读取纯文本文件""" try: - with open(storage_path, 'r', encoding='utf-8') as f: - return f.read() + # 下载文件 + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(file_url) + response.raise_for_status() + return response.text except Exception as e: logger.error(f"读取文本文件失败: {e}") return 
f"[文件读取失败: {str(e)}]" - async def _extract_pdf_text(self, storage_path: str) -> str: + @staticmethod + async def _extract_pdf_text(file_url: str) -> str: """提取 PDF 文本""" try: - # TODO: 实现 PDF 文本提取 - # import PyPDF2 或 pdfplumber - return "[PDF 文本提取功能待实现]" + # 下载 PDF 文件 + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(file_url) + response.raise_for_status() + pdf_data = response.content + + # 使用 BytesIO 读取 PDF + text_parts = [] + pdf_file = io.BytesIO(pdf_data) + pdf_reader = PyPDF2.PdfReader(pdf_file) + for page in pdf_reader.pages: + text_parts.append(page.extract_text()) + return '\n'.join(text_parts) except Exception as e: logger.error(f"提取 PDF 文本失败: {e}") return f"[PDF 提取失败: {str(e)}]" - async def _extract_word_text(self, storage_path: str) -> str: + @staticmethod + async def _extract_word_text(file_url: str) -> str: """提取 Word 文档文本""" try: - # TODO: 实现 Word 文本提取 - # import docx - return "[Word 文本提取功能待实现]" + # 下载 Word 文件 + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(file_url) + response.raise_for_status() + word_data = response.content + + # 使用 BytesIO 读取 Word 文档 + word_file = io.BytesIO(word_data) + doc = Document(word_file) + text_parts = [paragraph.text for paragraph in doc.paragraphs] + return '\n'.join(text_parts) except Exception as e: logger.error(f"提取 Word 文本失败: {e}") return f"[Word 提取失败: {str(e)}]" diff --git a/api/app/services/prompt_optimizer_service.py b/api/app/services/prompt_optimizer_service.py index 99edcc0e..184220a8 100644 --- a/api/app/services/prompt_optimizer_service.py +++ b/api/app/services/prompt_optimizer_service.py @@ -184,7 +184,8 @@ class PromptOptimizerService: model_name=api_config.model_name, provider=api_config.provider, api_key=api_config.api_key, - base_url=api_config.api_base + base_url=api_config.api_base, + is_omni=api_config.is_omni ), type=ModelType(model_config.type)) try: prompt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 
'prompt') diff --git a/api/app/services/shared_chat_service.py b/api/app/services/shared_chat_service.py index 89d3f3d6..0d659832 100644 --- a/api/app/services/shared_chat_service.py +++ b/api/app/services/shared_chat_service.py @@ -247,6 +247,7 @@ class SharedChatService: api_key=api_key_obj.api_key, provider=api_key_obj.provider, api_base=api_key_obj.api_base, + is_omni=api_key_obj.is_omni, temperature=model_parameters.get("temperature", 0.7), max_tokens=model_parameters.get("max_tokens", 2000), system_prompt=system_prompt, @@ -454,6 +455,7 @@ class SharedChatService: api_key=api_key_obj.api_key, provider=api_key_obj.provider, api_base=api_key_obj.api_base, + is_omni=api_key_obj.is_omni, temperature=model_parameters.get("temperature", 0.7), max_tokens=model_parameters.get("max_tokens", 2000), system_prompt=system_prompt,