diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py index d81ba7b7..a1002919 100644 --- a/api/app/services/app_chat_service.py +++ b/api/app/services/app_chat_service.py @@ -118,28 +118,54 @@ class AppChatService: ) + model_info = ModelInfo( + model_name=api_key_obj.model_name, + provider=api_key_obj.provider, + api_key=api_key_obj.api_key, + api_base=api_key_obj.api_base, + capability=api_key_obj.capability, + is_omni=api_key_obj.is_omni, + model_type=ModelType.LLM + ) + # 加载历史消息 messages = self.conversation_service.get_messages( conversation_id=conversation_id, limit=10 ) - history = [ - {"role": msg.role, "content": [{"type": "text", "text": msg.content}] + (msg.meta_data.get("files", []) if msg.meta_data else [])} - for msg in messages - ] + history = [] + for msg in messages: + content = [{"type": "text", "text": msg.content}] + + # 处理 meta_data 中的 files + if msg.meta_data and msg.meta_data.get("files"): + files = msg.meta_data.get("files", []) + # 使用 MultimodalService 处理文件 + multimodal_service = MultimodalService(self.db, api_config=model_info) + + # 将 files 转换为 FileInput 格式 + file_inputs = [] + for file in files: + from app.schemas.app_schema import FileInput, TransferMethod + file_input = FileInput( + type=file.get("type"), + transfer_method=TransferMethod.REMOTE_URL, + url=file.get("url") + ) + file_inputs.append(file_input) + + history_processed_files = await multimodal_service.history_process_files(files=file_inputs) + + content.extend(history_processed_files) + + history.append({ + "role": msg.role, + "content": content + }) # 处理多模态文件 processed_files = None if files: - model_info = ModelInfo( - model_name=api_key_obj.model_name, - provider=api_key_obj.provider, - api_key=api_key_obj.api_key, - api_base=api_key_obj.api_base, - capability=api_key_obj.capability, - is_omni=api_key_obj.is_omni, - model_type=ModelType.LLM - ) multimodal_service = MultimodalService(self.db, model_info) processed_files = await multimodal_service.process_files(user_id, files) logger.info(f"处理了 {len(processed_files)} 个文件") @@ -187,8 +213,13 @@ class AppChatService: "usage": result.get("usage", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}), "audio_url": None } - if processed_files: - human_meta["files"].extend(processed_files) + if files: + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": f.type, + "url": f.url + }) # 保存消息 if audio_url: @@ -308,31 +339,54 @@ class AppChatService: streaming=True ) + model_info = ModelInfo( + model_name=api_key_obj.model_name, + provider=api_key_obj.provider, + api_key=api_key_obj.api_key, + api_base=api_key_obj.api_base, + capability=api_key_obj.capability, + is_omni=api_key_obj.is_omni, + model_type=ModelType.LLM + ) + # 加载历史消息 + messages = self.conversation_service.get_messages( + conversation_id=conversation_id, + limit=10 + ) history = [] - memory_config = {"enabled": True, 'max_history': 10} - if memory_config.get("enabled"): - messages = self.conversation_service.get_messages( - conversation_id=conversation_id, - limit=memory_config.get("max_history", 10) - ) - history = [ - {"role": msg.role, "content": [{"type": "text", "text": msg.content}] + (msg.meta_data.get("files", []) if msg.meta_data else [])} - for msg in messages - ] + for msg in messages: + content = [{"type": "text", "text": msg.content}] + + # 处理 meta_data 中的 files + if msg.meta_data and msg.meta_data.get("files"): + files = msg.meta_data.get("files", []) + # 使用 MultimodalService 处理文件 + multimodal_service = MultimodalService(self.db, api_config=model_info) + + # 将 files 转换为 FileInput 格式 + file_inputs = [] + for file in files: + from app.schemas.app_schema import FileInput, TransferMethod + file_input = FileInput( + type=file.get("type"), + transfer_method=TransferMethod.REMOTE_URL, + url=file.get("url") + ) + file_inputs.append(file_input) + + history_processed_files = await multimodal_service.history_process_files(files=file_inputs) + + content.extend(history_processed_files) + + history.append({ + "role": msg.role, + "content": content + }) # 处理多模态文件 processed_files = None if files: - model_info = ModelInfo( - model_name=api_key_obj.model_name, - provider=api_key_obj.provider, - api_key=api_key_obj.api_key, - api_base=api_key_obj.api_base, - capability=api_key_obj.capability, - is_omni=api_key_obj.is_omni, - model_type=ModelType.LLM - ) multimodal_service = MultimodalService(self.db, model_info) processed_files = await multimodal_service.process_files(user_id, files) logger.info(f"处理了 {len(processed_files)} 个文件") @@ -342,8 +396,14 @@ class AppChatService: total_tokens = 0 text_queue: asyncio.Queue = asyncio.Queue() + api_key_config = { + "model_name": api_key_obj.model_name, + "api_key": api_key_obj.api_key, + "api_base": api_key_obj.api_base, + "provider": api_key_obj.provider, + } stream_audio_url, tts_task = await self.agent_service._generate_tts_streaming( - features_config, api_key_obj, + features_config, api_key_config, text_queue=text_queue, tenant_id=tenant_id, workspace_id=workspace_id ) @@ -395,8 +455,13 @@ class AppChatService: "audio_url": None } - if processed_files: - human_meta["files"].extend(processed_files) + if files: + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": f.type, + "url": f.url + }) if stream_audio_url: assistant_meta["audio_url"] = stream_audio_url diff --git a/api/app/services/conversation_service.py b/api/app/services/conversation_service.py index 30c3feba..f8a01a40 100644 --- a/api/app/services/conversation_service.py +++ b/api/app/services/conversation_service.py @@ -21,6 +21,7 @@ from app.models.conversation_model import ConversationDetail from app.models.prompt_optimizer_model import RoleType from app.repositories.conversation_repository import ConversationRepository, MessageRepository from app.schemas.conversation_schema import ConversationOut +from app.schemas.model_schema import ModelInfo from app.services import workspace_service from app.services.model_service import ModelConfigService @@ -269,10 +270,11 @@ class ConversationService: return messages - def get_conversation_history( + async def get_conversation_history( self, conversation_id: uuid.UUID, - max_history: Optional[int] = None + max_history: Optional[int] = None, + api_config: Optional[ModelInfo] = None ) -> List[dict]: """ Retrieve historical conversation messages formatted as dictionaries. @@ -280,6 +282,7 @@ class ConversationService: Args: conversation_id (uuid.UUID): Conversation UUID. max_history (Optional[int]): Maximum number of messages to retrieve. + api_config (Optional[ModelInfo]): Model API configuration for multimodal processing. Returns: List[dict]: List of message dictionaries with keys 'role' and 'content'. @@ -290,13 +293,37 @@ class ConversationService: ) # 转换为字典格式 - history = [ - { + history = [] + for msg in messages: + content = [{"type": "text", "text": msg.content}] + + # 处理 meta_data 中的 files + if msg.meta_data and msg.meta_data.get("files"): + files = msg.meta_data.get("files", []) + if api_config: + # 使用 MultimodalService 处理文件 + from app.services.multimodal_service import MultimodalService + multimodal_service = MultimodalService(self.db, api_config=api_config) + + # 将 files 转换为 FileInput 格式 + file_inputs = [] + for file in files: + from app.schemas.app_schema import FileInput, TransferMethod + file_input = FileInput( + type=file.get("type"), + transfer_method=TransferMethod.REMOTE_URL, + url=file.get("url") + ) + file_inputs.append(file_input) + + processed_files = await multimodal_service.history_process_files(files=file_inputs) + + content.extend(processed_files) + + history.append({ "role": msg.role, - "content": [{"type": "text", "text": msg.content}] + (msg.meta_data.get("files", []) if msg.meta_data else []) - } - for msg in messages - ] + "content": content + }) return history @@ -524,9 +551,18 @@ class ConversationService: type=ModelType(model_type) ) - conversation_messages = self.get_conversation_history( + conversation_messages = await self.get_conversation_history( conversation_id=conversation_id, - max_history=20 + max_history=20, + api_config=ModelInfo( + model_name=model_name, + provider=provider, + api_key=api_key, + api_base=api_base, + capability=api_config.capability, + is_omni=api_config.is_omni, + model_type=model_type + ) ) if len(conversation_messages) == 0: return ConversationOut( diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py index 0af33357..5989f0f8 100644 --- a/api/app/services/draft_run_service.py +++ b/api/app/services/draft_run_service.py @@ -579,9 +579,20 @@ class AgentRunService: user_id=user_id ) + model_info = ModelInfo( + model_name=api_key_config["model_name"], + provider=api_key_config["provider"], + api_key=api_key_config["api_key"], + api_base=api_key_config["api_base"], + capability=api_key_config["capability"], + is_omni=api_key_config["is_omni"], + model_type=model_config.type + ) + # 6. 加载历史消息 history = await self._load_conversation_history( conversation_id=conversation_id, + api_config=model_info, max_history=10 ) @@ -589,15 +600,6 @@ class AgentRunService: processed_files = None if files: # 获取 provider 信息 - model_info = ModelInfo( - model_name=api_key_config["model_name"], - provider=api_key_config["provider"], - api_key=api_key_config["api_key"], - api_base=api_key_config["api_base"], - capability=api_key_config["capability"], - is_omni=api_key_config["is_omni"], - model_type=ModelType.LLM - ) provider = api_key_config.get("provider", "openai") multimodal_service = MultimodalService(self.db, model_info) processed_files = await multimodal_service.process_files(user_id, files) @@ -658,7 +660,7 @@ class AgentRunService: "total_tokens": 0 }) }, - files=processed_files, + files=files, audio_url=audio_url ) @@ -815,9 +817,20 @@ class AgentRunService: sub_agent=sub_agent ) + model_info = ModelInfo( + model_name=api_key_config["model_name"], + provider=api_key_config["provider"], + api_key=api_key_config["api_key"], + api_base=api_key_config["api_base"], + capability=api_key_config["capability"], + is_omni=api_key_config["is_omni"], + model_type=model_config.type + ) + # 6. 加载历史消息 history = await self._load_conversation_history( conversation_id=conversation_id, + api_config=model_info, max_history=memory_config.get("max_history", 10) ) @@ -825,15 +838,6 @@ class AgentRunService: processed_files = None if files: # 获取 provider 信息 - model_info = ModelInfo( - model_name=api_key_config["model_name"], - provider=api_key_config["provider"], - api_key=api_key_config["api_key"], - api_base=api_key_config["api_base"], - capability=api_key_config["capability"], - is_omni=api_key_config["is_omni"], - model_type=ModelType.LLM - ) provider = api_key_config.get("provider", "openai") multimodal_service = MultimodalService(self.db, model_info) processed_files = await multimodal_service.process_files(user_id, files) @@ -904,7 +908,7 @@ class AgentRunService: meta_data={ "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens} }, - files=processed_files, + files=files, audio_url=stream_audio_url ) @@ -1115,6 +1119,7 @@ class AgentRunService: async def _load_conversation_history( self, conversation_id: str, + api_config: ModelInfo | None = None, max_history: int = 10 ) -> List[Dict[str, str]]: """加载会话历史消息 @@ -1129,9 +1134,11 @@ class AgentRunService: try: conversation_service = ConversationService(self.db) - history = conversation_service.get_conversation_history( + # 获取 API 配置用于多模态处理 + history = await conversation_service.get_conversation_history( conversation_id=uuid.UUID(conversation_id), - max_history=max_history + max_history=max_history, + api_config=api_config ) logger.debug( @@ -1182,7 +1189,12 @@ class AgentRunService: "files": [] } if files: - human_meta["files"].extend(files) + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": f.type, + "url": f.url + }) # 保存用户消息 conversation_service.add_message( conversation_id=conv_uuid, diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index d8cca67e..6cb0a7f0 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -418,6 +418,71 @@ class MultimodalService: logger.info(f"成功处理 {len(result)}/{len(files)} 个文件,provider={self.provider}") return result + async def history_process_files( + self, + files: Optional[List[FileInput]], + ) -> List[Dict[str, Any]]: + """ + 处理文件列表,返回 LLM 可用的格式 + + Args: + files: 文件输入列表 + + Returns: + List[Dict]: LLM 可用的内容格式列表(根据 provider 返回不同格式) + """ + if not files: + return [] + + # 获取对应的策略 + # dashscope 的 omni 模型使用 OpenAI 兼容格式 + if self.provider == "dashscope" and self.is_omni: + strategy_class = OpenAIFormatStrategy + else: + strategy_class = PROVIDER_STRATEGIES.get(self.provider) + if not strategy_class: + logger.warning(f"未找到 provider '{self.provider}' 的策略,使用默认策略") + strategy_class = DashScopeFormatStrategy + + result = [] + for idx, file in enumerate(files): + strategy = strategy_class(file) + if not file.url: + file.url = await self.get_file_url(file) + try: + if file.type == FileType.IMAGE and "vision" in self.capability: + is_support, content = await self._process_image(file, strategy) + result.append(content) + elif file.type == FileType.DOCUMENT: + is_support, content = await self._process_document(file, strategy) + result.append(content) + elif file.type == FileType.AUDIO and "audio" in self.capability: + is_support, content = await self._process_audio(file, strategy) + result.append(content) + elif file.type == FileType.VIDEO and "video" in self.capability: + is_support, content = await self._process_video(file, strategy) + result.append(content) + else: + logger.warning(f"不支持的文件类型: {file.type}") + except Exception as e: + logger.error( + f"处理文件失败", + extra={ + "file_index": idx, + "file_type": file.type, + "error": str(e) + }, + exc_info=True + ) + # 继续处理其他文件,不中断整个流程 + result.append({ + "type": "text", + "text": f"[文件处理失败: {str(e)}]" + }) + + logger.info(f"成功处理 {len(result)}/{len(files)} 个文件,provider={self.provider}") + return result + def write_perceptual_memory( self, end_user_id: str, diff --git a/api/app/services/shared_chat_service.py b/api/app/services/shared_chat_service.py index 5e18ee42..0d659832 100644 --- a/api/app/services/shared_chat_service.py +++ b/api/app/services/shared_chat_service.py @@ -264,7 +264,7 @@ class SharedChatService: limit=memory_config.get("max_history", 10) ) history = [ - {"role": msg.role, "content": [{"type": "text", "text": msg.content}] + (msg.meta_data.get("files", []) if msg.meta_data else [])} + {"role": msg.role, "content": msg.content} for msg in messages ] @@ -472,7 +472,7 @@ class SharedChatService: limit=memory_config.get("max_history", 10) ) history = [ - {"role": msg.role, "content": [{"type": "text", "text": msg.content}] + (msg.meta_data.get("files", []) if msg.meta_data else [])} + {"role": msg.role, "content": msg.content} for msg in messages ]