From f6efa0d71135de41cb31817e36b4dff79549f7fe Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Wed, 18 Mar 2026 22:29:10 +0800 Subject: [PATCH] fix(agent): Reading of docx multimodal files; Multimodal attachment history record --- api/app/services/app_chat_service.py | 104 +++++++++++++++++-------- api/app/services/draft_run_service.py | 58 ++++++++++---- api/app/services/multimodal_service.py | 5 +- 3 files changed, 116 insertions(+), 51 deletions(-) diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py index cd9d3e81..58beea53 100644 --- a/api/app/services/app_chat_service.py +++ b/api/app/services/app_chat_service.py @@ -24,6 +24,7 @@ from app.services.model_service import ModelApiKeyService from app.services.multi_agent_orchestrator import MultiAgentOrchestrator from app.services.multimodal_service import MultimodalService from app.services.workflow_service import WorkflowService +from app.schemas import FileType logger = get_business_logger() @@ -156,20 +157,6 @@ class AppChatService: files=processed_files # 传递处理后的文件 ) - # 保存消息 - message_id = self.conversation_service.save_conversation_messages( - conversation_id=conversation_id, - user_message=message, - assistant_message=result["content"], - meta_data={ - "usage": result.get("usage", { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0 - }) - } - ) - ModelApiKeyService.record_api_key_usage(self.db, api_key_obj.id) elapsed_time = time.time() - start_time @@ -191,6 +178,40 @@ class AppChatService: tenant_id=tenant_id, workspace_id=workspace_id ) + # 构建用户消息内容(含多模态文件) + human_meta = { + "files": [] + } + assistant_meta = { + "model": api_key_obj.model_name, + "usage": result.get("usage", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}), + "audio_url": None + } + if files: + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": FileType.IMAGE, + "url": f.url + }) + + # 保存消息 + if audio_url: + assistant_meta["audio_url"] = audio_url + self.conversation_service.add_message( + conversation_id=conversation_id, + role="user", + content=message, + meta_data=human_meta + ) + ai_message = self.conversation_service.add_message( + conversation_id=conversation_id, + role="assistant", + content=result["content"], + meta_data=assistant_meta + ) + message_id = ai_message.id + return { "conversation_id": conversation_id, "message_id": str(message_id), @@ -344,24 +365,6 @@ class AppChatService: elapsed_time = time.time() - start_time - # 保存消息 - self.conversation_service.add_message( - conversation_id=conversation_id, - role="user", - content=message - ) - - self.conversation_service.add_message( - message_id=message_id, - conversation_id=conversation_id, - role="assistant", - content=full_content, - meta_data={ - "model": api_key_obj.model_name, - "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens} - } - ) - ModelApiKeyService.record_api_key_usage(self.db, api_key_obj.id) # 发送结束事件(包含 suggested_questions、tts、citations) @@ -373,13 +376,48 @@ class AppChatService: {"model_name": api_key_obj.model_name, "api_key": api_key_obj.api_key, "api_base": api_key_obj.api_base}, {} ) - end_data["audio_url"] = await self.agent_service._generate_tts( + stream_audio_url = await self.agent_service._generate_tts( features_config, full_content, {"model_name": api_key_obj.model_name, "api_key": api_key_obj.api_key, "api_base": api_key_obj.api_base, "provider": api_key_obj.provider}, tenant_id=tenant_id, workspace_id=workspace_id ) + end_data["audio_url"] = stream_audio_url end_data["citations"] = self.agent_service._filter_citations(features_config, []) + + # 保存消息 + human_meta = { + "files":[] + } + assistant_meta = { + "model": api_key_obj.model_name, + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens}, + "audio_url": None + } + + if files: + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": FileType.IMAGE, + "url": f.url + }) + + if stream_audio_url: + assistant_meta["audio_url"] = stream_audio_url + self.conversation_service.add_message( + conversation_id=conversation_id, + role="user", + content=message, + meta_data=human_meta + ) + self.conversation_service.add_message( + message_id=message_id, + conversation_id=conversation_id, + role="assistant", + content=full_content, + meta_data=assistant_meta + ) yield f"event: end\ndata: {json.dumps(end_data, ensure_ascii=False)}\n\n" logger.info( diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py index 92b13bfc..5b8bbfa7 100644 --- a/api/app/services/draft_run_service.py +++ b/api/app/services/draft_run_service.py @@ -37,6 +37,7 @@ from app.services.model_parameter_merger import ModelParameterMerger from app.services.model_service import ModelApiKeyService from app.services.multimodal_service import MultimodalService from app.services.tool_service import ToolService +from app.schemas import FileType logger = get_business_logger() @@ -636,7 +637,13 @@ class AgentRunService: ModelApiKeyService.record_api_key_usage(self.db, api_key_config.get("api_key_id")) - # 9. 保存会话消息 + # 9. 生成 TTS audio_url(在保存消息前生成,以便一并存入 meta_data) + audio_url = await self._generate_tts( + features_config, result["content"], api_key_config, + tenant_id=tenant_id, workspace_id=workspace_id + ) if not sub_agent else None + + # 10. 保存会话消息 if not sub_agent: await self._save_conversation_message( conversation_id=conversation_id, @@ -650,7 +657,9 @@ class AgentRunService: "completion_tokens": 0, "total_tokens": 0 }) - } + }, + files=files, + audio_url=audio_url ) response = { @@ -666,10 +675,7 @@ class AgentRunService: features_config, result["content"], api_key_config, effective_params ) if not sub_agent else [], "citations": self._filter_citations(features_config, result.get("citations", [])), - "audio_url": await self._generate_tts( - features_config, result["content"], api_key_config, - tenant_id=tenant_id, workspace_id=workspace_id - ) if not sub_agent else None, + "audio_url": audio_url, } logger.info( @@ -878,7 +884,13 @@ class AgentRunService: "total_tokens": total_tokens }) - # 10. 保存会话消息 + # 10. 生成 audio_url(在保存消息前生成,以便一并存入 meta_data) + stream_audio_url = await self._generate_tts( + features_config, full_content, api_key_config, + tenant_id=tenant_id, workspace_id=workspace_id + ) if not sub_agent else None + + # 11. 保存会话消息 if not sub_agent: await self._save_conversation_message( conversation_id=conversation_id, @@ -888,10 +900,12 @@ class AgentRunService: user_id=user_id, meta_data={ "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens} - } + }, + files=files, + audio_url=stream_audio_url ) - # 11. 发送结束事件(包含 suggested_questions 和 tts) + # 12. 发送结束事件(包含 suggested_questions 和 tts) end_data: Dict[str, Any] = { "conversation_id": conversation_id, "elapsed_time": elapsed_time, @@ -901,10 +915,7 @@ class AgentRunService: end_data["suggested_questions"] = await self._generate_suggested_questions( features_config, full_content, api_key_config, effective_params ) - end_data["audio_url"] = await self._generate_tts( - features_config, full_content, api_key_config, - tenant_id=tenant_id, workspace_id=workspace_id - ) + end_data["audio_url"] = stream_audio_url end_data["citations"] = self._filter_citations(features_config, []) yield self._format_sse_event("end", end_data) @@ -1143,7 +1154,9 @@ class AgentRunService: assistant_message: str, meta_data: dict, app_id: Optional[uuid.UUID] = None, - user_id: Optional[str] = None + user_id: Optional[str] = None, + files: Optional[List[FileInput]] = None, + audio_url: Optional[str] = None ) -> None: """保存会话消息(会话已通过 _ensure_conversation 确保存在) @@ -1162,13 +1175,26 @@ class AgentRunService: conv_uuid = uuid.UUID(conversation_id) # 保存消息(会话已经存在) + human_meta = { + "files": [] + } + if files: + for f in files: + # url = await MultimodalService(self.db).get_file_url(f) + human_meta["files"].append({ + "type": FileType.IMAGE, + "url": f.url + }) # 保存用户消息 conversation_service.add_message( conversation_id=conv_uuid, role="user", - content=user_message + content=user_message, + meta_data=human_meta ) - # 保存助手消息 + # 保存助手消息(含 audio_url) + if audio_url: + meta_data["audio_url"] = audio_url conversation_service.add_message( conversation_id=conv_uuid, role="assistant", diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index 1f0e1cc2..f0c7cee2 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -41,7 +41,8 @@ TEXT_MIME = ['text/plain', 'text/x-markdown'] PDF_MIME = ['application/pdf'] DOC_MIME = [ 'application/msword', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/zip' ] XLSX_MIME = [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', @@ -590,7 +591,7 @@ class MultimodalService: return file_content.decode("utf-8") elif file_mime_type in PDF_MIME: return await self._extract_pdf_text(file_content) - elif file_mime_type in DOC_MIME: + elif file_mime_type in DOC_MIME and file.file_type.endswith(('docx', 'doc')): return await self._extract_word_text(file_content) elif file_mime_type in XLSX_MIME and file.file_type.endswith(("xlsx", "xls")): return await self._extract_xlsx_text(file_content)