From f6efa0d71135de41cb31817e36b4dff79549f7fe Mon Sep 17 00:00:00 2001
From: Timebomb2018 <18868801967@163.com>
Date: Wed, 18 Mar 2026 22:29:10 +0800
Subject: [PATCH] fix(agent): read docx multimodal files; record multimodal
 attachments in message history

---
 api/app/services/app_chat_service.py   | 104 +++++++++++++++++--------
 api/app/services/draft_run_service.py  |  58 ++++++++++----
 api/app/services/multimodal_service.py |   5 +-
 3 files changed, 116 insertions(+), 51 deletions(-)

diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py
index cd9d3e81..58beea53 100644
--- a/api/app/services/app_chat_service.py
+++ b/api/app/services/app_chat_service.py
@@ -24,6 +24,7 @@ from app.services.model_service import ModelApiKeyService
 from app.services.multi_agent_orchestrator import MultiAgentOrchestrator
 from app.services.multimodal_service import MultimodalService
 from app.services.workflow_service import WorkflowService
+from app.schemas import FileType
 
 logger = get_business_logger()
 
@@ -156,20 +157,6 @@ class AppChatService:
             files=processed_files  # 传递处理后的文件
         )
 
-        # 保存消息
-        message_id = self.conversation_service.save_conversation_messages(
-            conversation_id=conversation_id,
-            user_message=message,
-            assistant_message=result["content"],
-            meta_data={
-                "usage": result.get("usage", {
-                    "prompt_tokens": 0,
-                    "completion_tokens": 0,
-                    "total_tokens": 0
-                })
-            }
-        )
-
         ModelApiKeyService.record_api_key_usage(self.db, api_key_obj.id)
 
         elapsed_time = time.time() - start_time
@@ -191,6 +178,40 @@ class AppChatService:
             tenant_id=tenant_id, workspace_id=workspace_id
         )
 
+        # Build the user message content (including multimodal files)
+        human_meta = {
+            "files": []
+        }
+        assistant_meta = {
+            "model": api_key_obj.model_name,
+            "usage": result.get("usage", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}),
+            "audio_url": None
+        }
+        if files:
+            for f in files:
+                # NOTE(review): every file is recorded as FileType.IMAGE, even non-image uploads - confirm
+                human_meta["files"].append({
+                    "type": FileType.IMAGE,
+                    "url": f.url
+                })
+
+        # Save the messages
+        if audio_url:
+            assistant_meta["audio_url"] = audio_url
+        self.conversation_service.add_message(
+            conversation_id=conversation_id,
+            role="user",
+            content=message,
+            meta_data=human_meta
+        )
+        ai_message = self.conversation_service.add_message(
+            conversation_id=conversation_id,
+            role="assistant",
+            content=result["content"],
+            meta_data=assistant_meta
+        )
+        message_id = ai_message.id
+
         return {
             "conversation_id": conversation_id,
             "message_id": str(message_id),
@@ -344,24 +365,6 @@ class AppChatService:
 
             elapsed_time = time.time() - start_time
 
-            # 保存消息
-            self.conversation_service.add_message(
-                conversation_id=conversation_id,
-                role="user",
-                content=message
-            )
-
-            self.conversation_service.add_message(
-                message_id=message_id,
-                conversation_id=conversation_id,
-                role="assistant",
-                content=full_content,
-                meta_data={
-                    "model": api_key_obj.model_name,
-                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens}
-                }
-            )
-
             ModelApiKeyService.record_api_key_usage(self.db, api_key_obj.id)
 
             # 发送结束事件（包含 suggested_questions、tts、citations）
@@ -373,13 +376,48 @@ class AppChatService:
                     {"model_name": api_key_obj.model_name, "api_key": api_key_obj.api_key,
                      "api_base": api_key_obj.api_base}, {}
                 )
-            end_data["audio_url"] = await self.agent_service._generate_tts(
+            stream_audio_url = await self.agent_service._generate_tts(
                 features_config, full_content,
                 {"model_name": api_key_obj.model_name, "api_key": api_key_obj.api_key,
                  "api_base": api_key_obj.api_base, "provider": api_key_obj.provider},
                 tenant_id=tenant_id, workspace_id=workspace_id
             )
+            end_data["audio_url"] = stream_audio_url
             end_data["citations"] = self.agent_service._filter_citations(features_config, [])
+
+            # Save the messages
+            human_meta = {
+                "files":[]
+            }
+            assistant_meta = {
+                "model": api_key_obj.model_name,
+                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens},
+                "audio_url": None
+            }
+
+            if files:
+                for f in files:
+                    # NOTE(review): every file is recorded as FileType.IMAGE, even non-image uploads - confirm
+                    human_meta["files"].append({
+                        "type": FileType.IMAGE,
+                        "url": f.url
+                    })
+
+            if stream_audio_url:
+                assistant_meta["audio_url"] = stream_audio_url
+            self.conversation_service.add_message(
+                conversation_id=conversation_id,
+                role="user",
+                content=message,
+                meta_data=human_meta
+            )
+            self.conversation_service.add_message(
+                message_id=message_id,
+                conversation_id=conversation_id,
+                role="assistant",
+                content=full_content,
+                meta_data=assistant_meta
+            )
             yield f"event: end\ndata: {json.dumps(end_data, ensure_ascii=False)}\n\n"
 
             logger.info(
diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py
index 92b13bfc..5b8bbfa7 100644
--- a/api/app/services/draft_run_service.py
+++ b/api/app/services/draft_run_service.py
@@ -37,6 +37,7 @@ from app.services.model_parameter_merger import ModelParameterMerger
 from app.services.model_service import ModelApiKeyService
 from app.services.multimodal_service import MultimodalService
 from app.services.tool_service import ToolService
+from app.schemas import FileType
 
 logger = get_business_logger()
 
@@ -636,7 +637,13 @@ class AgentRunService:
 
             ModelApiKeyService.record_api_key_usage(self.db, api_key_config.get("api_key_id"))
 
-            # 9. 保存会话消息
+            # 9. Generate the TTS audio_url (before saving messages, so it is stored in meta_data as well)
+            audio_url = await self._generate_tts(
+                features_config, result["content"], api_key_config,
+                tenant_id=tenant_id, workspace_id=workspace_id
+            ) if not sub_agent else None
+
+            # 10. Save the conversation messages
             if not sub_agent:
                 await self._save_conversation_message(
                     conversation_id=conversation_id,
@@ -650,7 +657,9 @@ class AgentRunService:
                             "completion_tokens": 0,
                             "total_tokens": 0
                         })
-                    }
+                    },
+                    files=files,
+                    audio_url=audio_url
                 )
 
             response = {
@@ -666,10 +675,7 @@ class AgentRunService:
                     features_config, result["content"], api_key_config, effective_params
                 ) if not sub_agent else [],
                 "citations": self._filter_citations(features_config, result.get("citations", [])),
-                "audio_url": await self._generate_tts(
-                    features_config, result["content"], api_key_config,
-                    tenant_id=tenant_id, workspace_id=workspace_id
-                ) if not sub_agent else None,
+                "audio_url": audio_url,
             }
 
             logger.info(
@@ -878,7 +884,13 @@ class AgentRunService:
                     "total_tokens": total_tokens
                 })
 
-            # 10. 保存会话消息
+            # 10. Generate the audio_url (before saving messages, so it is stored in meta_data as well)
+            stream_audio_url = await self._generate_tts(
+                features_config, full_content, api_key_config,
+                tenant_id=tenant_id, workspace_id=workspace_id
+            ) if not sub_agent else None
+
+            # 11. Save the conversation messages
             if not sub_agent:
                 await self._save_conversation_message(
                     conversation_id=conversation_id,
@@ -888,10 +900,12 @@ class AgentRunService:
                     user_id=user_id,
                     meta_data={
                         "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": total_tokens}
-                    }
+                    },
+                    files=files,
+                    audio_url=stream_audio_url
                 )
 
-            # 11. 发送结束事件（包含 suggested_questions 和 tts）
+            # 12. Send the end event (includes suggested_questions and tts)
             end_data: Dict[str, Any] = {
                 "conversation_id": conversation_id,
                 "elapsed_time": elapsed_time,
@@ -901,10 +915,7 @@ class AgentRunService:
                 end_data["suggested_questions"] = await self._generate_suggested_questions(
                     features_config, full_content, api_key_config, effective_params
                 )
-                end_data["audio_url"] = await self._generate_tts(
-                    features_config, full_content, api_key_config,
-                    tenant_id=tenant_id, workspace_id=workspace_id
-                )
+                end_data["audio_url"] = stream_audio_url
                 end_data["citations"] = self._filter_citations(features_config, [])
             yield self._format_sse_event("end", end_data)
 
@@ -1143,7 +1154,9 @@ class AgentRunService:
             assistant_message: str,
             meta_data: dict,
             app_id: Optional[uuid.UUID] = None,
-            user_id: Optional[str] = None
+            user_id: Optional[str] = None,
+            files: Optional[List[FileInput]] = None,
+            audio_url: Optional[str] = None
     ) -> None:
         """保存会话消息（会话已通过 _ensure_conversation 确保存在）
 
@@ -1162,13 +1175,26 @@ class AgentRunService:
             conv_uuid = uuid.UUID(conversation_id)
 
             # 保存消息（会话已经存在）
+            human_meta = {
+                "files": []
+            }
+            if files:
+                for f in files:
+                    # NOTE(review): every file is recorded as FileType.IMAGE, even non-image uploads - confirm
+                    human_meta["files"].append({
+                        "type": FileType.IMAGE,
+                        "url": f.url
+                    })
             # 保存用户消息
             conversation_service.add_message(
                 conversation_id=conv_uuid,
                 role="user",
-                content=user_message
+                content=user_message,
+                meta_data=human_meta
             )
-            # 保存助手消息
+            # Save the assistant message; audio_url is written into the caller's meta_data dict in place
+            if audio_url:
+                meta_data["audio_url"] = audio_url
             conversation_service.add_message(
                 conversation_id=conv_uuid,
                 role="assistant",
diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py
index 1f0e1cc2..f0c7cee2 100644
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -41,7 +41,8 @@ TEXT_MIME = ['text/plain', 'text/x-markdown']
 PDF_MIME = ['application/pdf']
 DOC_MIME = [
     'application/msword',
-    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    'application/zip'
 ]
 XLSX_MIME = [
     'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -590,7 +591,7 @@ class MultimodalService:
                 return file_content.decode("utf-8")
             elif file_mime_type in PDF_MIME:
                 return await self._extract_pdf_text(file_content)
-            elif file_mime_type in DOC_MIME:
+            elif file_mime_type in DOC_MIME and file.file_type.endswith(('docx', 'doc')):
                 return await self._extract_word_text(file_content)
             elif file_mime_type in XLSX_MIME and file.file_type.endswith(("xlsx", "xls")):
                 return await self._extract_xlsx_text(file_content)