From 262a9ddc4853f7d9a0a74da98c5da3f9271c0d8e Mon Sep 17 00:00:00 2001
From: Eternity <1533512157@qq.com>
Date: Tue, 17 Mar 2026 10:30:09 +0800
Subject: [PATCH 1/3] fix(multimodal): filter unsupported files during
 perceptual memory write

---
 api/app/services/memory_perceptual_service.py |  12 +--
 api/app/services/multimodal_service.py        | 100 +++++++++---------
 2 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/api/app/services/memory_perceptual_service.py b/api/app/services/memory_perceptual_service.py
index 53d935fe..580a8857 100644
--- a/api/app/services/memory_perceptual_service.py
+++ b/api/app/services/memory_perceptual_service.py
@@ -262,17 +262,17 @@ class MemoryPerceptualService:
         }
         if file_type in [FileType.IMAGE, FileType.VIDEO]:
             file_modalities = {
-                "scene": content.get("scene")
+                "scene": content.get("scene", [])
             }
         elif file_type in [FileType.DOCUMENT]:
             file_modalities = {
-                "section_count": content.get("section_count"),
-                "title": content.get("title"),
-                "first_line": content.get("first_line")
+                "section_count": content.get("section_count", 0),
+                "title": content.get("title", ""),
+                "first_line": content.get("first_line", "")
             }
         else:
             file_modalities = {
-                "speaker_count": content.get("speaker_count")
+                "speaker_count": content.get("speaker_count", 0)
             }
         self.repository.create_perceptual_memory(
             end_user_id=uuid.UUID(end_user_id),
@@ -280,7 +280,7 @@ class MemoryPerceptualService:
             file_path=file_url,
             file_name=filename,
             file_ext=file_ext,
-            summary=content.get('summary'),
+            summary=content.get('summary', ""),
             meta_data={
                 "content": file_content,
                 "modalities": file_modalities
diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py
index 208f6ec0..3695c56f 100644
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -48,22 +48,22 @@ class MultimodalFormatStrategy(ABC):
         self.file = file
 
     @abstractmethod
-    async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]:
+    async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]:
         """格式化图片"""
         pass
 
     @abstractmethod
-    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+    async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]:
         """格式化文档"""
         pass
 
     @abstractmethod
-    async def format_audio(self, file_type: str, url: str, content: bytes | None = None) -> Dict[str, Any]:
+    async def format_audio(self, file_type: str, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]:
         """格式化音频"""
         pass
 
     @abstractmethod
-    async def format_video(self, url: str) -> Dict[str, Any]:
+    async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]:
         """格式化视频"""
         pass
 
@@ -71,16 +71,16 @@ class MultimodalFormatStrategy(ABC):
 class DashScopeFormatStrategy(MultimodalFormatStrategy):
     """通义千问策略"""
 
-    async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]:
+    async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]:
         """通义千问图片格式：{"type": "image", "image": "url"}"""
-        return {
+        return True, {
             "type": "image",
             "image": url
         }
 
-    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+    async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]:
         """通义千问文档格式"""
-        return {
+        return True, {
             "type": "text",
             "text": f"<document name=\"{file_name}\">\n{text}\n</document>"
         }
@@ -91,26 +91,26 @@ class DashScopeFormatStrategy(MultimodalFormatStrategy):
             url: str,
             content: bytes | None = None,
             transcription: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> tuple[bool, Dict[str, Any]]:
         """
         通义千问音频格式
         - 原生支持: qwen-audio 系列
         - 其他模型: 需要转录为文本
         """
         if transcription:
-            return {
+            return True, {
                 "type": "text",
                 "text": f"<audio url=\"{url}\">\ntext_transcription:{transcription}\n</audio>"
             }
         # 通义千问音频格式：{"type": "audio", "audio": "url"}
-        return {
+        return True, {
             "type": "audio",
             "audio": url
         }
 
-    async def format_video(self, url: str) -> Dict[str, Any]:
+    async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]:
         """通义千问视频格式（qwen-vl 系列原生支持）"""
-        return {
+        return True, {
             "type": "video",
             "video": url
         }
@@ -119,7 +119,7 @@ class DashScopeFormatStrategy(MultimodalFormatStrategy):
 class BedrockFormatStrategy(MultimodalFormatStrategy):
     """Bedrock/Anthropic 策略"""
 
-    async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]:
+    async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]:
         """
         Bedrock/Anthropic 格式: base64 编码
         {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}}
@@ -142,7 +142,7 @@ class BedrockFormatStrategy(MultimodalFormatStrategy):
 
         logger.info(f"图片编码完成: media_type={media_type}, size={len(base64_data)}")
 
-        return {
+        return True, {
             "type": "image",
             "source": {
                 "type": "base64",
@@ -151,13 +151,13 @@ class BedrockFormatStrategy(MultimodalFormatStrategy):
             }
         }
 
-    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+    async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]:
         """Bedrock/Anthropic 文档格式（需要 base64 编码）"""
         # Bedrock 文档需要 base64 编码
         text_bytes = text.encode('utf-8')
         base64_text = base64.b64encode(text_bytes).decode('utf-8')
 
-        return {
+        return True, {
             "type": "document",
             "source": {
                 "type": "base64",
@@ -171,24 +171,24 @@ class BedrockFormatStrategy(MultimodalFormatStrategy):
             url: str,
             content: bytes | None = None,
             transcription: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> tuple[bool, Dict[str, Any]]:
         """
         Bedrock/Anthropic 音频格式
         不支持原生音频，必须转录为文本
         """
         if transcription:
-            return {
+            return True, {
                 "type": "text",
                 "text": f"[音频转录]\n{transcription}"
             }
-        return {
+        return False, {
             "type": "text",
             "text": "[音频文件：Bedrock 不支持原生音频，请启用音频转文本功能]"
         }
 
-    async def format_video(self, url: str) -> Dict[str, Any]:
+    async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]:
         """Bedrock/Anthropic 视频格式"""
-        return {
+        return False, {
             "type": "text",
             "text": f"<video url=\"{url}\">\n[视频文件，当前 provider 暂不支持]\n</video>"
         }
@@ -197,18 +197,18 @@ class BedrockFormatStrategy(MultimodalFormatStrategy):
 class OpenAIFormatStrategy(MultimodalFormatStrategy):
     """OpenAI 策略"""
 
-    async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]:
+    async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]:
         """OpenAI 格式: {"type": "image_url", "image_url": {"url": "..."}}"""
-        return {
+        return True, {
             "type": "image_url",
             "image_url": {
                 "url": url
             }
         }
 
-    async def format_document(self, file_name: str, text: str) -> Dict[str, Any]:
+    async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]:
         """OpenAI 文档格式"""
-        return {
+        return True, {
             "type": "text",
             "text": f"<document name=\"{file_name}\">\n{text}\n</document>"
         }
@@ -219,14 +219,14 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy):
             url: str,
             content: bytes | None = None,
             transcription: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> tuple[bool, Dict[str, Any]]:
         """
         OpenAI 音频格式
         - gpt-4o-audio 系列支持原生音频（需要 base64 编码）
         - 其他模型使用转录文本
         """
         if transcription:
-            return {
+            return True, {
                 "type": "text",
                 "text": f"<audio url=\"{url}\">\n{transcription}\n</audio>"
             }
@@ -255,7 +255,7 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy):
             # supported_ext = {"wav", "mp3", "mp4", "ogg", "flac", "webm", "m4a", "wave", "x-m4a"}
             file_ext = "wav" if not file_ext else file_ext
 
-            return {
+            return True, {
                 "type": "input_audio",
                 "input_audio": {
                     "data": f"data:;base64,{base64_audio}",
@@ -264,14 +264,14 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy):
             }
         except Exception as e:
             logger.error(f"下载音频失败: {e}")
-            return {
+            return False, {
                 "type": "text",
                 "text": f"[音频处理失败: {str(e)}]"
             }
 
-    async def format_video(self, url: str) -> Dict[str, Any]:
+    async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]:
         """OpenAI 视频格式"""
-        return {
+        return True, {
             "type": "video_url",
             "video_url": {
                 "url": url
@@ -366,21 +366,25 @@ class MultimodalService:
                 file.url = await self.get_file_url(file)
             try:
                 if file.type == FileType.IMAGE and "vision" in self.capability:
-                    content = await self._process_image(file, strategy)
+                    is_support, content = await self._process_image(file, strategy)
                     result.append(content)
-                    self.write_perceptual_memory(end_user_id, file.type, file.url, content)
+                    if is_support:
+                        self.write_perceptual_memory(end_user_id, file.type, file.url, content)
                 elif file.type == FileType.DOCUMENT:
-                    content = await self._process_document(file, strategy)
+                    is_support, content = await self._process_document(file, strategy)
                     result.append(content)
-                    self.write_perceptual_memory(end_user_id, file.type, file.url, content)
+                    if is_support:
+                        self.write_perceptual_memory(end_user_id, file.type, file.url, content)
                 elif file.type == FileType.AUDIO and "audio" in self.capability:
-                    content = await self._process_audio(file, strategy)
+                    is_support, content = await self._process_audio(file, strategy)
                     result.append(content)
-                    self.write_perceptual_memory(end_user_id, file.type, file.url, content)
+                    if is_support:
+                        self.write_perceptual_memory(end_user_id, file.type, file.url, content)
                 elif file.type == FileType.VIDEO and "video" in self.capability:
-                    content = await self._process_video(file, strategy)
+                    is_support, content = await self._process_video(file, strategy)
                     result.append(content)
-                    self.write_perceptual_memory(end_user_id, file.type, file.url, content)
+                    if is_support:
+                        self.write_perceptual_memory(end_user_id, file.type, file.url, content)
                 else:
                     logger.warning(f"不支持的文件类型: {file.type}")
             except Exception as e:
@@ -413,7 +417,7 @@ class MultimodalService:
         if end_user_id and self.api_config:
             write_perceptual_memory.delay(end_user_id, self.api_config.model_dump(), file_type, file_url, file_message)
 
-    async def _process_image(self, file: FileInput, strategy) -> Dict[str, Any]:
+    async def _process_image(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
         """
         处理图片文件
         
@@ -429,12 +433,12 @@ class MultimodalService:
             return await strategy.format_image(file.url, content=file.get_content())
         except Exception as e:
             logger.error(f"处理图片失败: {e}", exc_info=True)
-            return {
+            return False, {
                 "type": "text",
                 "text": f"[图片处理失败: {str(e)}]"
             }
 
-    async def _process_document(self, file: FileInput, strategy) -> Dict[str, Any]:
+    async def _process_document(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
         """
         处理文档文件（PDF、Word 等）
         
@@ -446,7 +450,7 @@ class MultimodalService:
             Dict: 根据 provider 返回不同格式的文档内容
         """
         if file.transfer_method == TransferMethod.REMOTE_URL:
-            return {
+            return True, {
                 "type": "text",
                 "text": f"<document url=\"{file.url}\">\n{await self._extract_document_text(file)}\n</document>"
             }
@@ -464,7 +468,7 @@ class MultimodalService:
             # 使用策略格式化文档
             return await strategy.format_document(file_name, text)
 
-    async def _process_audio(self, file: FileInput, strategy) -> Dict[str, Any]:
+    async def _process_audio(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
         """
         处理音频文件
         
@@ -492,12 +496,12 @@ class MultimodalService:
             return await strategy.format_audio(file.file_type, file.url, file.get_content(), transcription)
         except Exception as e:
             logger.error(f"处理音频失败: {e}", exc_info=True)
-            return {
+            return False, {
                 "type": "text",
                 "text": f"[音频处理失败: {str(e)}]"
             }
 
-    async def _process_video(self, file: FileInput, strategy) -> Dict[str, Any]:
+    async def _process_video(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]:
         """
         处理视频文件
         
@@ -513,7 +517,7 @@ class MultimodalService:
             return await strategy.format_video(file.url)
         except Exception as e:
             logger.error(f"处理视频失败: {e}", exc_info=True)
-            return {
+            return False, {
                 "type": "text",
                 "text": f"[视频处理失败: {str(e)}]"
             }

From 8ddacb7bc90b760e602bb38a160838d96e3cd69e Mon Sep 17 00:00:00 2001
From: Eternity <1533512157@qq.com>
Date: Tue, 17 Mar 2026 17:24:02 +0800
Subject: [PATCH 2/3] fix(perceptual): resolve inconsistency between local
 filename and actual filename

---
 api/app/services/memory_perceptual_service.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/api/app/services/memory_perceptual_service.py b/api/app/services/memory_perceptual_service.py
index 580a8857..8a7c86e2 100644
--- a/api/app/services/memory_perceptual_service.py
+++ b/api/app/services/memory_perceptual_service.py
@@ -5,12 +5,14 @@ from urllib.parse import urlparse, unquote
 
 import json_repair
 from jinja2 import Template
+from sqlalchemy import select
 from sqlalchemy.orm import Session
 
 from app.core.error_codes import BizCode
 from app.core.exceptions import BusinessException
 from app.core.logging_config import get_business_logger
 from app.core.models import RedBearLLM, RedBearModelConfig
+from app.models import FileMetadata
 from app.models.memory_perceptual_model import PerceptualType, FileStorageService
 from app.models.prompt_optimizer_model import RoleType
 from app.repositories.memory_perceptual_repository import MemoryPerceptualRepository
@@ -245,6 +247,18 @@ class MemoryPerceptualService:
         filename = os.path.basename(path)
         filename = unquote(filename)
         file_ext = os.path.splitext(filename)[1]
+        try:
+            file_id = uuid.UUID(filename)
+            stmt = select(FileMetadata).where(
+                FileMetadata.id == file_id
+            )
+            file = self.db.execute(stmt).scalar_one_or_none()
+
+            if file:
+                filename = file.file_name
+                file_ext = file.file_ext
+        except ValueError:
+            business_logger.debug(f"Remote file, file_id={filename}")
         if not file_ext:
             if file_type == FileType.AUDIO:
                 file_ext = ".mp3"

From 3b8a806661159080509606bb471d6facce6b024c Mon Sep 17 00:00:00 2001
From: Eternity <1533512157@qq.com>
Date: Tue, 17 Mar 2026 18:01:28 +0800
Subject: [PATCH 3/3] feat(workflow): expose workflow memory-enabled status in
 app share config API

---
 api/app/controllers/public_share_controller.py | 1 +
 api/app/services/workflow_service.py           | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/api/app/controllers/public_share_controller.py b/api/app/controllers/public_share_controller.py
index 19c82790..b8fec55d 100644
--- a/api/app/controllers/public_share_controller.py
+++ b/api/app/controllers/public_share_controller.py
@@ -661,6 +661,7 @@ async def config_query(
         content = {
             "app_type": release.app.type,
             "variables": workflow_service.get_start_node_variables(release.config),
+            "memory":  workflow_service.is_memory_enable(release.config),
             "features": release.config.get("features")
         }
     elif release.app.type == AppType.AGENT:
diff --git a/api/app/services/workflow_service.py b/api/app/services/workflow_service.py
index 4e7268d3..7aca3c2f 100644
--- a/api/app/services/workflow_service.py
+++ b/api/app/services/workflow_service.py
@@ -868,6 +868,14 @@ class WorkflowService:
                 return node.get("config", {}).get("variables", [])
         raise BusinessException("workflow config error - start node not found")
 
+    @staticmethod
+    def is_memory_enable(config: dict) -> bool:
+        """Return True if the workflow config has a memory read or write node."""
+        memory_types = (NodeType.MEMORY_READ, NodeType.MEMORY_WRITE)
+        # `or []` also covers an explicit `"nodes": None`, which .get's default misses.
+        nodes = config.get("nodes") or []
+        return any(node.get("type") in memory_types for node in nodes)
+
 
 # ==================== 依赖注入函数 ====================