diff --git a/api/app/controllers/public_share_controller.py b/api/app/controllers/public_share_controller.py index 0e666898..34572964 100644 --- a/api/app/controllers/public_share_controller.py +++ b/api/app/controllers/public_share_controller.py @@ -663,6 +663,7 @@ async def config_query( content = { "app_type": release.app.type, "variables": workflow_service.get_start_node_variables(release.config), + "memory": workflow_service.is_memory_enable(release.config), "features": release.config.get("features") } elif release.app.type == AppType.AGENT: diff --git a/api/app/services/memory_perceptual_service.py b/api/app/services/memory_perceptual_service.py index 53d935fe..8a7c86e2 100644 --- a/api/app/services/memory_perceptual_service.py +++ b/api/app/services/memory_perceptual_service.py @@ -5,12 +5,14 @@ from urllib.parse import urlparse, unquote import json_repair from jinja2 import Template +from sqlalchemy import select from sqlalchemy.orm import Session from app.core.error_codes import BizCode from app.core.exceptions import BusinessException from app.core.logging_config import get_business_logger from app.core.models import RedBearLLM, RedBearModelConfig +from app.models import FileMetadata from app.models.memory_perceptual_model import PerceptualType, FileStorageService from app.models.prompt_optimizer_model import RoleType from app.repositories.memory_perceptual_repository import MemoryPerceptualRepository @@ -245,6 +247,18 @@ class MemoryPerceptualService: filename = os.path.basename(path) filename = unquote(filename) file_ext = os.path.splitext(filename)[1] + try: + file_id = uuid.UUID(filename) + stmt = select(FileMetadata).where( + FileMetadata.id == file_id + ) + file = self.db.execute(stmt).scalar_one_or_none() + + if file: + filename = file.file_name + file_ext = file.file_ext + except ValueError: + business_logger.debug(f"Remote file, file_id={filename}") if not file_ext: if file_type == FileType.AUDIO: file_ext = ".mp3" @@ -262,17 +276,17 @@ class MemoryPerceptualService: } if file_type in [FileType.IMAGE, FileType.VIDEO]: file_modalities = { - "scene": content.get("scene") + "scene": content.get("scene", []) } elif file_type in [FileType.DOCUMENT]: file_modalities = { - "section_count": content.get("section_count"), - "title": content.get("title"), - "first_line": content.get("first_line") + "section_count": content.get("section_count", 0), + "title": content.get("title", ""), + "first_line": content.get("first_line", "") } else: file_modalities = { - "speaker_count": content.get("speaker_count") + "speaker_count": content.get("speaker_count", 0) } self.repository.create_perceptual_memory( end_user_id=uuid.UUID(end_user_id), @@ -280,7 +294,7 @@ class MemoryPerceptualService: file_path=file_url, file_name=filename, file_ext=file_ext, - summary=content.get('summary'), + summary=content.get('summary', ""), meta_data={ "content": file_content, "modalities": file_modalities diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index 908ba953..1f0e1cc2 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -59,22 +59,22 @@ class MultimodalFormatStrategy(ABC): self.file = file @abstractmethod - async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]: + async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]: """格式化图片""" pass @abstractmethod - async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]: """格式化文档""" pass @abstractmethod - async def format_audio(self, file_type: str, url: str, content: bytes | None = None) -> Dict[str, Any]: + async def format_audio(self, file_type: str, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]: """格式化音频""" pass @abstractmethod - async def format_video(self, url: str) -> Dict[str, Any]: + async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]: """格式化视频""" pass @@ -82,16 +82,16 @@ class MultimodalFormatStrategy(ABC): class DashScopeFormatStrategy(MultimodalFormatStrategy): """通义千问策略""" - async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]: + async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]: """通义千问图片格式:{"type": "image", "image": "url"}""" - return { + return True, { "type": "image", "image": url } - async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]: """通义千问文档格式""" - return { + return True, { "type": "text", "text": f"\n{text}\n" } @@ -102,26 +102,26 @@ class DashScopeFormatStrategy(MultimodalFormatStrategy): url: str, content: bytes | None = None, transcription: Optional[str] = None - ) -> Dict[str, Any]: + ) -> tuple[bool, Dict[str, Any]]: """ 通义千问音频格式 - 原生支持: qwen-audio 系列 - 其他模型: 需要转录为文本 """ if transcription: - return { + return True, { "type": "text", "text": f"" } # 通义千问音频格式:{"type": "audio", "audio": "url"} - return { + return True, { "type": "audio", "audio": url } - async def format_video(self, url: str) -> Dict[str, Any]: + async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]: """通义千问视频格式(qwen-vl 系列原生支持)""" - return { + return True, { "type": "video", "video": url } @@ -130,7 +130,7 @@ class DashScopeFormatStrategy(MultimodalFormatStrategy): class BedrockFormatStrategy(MultimodalFormatStrategy): """Bedrock/Anthropic 策略""" - async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]: + async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]: """ Bedrock/Anthropic 格式: base64 编码 {"type": "image", "source": {"type": "base64", "media_type": "...", "data": "..."}} @@ -153,7 +153,7 @@ class BedrockFormatStrategy(MultimodalFormatStrategy): logger.info(f"图片编码完成: media_type={media_type}, size={len(base64_data)}") - return { + return True, { "type": "image", "source": { "type": "base64", @@ -162,13 +162,13 @@ class BedrockFormatStrategy(MultimodalFormatStrategy): } } - async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]: """Bedrock/Anthropic 文档格式(需要 base64 编码)""" # Bedrock 文档需要 base64 编码 text_bytes = text.encode('utf-8') base64_text = base64.b64encode(text_bytes).decode('utf-8') - return { + return True, { "type": "document", "source": { "type": "base64", @@ -182,24 +182,24 @@ class BedrockFormatStrategy(MultimodalFormatStrategy): url: str, content: bytes | None = None, transcription: Optional[str] = None - ) -> Dict[str, Any]: + ) -> tuple[bool, Dict[str, Any]]: """ Bedrock/Anthropic 音频格式 不支持原生音频,必须转录为文本 """ if transcription: - return { + return True, { "type": "text", "text": f"[音频转录]\n{transcription}" } - return { + return False, { "type": "text", "text": "[音频文件:Bedrock 不支持原生音频,请启用音频转文本功能]" } - async def format_video(self, url: str) -> Dict[str, Any]: + async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]: """Bedrock/Anthropic 视频格式""" - return { + return False, { "type": "text", "text": f"" } @@ -208,18 +208,18 @@ class BedrockFormatStrategy(MultimodalFormatStrategy): class OpenAIFormatStrategy(MultimodalFormatStrategy): """OpenAI 策略""" - async def format_image(self, url: str, content: bytes | None = None) -> Dict[str, Any]: + async def format_image(self, url: str, content: bytes | None = None) -> tuple[bool, Dict[str, Any]]: """OpenAI 格式: {"type": "image_url", "image_url": {"url": "..."}}""" - return { + return True, { "type": "image_url", "image_url": { "url": url } } - async def format_document(self, file_name: str, text: str) -> Dict[str, Any]: + async def format_document(self, file_name: str, text: str) -> tuple[bool, Dict[str, Any]]: """OpenAI 文档格式""" - return { + return True, { "type": "text", "text": f"\n{text}\n" } @@ -230,14 +230,14 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy): url: str, content: bytes | None = None, transcription: Optional[str] = None - ) -> Dict[str, Any]: + ) -> tuple[bool, Dict[str, Any]]: """ OpenAI 音频格式 - gpt-4o-audio 系列支持原生音频(需要 base64 编码) - 其他模型使用转录文本 """ if transcription: - return { + return True, { "type": "text", "text": f"" } @@ -266,7 +266,7 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy): # supported_ext = {"wav", "mp3", "mp4", "ogg", "flac", "webm", "m4a", "wave", "x-m4a"} file_ext = "wav" if not file_ext else file_ext - return { + return True, { "type": "input_audio", "input_audio": { "data": f"data:;base64,{base64_audio}", @@ -275,14 +275,14 @@ class OpenAIFormatStrategy(MultimodalFormatStrategy): } except Exception as e: logger.error(f"下载音频失败: {e}") - return { + return False, { "type": "text", "text": f"[音频处理失败: {str(e)}]" } - async def format_video(self, url: str) -> Dict[str, Any]: + async def format_video(self, url: str) -> tuple[bool, Dict[str, Any]]: """OpenAI 视频格式""" - return { + return True, { "type": "video_url", "video_url": { "url": url @@ -377,21 +377,25 @@ class MultimodalService: file.url = await self.get_file_url(file) try: if file.type == FileType.IMAGE and "vision" in self.capability: - content = await self._process_image(file, strategy) + is_support, content = await self._process_image(file, strategy) result.append(content) - self.write_perceptual_memory(end_user_id, file.type, file.url, content) + if is_support: + self.write_perceptual_memory(end_user_id, file.type, file.url, content) elif file.type == FileType.DOCUMENT: - content = await self._process_document(file, strategy) + is_support, content = await self._process_document(file, strategy) result.append(content) - self.write_perceptual_memory(end_user_id, file.type, file.url, content) + if is_support: + self.write_perceptual_memory(end_user_id, file.type, file.url, content) elif file.type == FileType.AUDIO and "audio" in self.capability: - content = await self._process_audio(file, strategy) + is_support, content = await self._process_audio(file, strategy) result.append(content) - self.write_perceptual_memory(end_user_id, file.type, file.url, content) + if is_support: + self.write_perceptual_memory(end_user_id, file.type, file.url, content) elif file.type == FileType.VIDEO and "video" in self.capability: - content = await self._process_video(file, strategy) + is_support, content = await self._process_video(file, strategy) result.append(content) - self.write_perceptual_memory(end_user_id, file.type, file.url, content) + if is_support: + self.write_perceptual_memory(end_user_id, file.type, file.url, content) else: logger.warning(f"不支持的文件类型: {file.type}") except Exception as e: @@ -424,7 +428,7 @@ class MultimodalService: if end_user_id and self.api_config: write_perceptual_memory.delay(end_user_id, self.api_config.model_dump(), file_type, file_url, file_message) - async def _process_image(self, file: FileInput, strategy) -> Dict[str, Any]: + async def _process_image(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]: """ 处理图片文件 @@ -440,12 +444,12 @@ class MultimodalService: return await strategy.format_image(file.url, content=file.get_content()) except Exception as e: logger.error(f"处理图片失败: {e}", exc_info=True) - return { + return False, { "type": "text", "text": f"[图片处理失败: {str(e)}]" } - async def _process_document(self, file: FileInput, strategy) -> Dict[str, Any]: + async def _process_document(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]: """ 处理文档文件(PDF、Word 等) @@ -457,7 +461,7 @@ class MultimodalService: Dict: 根据 provider 返回不同格式的文档内容 """ if file.transfer_method == TransferMethod.REMOTE_URL: - return { + return True, { "type": "text", "text": f"\n{await self._extract_document_text(file)}\n" } @@ -475,7 +479,7 @@ class MultimodalService: # 使用策略格式化文档 return await strategy.format_document(file_name, text) - async def _process_audio(self, file: FileInput, strategy) -> Dict[str, Any]: + async def _process_audio(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]: """ 处理音频文件 @@ -503,12 +507,12 @@ class MultimodalService: return await strategy.format_audio(file.file_type, file.url, file.get_content(), transcription) except Exception as e: logger.error(f"处理音频失败: {e}", exc_info=True) - return { + return False, { "type": "text", "text": f"[音频处理失败: {str(e)}]" } - async def _process_video(self, file: FileInput, strategy) -> Dict[str, Any]: + async def _process_video(self, file: FileInput, strategy) -> tuple[bool, Dict[str, Any]]: """ 处理视频文件 @@ -524,7 +528,7 @@ class MultimodalService: return await strategy.format_video(file.url) except Exception as e: logger.error(f"处理视频失败: {e}", exc_info=True) - return { + return False, { "type": "text", "text": f"[视频处理失败: {str(e)}]" } diff --git a/api/app/services/workflow_service.py b/api/app/services/workflow_service.py index 4e7268d3..7aca3c2f 100644 --- a/api/app/services/workflow_service.py +++ b/api/app/services/workflow_service.py @@ -868,6 +868,14 @@ class WorkflowService: return node.get("config", {}).get("variables", []) raise BusinessException("workflow config error - start node not found") + @staticmethod + def is_memory_enable(config: dict) -> bool: + nodes = config.get("nodes", []) + for node in nodes: + if node.get("type") in [NodeType.MEMORY_READ, NodeType.MEMORY_WRITE]: + return True + return False + # ==================== 依赖注入函数 ====================