feat(model and app):

1. Increase support for visual models and multimodal models; 2. The application and workflow can input various multimodal files such as images, documents, audio, and videos.
2026-03-05 09:55:54 +08:00
parent 23bfdcefef
commit 590ec3a446
26 changed files with 958 additions and 233 deletions
--- a/api/app/services/audio_transcription_service.py
+++ b/api/app/services/audio_transcription_service.py
@@ -0,0 +1,101 @@
+"""
+音频转文本服务
+
+支持的服务商:
+- DashScope (阿里云通义千问)
+- OpenAI Whisper
+"""
+import httpx
+
+from app.core.logging_config import get_business_logger
+
+logger = get_business_logger()
+
+
+class AudioTranscriptionService:
+    """音频转文本服务"""
+
+    @staticmethod
+    async def transcribe_dashscope(audio_url: str, api_key: str) -> str:
+        """
+        使用阿里云通义千问语音识别服务转换音频为文本
+        
+        Args:
+            audio_url: 音频文件 URL
+            api_key: DashScope API Key
+            
+        Returns:
+            str: 转录的文本
+        """
+        try:
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                response = await client.post(
+                    "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription",
+                    headers={
+                        "Authorization": f"Bearer {api_key}",
+                        "Content-Type": "application/json",
+                        "X-DashScope-Async": "enable",
+                    },
+                    json={
+                        "model": "paraformer-v2",
+                        "input": {
+                            "file_urls": [audio_url]
+                        },
+                        "parameters": {
+                            "language_hints": ["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]
+                        }
+                    }
+                )
+                response.raise_for_status()
+                result = response.json()
+                
+                if result.get("output", {}).get("results"):
+                    text = result["output"]["results"][0].get("transcription_text", "")
+                    logger.info(f"音频转文本成功: {len(text)} 字符")
+                    return text
+                
+                return "[音频转文本失败]"
+                
+        except Exception as e:
+            logger.error(f"DashScope 音频转文本失败: {e}")
+            return f"[音频转文本失败: {str(e)}]"
+
+    @staticmethod
+    async def transcribe_openai(audio_url: str, api_key: str) -> str:
+        """
+        使用 OpenAI Whisper 转换音频为文本
+        
+        Args:
+            audio_url: 音频文件 URL
+            api_key: OpenAI API Key
+            
+        Returns:
+            str: 转录的文本
+        """
+        try:
+            # 下载音频文件
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                audio_response = await client.get(audio_url)
+                audio_response.raise_for_status()
+                audio_data = audio_response.content
+                
+                # 调用 Whisper API
+                files = {"file": ("audio.mp3", audio_data, "audio/mpeg")}
+                data = {"model": "whisper-1"}
+                
+                response = await client.post(
+                    "https://api.openai.com/v1/audio/transcriptions",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files=files,
+                    data=data
+                )
+                response.raise_for_status()
+                result = response.json()
+                
+                text = result.get("text", "")
+                logger.info(f"音频转文本成功: {len(text)} 字符")
+                return text
+                
+        except Exception as e:
+            logger.error(f"OpenAI Whisper 音频转文本失败: {e}")
+            return f"[音频转文本失败: {str(e)}]"