[ADD] Support parsing of unstructured data (MP3, MP4, etc.)

This commit is contained in:
lixiangcheng1
2025-12-24 17:50:03 +08:00
parent ef6fca5317
commit 6338edda11
7 changed files with 59 additions and 10 deletions

View File

@@ -252,7 +252,7 @@ class QWenCV(GptV4):
"fps": 2,
},
{
"text": "Please summarize this video in proper sentences.",
"text": "视频的内容是什么?,并且,请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.",
},
],
}

View File

@@ -46,11 +46,12 @@ class GPTSeq2txt(Base):
class QWenSeq2txt(Base):
_FACTORY_NAME = "Tongyi-Qianwen"
def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs):
import dashscope
dashscope.api_key = key
self.model_name = model_name
self.lang = lang
def transcription(self, audio_path):
if "paraformer" in self.model_name or "sensevoice" in self.model_name:
@@ -62,14 +63,21 @@ class QWenSeq2txt(Base):
messages = [
{
"role": "user",
"content": [{"audio": audio_path}],
"content": [
{
"audio": audio_path
},
{
"text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?",
},
],
}
]
response = None
full_content = ""
try:
response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True)
for response in response:
try:
full_content += response["output"]["choices"][0]["message"].content[0]["text"]