[ADD] Support parsing of unstructured data (MP3, MP4, etc.)
This commit is contained in:
@@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
|
||||
tmp_path = os.path.abspath(tmpf.name)
|
||||
|
||||
callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
|
||||
ans = seq2txt_mdl.transcription(tmp_path)
|
||||
ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path)
|
||||
callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
|
||||
|
||||
tokenize(doc, ans, eng)
|
||||
|
||||
@@ -13,6 +13,7 @@ from PIL import Image
|
||||
import copy
|
||||
|
||||
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
|
||||
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
|
||||
from app.core.rag.common.file_utils import get_project_base_directory
|
||||
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
||||
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
|
||||
@@ -809,13 +810,31 @@ if __name__ == "__main__":
|
||||
# chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
||||
# Prepare to configure vision_model information
|
||||
vision_model = QWenCV(
|
||||
key="sk-8e9e40cd171749858ce2d3722ea75669",
|
||||
model_name="qwen-vl-max",
|
||||
# 文字+图片
|
||||
# vision_model = QWenCV(
|
||||
# key="",
|
||||
# model_name="qwen-vl-max",
|
||||
# lang="chinese", # 默认使用中文
|
||||
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
# )
|
||||
|
||||
# 音频
|
||||
vision_model = QWenSeq2txt(
|
||||
key="",
|
||||
model_name="qwen3-omni-flash",
|
||||
lang="chinese", # 默认使用中文
|
||||
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
)
|
||||
|
||||
|
||||
# 视频
|
||||
# vision_model = QWenCV(
|
||||
# key="",
|
||||
# model_name="qwen3-omni-flash",
|
||||
# lang="chinese", # 默认使用中文
|
||||
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
# )
|
||||
|
||||
def progress_callback(prog=None, msg=None):
|
||||
print(f"prog: {prog} msg: {msg}\n")
|
||||
|
||||
|
||||
@@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
|
||||
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
|
||||
try:
|
||||
doc.update({"doc_type_kwd": "video"})
|
||||
ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
|
||||
ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
|
||||
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
|
||||
ans += "\n" + ans
|
||||
tokenize(doc, ans, eng)
|
||||
return [doc]
|
||||
except Exception as e:
|
||||
|
||||
@@ -252,7 +252,7 @@ class QWenCV(GptV4):
|
||||
"fps": 2,
|
||||
},
|
||||
{
|
||||
"text": "Please summarize this video in proper sentences.",
|
||||
"text": "视频的内容是什么?,并且,请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
@@ -46,11 +46,12 @@ class GPTSeq2txt(Base):
|
||||
class QWenSeq2txt(Base):
|
||||
_FACTORY_NAME = "Tongyi-Qianwen"
|
||||
|
||||
def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
|
||||
def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs):
|
||||
import dashscope
|
||||
|
||||
dashscope.api_key = key
|
||||
self.model_name = model_name
|
||||
self.lang = lang
|
||||
|
||||
def transcription(self, audio_path):
|
||||
if "paraformer" in self.model_name or "sensevoice" in self.model_name:
|
||||
@@ -62,14 +63,21 @@ class QWenSeq2txt(Base):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"audio": audio_path}],
|
||||
"content": [
|
||||
{
|
||||
"audio": audio_path
|
||||
},
|
||||
{
|
||||
"text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = None
|
||||
full_content = ""
|
||||
try:
|
||||
response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
|
||||
response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True)
|
||||
for response in response:
|
||||
try:
|
||||
full_content += response["output"]["choices"][0]["message"].content[0]["text"]
|
||||
|
||||
Reference in New Issue
Block a user