[ADD] Support parsing of unstructured data (MP3, MP4, etc.)

This commit is contained in:
lixiangcheng1
2025-12-24 17:50:03 +08:00
parent ef6fca5317
commit 6338edda11
7 changed files with 59 additions and 10 deletions

View File

@@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
tmp_path = os.path.abspath(tmpf.name)
callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
ans = seq2txt_mdl.transcription(tmp_path)
ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path)
callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
tokenize(doc, ans, eng)

View File

@@ -13,6 +13,7 @@ from PIL import Image
import copy
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
from app.core.rag.common.file_utils import get_project_base_directory
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
@@ -809,13 +810,31 @@ if __name__ == "__main__":
# chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
# Prepare to configure vision_model information
vision_model = QWenCV(
key="sk-8e9e40cd171749858ce2d3722ea75669",
model_name="qwen-vl-max",
# 文字+图片
# vision_model = QWenCV(
# key="",
# model_name="qwen-vl-max",
# lang="chinese", # 默认使用中文
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
# )
# 音频
vision_model = QWenSeq2txt(
key="",
model_name="qwen3-omni-flash",
lang="chinese", # 默认使用中文
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)
# 视频
# vision_model = QWenCV(
# key="",
# model_name="qwen3-omni-flash",
# lang="chinese", # 默认使用中文
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
# )
def progress_callback(prog=None, msg=None):
print(f"prog: {prog} msg: {msg}\n")

View File

@@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try:
doc.update({"doc_type_kwd": "video"})
ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
ans += "\n" + ans
tokenize(doc, ans, eng)
return [doc]
except Exception as e:

View File

@@ -252,7 +252,7 @@ class QWenCV(GptV4):
"fps": 2,
},
{
"text": "Please summarize this video in proper sentences.",
"text": "视频的内容是什么?,并且,请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.",
},
],
}

View File

@@ -46,11 +46,12 @@ class GPTSeq2txt(Base):
class QWenSeq2txt(Base):
_FACTORY_NAME = "Tongyi-Qianwen"
def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs):
import dashscope
dashscope.api_key = key
self.model_name = model_name
self.lang = lang
def transcription(self, audio_path):
if "paraformer" in self.model_name or "sensevoice" in self.model_name:
@@ -62,14 +63,21 @@ class QWenSeq2txt(Base):
messages = [
{
"role": "user",
"content": [{"audio": audio_path}],
"content": [
{
"audio": audio_path
},
{
"text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?",
},
],
}
]
response = None
full_content = ""
try:
response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True)
for response in response:
try:
full_content += response["output"]["choices"][0]["message"].content[0]["text"]

View File

@@ -6,6 +6,7 @@ import uuid
from datetime import datetime, timezone
from math import ceil
from typing import Any, Dict, List, Optional
import re
import redis
import requests
@@ -16,6 +17,7 @@ from app.core.config import settings
from app.core.rag.graphrag.utils import get_llm_cache, set_llm_cache
from app.core.rag.llm.chat_model import Base
from app.core.rag.llm.cv_model import QWenCV
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
from app.core.rag.models.chunk import DocumentChunk
from app.core.rag.prompts.generator import question_proposal
from app.core.rag.vdb.elasticsearch.elasticsearch_vector import (
@@ -83,6 +85,22 @@ def parse_document(file_path: str, document_id: uuid.UUID):
lang="Chinese",
base_url=db_knowledge.image2text.api_keys[0].api_base
)
if re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", file_path, re.IGNORECASE):
vision_model = QWenSeq2txt(
key=os.getenv("QWEN3_OMNI_API_KEY", ""),
model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
lang="Chinese",
base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
)
elif re.search(r"\.(png|jpeg|jpg|gif|bmp|svg|mp4|mov|avi|flv|mpeg|mpg|webm|wmv|3gp|3gpp|mkv?)$", file_path, re.IGNORECASE):
vision_model = QWenCV(
key=os.getenv("QWEN3_OMNI_API_KEY", ""),
model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
lang="Chinese",
base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
)
else:
print(file_path)
from app.core.rag.app.naive import chunk
res = chunk(filename=file_path,
from_page=0,

View File

@@ -83,6 +83,11 @@ TEXTLN_APISERVER=https://api.textin.com/ai/service/v1/pdf_to_markdown
TEXTLN_APP_ID=
TEXTLN_SECRET_CODE=
# vision model
QWEN3_OMNI_API_KEY=
QWEN3_OMNI_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
QWEN3_OMNI_MODEL_NAME=qwen3-omni-flash
# VOLC ASR
VOLC_APP_KEY=
VOLC_ACCESS_KEY=