[ADD] Support parsing of unstructured media files (MP3, MP4, etc.)

This commit is contained in:
lixiangcheng1
2025-12-24 17:50:03 +08:00
parent ef6fca5317
commit 6338edda11
7 changed files with 59 additions and 10 deletions

View File

@@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
tmp_path = os.path.abspath(tmpf.name) tmp_path = os.path.abspath(tmpf.name)
callback(0.1, "USE Sequence2Txt LLM to transcription the audio") callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
ans = seq2txt_mdl.transcription(tmp_path) ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path)
callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32]) callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
tokenize(doc, ans, eng) tokenize(doc, ans, eng)

View File

@@ -13,6 +13,7 @@ from PIL import Image
import copy import copy
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
from app.core.rag.common.file_utils import get_project_base_directory from app.core.rag.common.file_utils import get_project_base_directory
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
@@ -809,13 +810,31 @@ if __name__ == "__main__":
# chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) # chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
# Prepare to configure vision_model information # Prepare to configure vision_model information
vision_model = QWenCV( # 文字+图片
key="sk-8e9e40cd171749858ce2d3722ea75669", # vision_model = QWenCV(
model_name="qwen-vl-max", # key="",
# model_name="qwen-vl-max",
# lang="chinese", # 默认使用中文
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
# )
# 音频
vision_model = QWenSeq2txt(
key="",
model_name="qwen3-omni-flash",
lang="chinese", # 默认使用中文 lang="chinese", # 默认使用中文
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
) )
# 视频
# vision_model = QWenCV(
# key="",
# model_name="qwen3-omni-flash",
# lang="chinese", # 默认使用中文
# base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
# )
def progress_callback(prog=None, msg=None): def progress_callback(prog=None, msg=None):
print(f"prog: {prog} msg: {msg}\n") print(f"prog: {prog} msg: {msg}\n")

View File

@@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS): if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try: try:
doc.update({"doc_type_kwd": "video"}) doc.update({"doc_type_kwd": "video"})
ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32]) callback(0.8, "CV LLM respond: %s ..." % ans[:32])
ans += "\n" + ans
tokenize(doc, ans, eng) tokenize(doc, ans, eng)
return [doc] return [doc]
except Exception as e: except Exception as e:

View File

@@ -252,7 +252,7 @@ class QWenCV(GptV4):
"fps": 2, "fps": 2,
}, },
{ {
"text": "Please summarize this video in proper sentences.", "text": "视频的内容是什么?,并且,请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.",
}, },
], ],
} }

View File

@@ -46,11 +46,12 @@ class GPTSeq2txt(Base):
class QWenSeq2txt(Base): class QWenSeq2txt(Base):
_FACTORY_NAME = "Tongyi-Qianwen" _FACTORY_NAME = "Tongyi-Qianwen"
def __init__(self, key, model_name="qwen-audio-asr", **kwargs): def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs):
import dashscope import dashscope
dashscope.api_key = key dashscope.api_key = key
self.model_name = model_name self.model_name = model_name
self.lang = lang
def transcription(self, audio_path): def transcription(self, audio_path):
if "paraformer" in self.model_name or "sensevoice" in self.model_name: if "paraformer" in self.model_name or "sensevoice" in self.model_name:
@@ -62,14 +63,21 @@ class QWenSeq2txt(Base):
messages = [ messages = [
{ {
"role": "user", "role": "user",
"content": [{"audio": audio_path}], "content": [
{
"audio": audio_path
},
{
"text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?",
},
],
} }
] ]
response = None response = None
full_content = "" full_content = ""
try: try:
response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True) response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True)
for response in response: for response in response:
try: try:
full_content += response["output"]["choices"][0]["message"].content[0]["text"] full_content += response["output"]["choices"][0]["message"].content[0]["text"]

View File

@@ -6,6 +6,7 @@ import uuid
from datetime import datetime, timezone from datetime import datetime, timezone
from math import ceil from math import ceil
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import re
import redis import redis
import requests import requests
@@ -16,6 +17,7 @@ from app.core.config import settings
from app.core.rag.graphrag.utils import get_llm_cache, set_llm_cache from app.core.rag.graphrag.utils import get_llm_cache, set_llm_cache
from app.core.rag.llm.chat_model import Base from app.core.rag.llm.chat_model import Base
from app.core.rag.llm.cv_model import QWenCV from app.core.rag.llm.cv_model import QWenCV
from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
from app.core.rag.models.chunk import DocumentChunk from app.core.rag.models.chunk import DocumentChunk
from app.core.rag.prompts.generator import question_proposal from app.core.rag.prompts.generator import question_proposal
from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ( from app.core.rag.vdb.elasticsearch.elasticsearch_vector import (
@@ -83,6 +85,22 @@ def parse_document(file_path: str, document_id: uuid.UUID):
lang="Chinese", lang="Chinese",
base_url=db_knowledge.image2text.api_keys[0].api_base base_url=db_knowledge.image2text.api_keys[0].api_base
) )
if re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", file_path, re.IGNORECASE):
vision_model = QWenSeq2txt(
key=os.getenv("QWEN3_OMNI_API_KEY", ""),
model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
lang="Chinese",
base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
)
elif re.search(r"\.(png|jpeg|jpg|gif|bmp|svg|mp4|mov|avi|flv|mpeg|mpg|webm|wmv|3gp|3gpp|mkv?)$", file_path, re.IGNORECASE):
vision_model = QWenCV(
key=os.getenv("QWEN3_OMNI_API_KEY", ""),
model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
lang="Chinese",
base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
)
else:
print(file_path)
from app.core.rag.app.naive import chunk from app.core.rag.app.naive import chunk
res = chunk(filename=file_path, res = chunk(filename=file_path,
from_page=0, from_page=0,

View File

@@ -83,6 +83,11 @@ TEXTLN_APISERVER=https://api.textin.com/ai/service/v1/pdf_to_markdown
TEXTLN_APP_ID= TEXTLN_APP_ID=
TEXTLN_SECRET_CODE= TEXTLN_SECRET_CODE=
# Qwen3-Omni multimodal model (used for image, video and audio parsing)
QWEN3_OMNI_API_KEY=
QWEN3_OMNI_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
QWEN3_OMNI_MODEL_NAME=qwen3-omni-flash
# VOLC ASR # VOLC ASR
VOLC_APP_KEY= VOLC_APP_KEY=
VOLC_ACCESS_KEY= VOLC_ACCESS_KEY=