From 6338edda11e951e259b441a08da3424097e295a3 Mon Sep 17 00:00:00 2001 From: lixiangcheng1 Date: Wed, 24 Dec 2025 17:50:03 +0800 Subject: [PATCH] [ADD]Support parsing of unstructured data MP3, MP4, etc --- api/app/core/rag/app/audio.py | 2 +- api/app/core/rag/app/naive.py | 25 +++++++++++++++++++--- api/app/core/rag/app/picture.py | 3 +-- api/app/core/rag/llm/cv_model.py | 2 +- api/app/core/rag/llm/sequence2txt_model.py | 14 +++++++++--- api/app/tasks.py | 18 ++++++++++++++++ api/env.example | 5 +++++ 7 files changed, 59 insertions(+), 10 deletions(-) diff --git a/api/app/core/rag/app/audio.py b/api/app/core/rag/app/audio.py index 1bddc048..6ce422e5 100644 --- a/api/app/core/rag/app/audio.py +++ b/api/app/core/rag/app/audio.py @@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs): tmp_path = os.path.abspath(tmpf.name) callback(0.1, "USE Sequence2Txt LLM to transcription the audio") - ans = seq2txt_mdl.transcription(tmp_path) + ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path) callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32]) tokenize(doc, ans, eng) diff --git a/api/app/core/rag/app/naive.py b/api/app/core/rag/app/naive.py index 5430e38f..6d6b933a 100644 --- a/api/app/core/rag/app/naive.py +++ b/api/app/core/rag/app/naive.py @@ -13,6 +13,7 @@ from PIL import Image import copy from app.core.rag.llm.cv_model import AzureGptV4, QWenCV +from app.core.rag.llm.sequence2txt_model import QWenSeq2txt from app.core.rag.common.file_utils import get_project_base_directory from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf @@ -809,13 +810,31 @@ if __name__ == "__main__": # chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) # Prepare to configure vision_model information - vision_model = QWenCV( - key="sk-8e9e40cd171749858ce2d3722ea75669", - model_name="qwen-vl-max", + # 文字+图片 + # vision_model = QWenCV( + # key="", + # model_name="qwen-vl-max", + # lang="chinese", # 默认使用中文 + # base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + # ) + + # 音频 + vision_model = QWenSeq2txt( + key="", + model_name="qwen3-omni-flash", lang="chinese", # 默认使用中文 base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" ) + + # 视频 + # vision_model = QWenCV( + # key="", + # model_name="qwen3-omni-flash", + # lang="chinese", # 默认使用中文 + # base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + # ) + def progress_callback(prog=None, msg=None): print(f"prog: {prog} msg: {msg}\n") diff --git a/api/app/core/rag/app/picture.py b/api/app/core/rag/app/picture.py index addc7d9b..da133c27 100644 --- a/api/app/core/rag/app/picture.py +++ b/api/app/core/rag/app/picture.py @@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs): if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS): try: doc.update({"doc_type_kwd": "video"}) - ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) + ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) callback(0.8, "CV LLM respond: %s ..." % ans[:32]) - ans += "\n" + ans tokenize(doc, ans, eng) return [doc] except Exception as e: diff --git a/api/app/core/rag/llm/cv_model.py b/api/app/core/rag/llm/cv_model.py index 663272ce..5f841433 100644 --- a/api/app/core/rag/llm/cv_model.py +++ b/api/app/core/rag/llm/cv_model.py @@ -252,7 +252,7 @@ class QWenCV(GptV4): "fps": 2, }, { - "text": "Please summarize this video in proper sentences.", + "text": "视频的内容是什么?,并且,请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.", }, ], } diff --git a/api/app/core/rag/llm/sequence2txt_model.py b/api/app/core/rag/llm/sequence2txt_model.py index dcea9346..be4d3649 100644 --- a/api/app/core/rag/llm/sequence2txt_model.py +++ b/api/app/core/rag/llm/sequence2txt_model.py @@ -46,11 +46,12 @@ class GPTSeq2txt(Base): class QWenSeq2txt(Base): _FACTORY_NAME = "Tongyi-Qianwen" - def __init__(self, key, model_name="qwen-audio-asr", **kwargs): + def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs): import dashscope dashscope.api_key = key self.model_name = model_name + self.lang = lang def transcription(self, audio_path): if "paraformer" in self.model_name or "sensevoice" in self.model_name: @@ -62,14 +63,21 @@ class QWenSeq2txt(Base): messages = [ { "role": "user", - "content": [{"audio": audio_path}], + "content": [ + { + "audio": audio_path + }, + { + "text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?", + }, + ], } ] response = None full_content = "" try: - response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True) + response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True) for response in response: try: full_content += response["output"]["choices"][0]["message"].content[0]["text"] diff --git a/api/app/tasks.py b/api/app/tasks.py index 907fae00..00bc7237 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -6,6 +6,7 @@ import uuid from datetime import datetime, timezone from math import ceil from typing import Any, Dict, List, Optional +import re import redis import requests @@ -16,6 +17,7 @@ from app.core.config import settings from app.core.rag.graphrag.utils import get_llm_cache, set_llm_cache from app.core.rag.llm.chat_model import Base from app.core.rag.llm.cv_model import QWenCV +from app.core.rag.llm.sequence2txt_model import QWenSeq2txt from app.core.rag.models.chunk import DocumentChunk from app.core.rag.prompts.generator import question_proposal from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ( @@ -83,6 +85,22 @@ def parse_document(file_path: str, document_id: uuid.UUID): lang="Chinese", base_url=db_knowledge.image2text.api_keys[0].api_base ) + if re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", file_path, re.IGNORECASE): + vision_model = QWenSeq2txt( + key=os.getenv("QWEN3_OMNI_API_KEY", ""), + model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"), + lang="Chinese", + base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"), + ) + elif re.search(r"\.(png|jpeg|jpg|gif|bmp|svg|mp4|mov|avi|flv|mpeg|mpg|webm|wmv|3gp|3gpp|mkv?)$", file_path, re.IGNORECASE): + vision_model = QWenCV( + key=os.getenv("QWEN3_OMNI_API_KEY", ""), + model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"), + lang="Chinese", + base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"), + ) + else: + print(file_path) from app.core.rag.app.naive import chunk res = chunk(filename=file_path, from_page=0, diff --git a/api/env.example b/api/env.example index 8ceb3934..371a0561 100644 --- a/api/env.example +++ b/api/env.example @@ -83,6 +83,11 @@ TEXTLN_APISERVER=https://api.textin.com/ai/service/v1/pdf_to_markdown TEXTLN_APP_ID= TEXTLN_SECRET_CODE= +# vision model +QWEN3_OMNI_API_KEY= +QWEN3_OMNI_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 +QWEN3_OMNI_MODEL_NAME=qwen3-omni-flash + # VOLC ASR VOLC_APP_KEY= VOLC_ACCESS_KEY=