From 6338edda11e951e259b441a08da3424097e295a3 Mon Sep 17 00:00:00 2001
From: lixiangcheng1 <lixiangcheng1@wanda.cn>
Date: Wed, 24 Dec 2025 17:50:03 +0800
Subject: [PATCH] [ADD] Support parsing of unstructured data (MP3, MP4, etc.)

---
 api/app/core/rag/app/audio.py              |  2 +-
 api/app/core/rag/app/naive.py              | 25 +++++++++++++++++++---
 api/app/core/rag/app/picture.py            |  3 +--
 api/app/core/rag/llm/cv_model.py           |  2 +-
 api/app/core/rag/llm/sequence2txt_model.py | 14 +++++++++---
 api/app/tasks.py                           | 18 ++++++++++++++++
 api/env.example                            |  5 +++++
 7 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/api/app/core/rag/app/audio.py b/api/app/core/rag/app/audio.py
index 1bddc048..6ce422e5 100644
--- a/api/app/core/rag/app/audio.py
+++ b/api/app/core/rag/app/audio.py
@@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
             tmp_path = os.path.abspath(tmpf.name)
 
         callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
-        ans = seq2txt_mdl.transcription(tmp_path)
+        ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path)
         callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
 
         tokenize(doc, ans, eng)
diff --git a/api/app/core/rag/app/naive.py b/api/app/core/rag/app/naive.py
index 5430e38f..6d6b933a 100644
--- a/api/app/core/rag/app/naive.py
+++ b/api/app/core/rag/app/naive.py
@@ -13,6 +13,7 @@ from PIL import Image
 import copy
 
 from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
+from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
 from app.core.rag.common.file_utils import get_project_base_directory
 from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
 from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
@@ -809,13 +810,31 @@ if __name__ == "__main__":
     # chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
 
     # Prepare to configure vision_model information
-    vision_model = QWenCV(
-        key="sk-8e9e40cd171749858ce2d3722ea75669",
-        model_name="qwen-vl-max",
+    # Text + image
+    # vision_model = QWenCV(
+    #     key="",
+    #     model_name="qwen-vl-max",
+    #     lang="chinese",  # defaults to Chinese
+    #     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+    # )
+
+    # Audio
+    vision_model = QWenSeq2txt(
+        key="",
+        model_name="qwen3-omni-flash",
         lang="chinese",  # 默认使用中文
         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
     )
 
+
+    # Video
+    # vision_model = QWenCV(
+    #     key="",
+    #     model_name="qwen3-omni-flash",
+    #     lang="chinese",  # defaults to Chinese
+    #     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+    # )
+
     def progress_callback(prog=None, msg=None):
         print(f"prog: {prog} msg: {msg}\n")
 
diff --git a/api/app/core/rag/app/picture.py b/api/app/core/rag/app/picture.py
index addc7d9b..da133c27 100644
--- a/api/app/core/rag/app/picture.py
+++ b/api/app/core/rag/app/picture.py
@@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
     if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
         try:
             doc.update({"doc_type_kwd": "video"})
-            ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
+            ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
             callback(0.8, "CV LLM respond: %s ..." % ans[:32])
-            ans += "\n" + ans
             tokenize(doc, ans, eng)
             return [doc]
         except Exception as e:
diff --git a/api/app/core/rag/llm/cv_model.py b/api/app/core/rag/llm/cv_model.py
index 663272ce..5f841433 100644
--- a/api/app/core/rag/llm/cv_model.py
+++ b/api/app/core/rag/llm/cv_model.py
@@ -252,7 +252,7 @@ class QWenCV(GptV4):
                             "fps": 2,
                         },
                         {
-                            "text": "Please summarize this video in proper sentences.",
+                            "text": "视频的内容是什么?,并且，请用恰当的句子总结这个视频。" if self.lang.lower() == "chinese" else "What is the content of the video? And please summarize this video in proper sentences.",
                         },
                     ],
                 }
diff --git a/api/app/core/rag/llm/sequence2txt_model.py b/api/app/core/rag/llm/sequence2txt_model.py
index dcea9346..be4d3649 100644
--- a/api/app/core/rag/llm/sequence2txt_model.py
+++ b/api/app/core/rag/llm/sequence2txt_model.py
@@ -46,11 +46,12 @@ class GPTSeq2txt(Base):
 class QWenSeq2txt(Base):
     _FACTORY_NAME = "Tongyi-Qianwen"
 
-    def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
+    def __init__(self, key, model_name="qwen-audio-asr", lang="Chinese", **kwargs):
         import dashscope
 
         dashscope.api_key = key
         self.model_name = model_name
+        self.lang = lang
 
     def transcription(self, audio_path):
         if "paraformer" in self.model_name or "sensevoice" in self.model_name:
@@ -62,14 +63,21 @@ class QWenSeq2txt(Base):
         messages = [
             {
                 "role": "user",
-                "content": [{"audio": audio_path}],
+                "content": [
+                    {
+                        "audio": audio_path
+                    },
+                    {
+                        "text": "这段音频在说什么?" if self.lang.lower() == "chinese" else "What is this audio saying?",
+                    },
+                ],
             }
         ]
 
         response = None
         full_content = ""
         try:
-            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
+            response = MultiModalConversation.call(model=self.model_name, messages=messages, result_format="message", stream=True)
             for response in response:
                 try:
                     full_content += response["output"]["choices"][0]["message"].content[0]["text"]
diff --git a/api/app/tasks.py b/api/app/tasks.py
index 907fae00..00bc7237 100644
--- a/api/app/tasks.py
+++ b/api/app/tasks.py
@@ -6,6 +6,7 @@ import uuid
 from datetime import datetime, timezone
 from math import ceil
 from typing import Any, Dict, List, Optional
+import re
 
 import redis
 import requests
@@ -16,6 +17,7 @@ from app.core.config import settings
 from app.core.rag.graphrag.utils import get_llm_cache, set_llm_cache
 from app.core.rag.llm.chat_model import Base
 from app.core.rag.llm.cv_model import QWenCV
+from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
 from app.core.rag.models.chunk import DocumentChunk
 from app.core.rag.prompts.generator import question_proposal
 from app.core.rag.vdb.elasticsearch.elasticsearch_vector import (
@@ -83,6 +85,22 @@ def parse_document(file_path: str, document_id: uuid.UUID):
                 lang="Chinese",
                 base_url=db_knowledge.image2text.api_keys[0].api_base
             )
+            if re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", file_path, re.IGNORECASE):
+                vision_model = QWenSeq2txt(
+                    key=os.getenv("QWEN3_OMNI_API_KEY", ""),
+                    model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
+                    lang="Chinese",
+                    base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
+                )
+            elif re.search(r"\.(png|jpeg|jpg|gif|bmp|svg|mp4|mov|avi|flv|mpeg|mpg|webm|wmv|3gp|3gpp|mkv?)$", file_path, re.IGNORECASE):
+                vision_model = QWenCV(
+                    key=os.getenv("QWEN3_OMNI_API_KEY", ""),
+                    model_name=os.getenv("QWEN3_OMNI_MODEL_NAME", "qwen3-omni-flash"),
+                    lang="Chinese",
+                    base_url=os.getenv("QWEN3_OMNI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
+                )
+            else:
+                print(file_path)
             from app.core.rag.app.naive import chunk
             res = chunk(filename=file_path,
                         from_page=0,
diff --git a/api/env.example b/api/env.example
index 8ceb3934..371a0561 100644
--- a/api/env.example
+++ b/api/env.example
@@ -83,6 +83,11 @@ TEXTLN_APISERVER=https://api.textin.com/ai/service/v1/pdf_to_markdown
 TEXTLN_APP_ID=
 TEXTLN_SECRET_CODE=
 
+# Qwen3-Omni multimodal model (used for audio, image and video parsing)
+QWEN3_OMNI_API_KEY=
+QWEN3_OMNI_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+QWEN3_OMNI_MODEL_NAME=qwen3-omni-flash
+
 # VOLC ASR
 VOLC_APP_KEY=
 VOLC_ACCESS_KEY=