[ADD] Support parsing of unstructured data (MP3, MP4, etc.)

2025-12-24 17:50:03 +08:00
parent ef6fca5317
commit 6338edda11
7 changed files with 59 additions and 10 deletions
--- a/api/app/core/rag/app/audio.py
+++ b/api/app/core/rag/app/audio.py
@@ -26,7 +26,7 @@ def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
            tmp_path = os.path.abspath(tmpf.name)

        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
-        ans = seq2txt_mdl.transcription(tmp_path)
+        ans, ans_num_tokens = seq2txt_mdl.transcription(tmp_path)
        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])

        tokenize(doc, ans, eng)
--- a/api/app/core/rag/app/naive.py
+++ b/api/app/core/rag/app/naive.py
@@ -13,6 +13,7 @@ from PIL import Image
 import copy

 from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
+from app.core.rag.llm.sequence2txt_model import QWenSeq2txt
 from app.core.rag.common.file_utils import get_project_base_directory
 from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
 from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
@@ -809,13 +810,31 @@ if __name__ == "__main__":
    # chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

    # Prepare to configure vision_model information
-    vision_model = QWenCV(
-        key="sk-8e9e40cd171749858ce2d3722ea75669",
-        model_name="qwen-vl-max",
+    # Text + images
+    # vision_model = QWenCV(
+    #     key="",
+    #     model_name="qwen-vl-max",
+    #     lang="chinese",  # 默认使用中文
+    #     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+    # )
+
+    # Audio (a QWenSeq2txt speech-to-text model, still bound to the name vision_model)
+    vision_model = QWenSeq2txt(
+        key="",
+        model_name="qwen3-omni-flash",
        lang="chinese",  # 默认使用中文
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

+
+    # Video
+    # vision_model = QWenCV(
+    #     key="",
+    #     model_name="qwen3-omni-flash",
+    #     lang="chinese",  # 默认使用中文
+    #     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+    # )
+
    def progress_callback(prog=None, msg=None):
        print(f"prog: {prog} msg: {msg}\n")

--- a/api/app/core/rag/app/picture.py
+++ b/api/app/core/rag/app/picture.py
@@ -24,9 +24,8 @@ def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
    if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
        try:
            doc.update({"doc_type_kwd": "video"})
-            ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
+            ans, ans_num_tokens = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
-            ans += "\n" + ans
            tokenize(doc, ans, eng)
            return [doc]
        except Exception as e: