[MODIFY] Code optimization

2025-12-15 14:09:43 +08:00
parent d2a630addb
commit a4e276ab27
157 changed files with 15976 additions and 3601 deletions
--- a/api/app/core/rag/app/naive.py
+++ b/api/app/core/rag/app/naive.py
@@ -15,13 +15,15 @@ import copy
 from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
 from app.core.rag.common.file_utils import get_project_base_directory
 from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
+from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
 from app.core.rag.deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
 from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
 from app.core.rag.deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from app.core.rag.deepdoc.parser.mineru_parser import MinerUParser
+from app.core.rag.app.textin_parser import TextLnParser
 from app.core.rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, tokenize, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table

-def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
+def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
    callback = callback
    binary = binary
    pdf_parser = pdf_cls() if pdf_cls else Pdf()
@@ -39,7 +41,7 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
    return sections, tables, pdf_parser


-def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
+def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
    mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
    mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
    pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
@@ -59,23 +61,19 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
    return sections, tables, pdf_parser


-def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
-    textln_app_id = os.environ.get("TEXTLN_APP_ID", "")
-    textln_secret_code = os.environ.get("TEXTLN_SECRET_CODE", "")
+def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
+    """Parse a document with TextLnParser, configured from environment variables.
+
+    Reads TEXTLN_APISERVER (endpoint URL), TEXTLN_APP_ID and TEXTLN_SECRET_CODE.
+    `from_page`, `to_page` and `pdf_cls` are accepted for signature parity with
+    the other `by_*` parsers but are not forwarded to the parser.
+
+    Returns:
+        (sections, tables, pdf_parser); sections and tables are None when the
+        credentials are not configured or when parsing fails.
+    """
    textln_api = os.environ.get("TEXTLN_APISERVER", "https://api.textin.com/ai/service/v1/pdf_to_markdown")
-    pdf_parser = MinerUParser(mineru_path=textln_app_id, mineru_api=textln_api)
-
-    if not pdf_parser.check_installation():
-        callback(-1, "MinerU not found.")
-        return None, None, pdf_parser
+    # Credentials must come from the environment: real keys must never be
+    # committed as fallback values (keys that were committed should be rotated).
+    app_id = os.environ.get("TEXTLN_APP_ID", "")
+    secret_code = os.environ.get("TEXTLN_SECRET_CODE", "")
+    pdf_parser = TextLnParser(textln_api=textln_api, app_id=app_id, secret_code=secret_code)
+
+    if not app_id or not secret_code:
+        # callback defaults to None, so only report when one was supplied.
+        if callback:
+            callback(-1, "TextLn credentials not configured: set TEXTLN_APP_ID and TEXTLN_SECRET_CODE.")
+        return None, None, pdf_parser
 
    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
-        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
-        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
-        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+        vision_model=vision_model,
+        lang=lang,
+        **kwargs
    )
    return sections, tables, pdf_parser

@@ -605,7 +603,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        parser = PARSERS.get(name, by_plaintext)
        callback(0.1, "Start to parse.")

-        sections, tables, pdf_parser = parser(
+        sections, tables, pdf_parser= parser(
            filename=filename,
            binary=binary,
            from_page=from_page,
@@ -626,24 +624,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

-    elif re.search(r"\.pptx?$", filename, re.IGNORECASE):
-        if not binary:
-            with open(filename, "rb") as f:
-                binary = f.read()
-        from app.core.rag.app.presentation import Ppt
-        ppt_parser = Ppt()
-        for pn, (txt, img) in enumerate(ppt_parser(
-                filename if not binary else binary, from_page, to_page, callback)):
-            d = copy.deepcopy(doc)
-            pn += from_page
-            d["image"] = img
-            d["doc_type_kwd"] = "image"
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
-            tokenize(d, txt, is_english)
-            res.append(d)
-        return res
+    elif re.search(r"\.(pptx|ppt?)$", filename, re.IGNORECASE):
+        # 方法1.Aspose.Slides是商业级库，其核心功能（如幻灯片创建、动画处理、格式转换等）需通过付费许可证使用。尽管它为符合条件的开源项目提供免费许可证（需申请），但商业闭源项目必须购买授权
+        # if not binary:
+        #     with open(filename, "rb") as f:
+        #         binary = f.read()
+        # from app.core.rag.app.presentation import Ppt
+        # ppt_parser = Ppt()
+        # for pn, (txt, img) in enumerate(ppt_parser(
+        #         filename if not binary else binary, from_page, to_page, callback)):
+        #     d = copy.deepcopy(doc)
+        #     pn += from_page
+        #     d["image"] = img
+        #     d["doc_type_kwd"] = "image"
+        #     d["page_num_int"] = [pn + 1]
+        #     d["top_int"] = [0]
+        #     d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+        #     tokenize(d, txt, is_english)
+        #     res.append(d)
+        # return res
+        # 方法2.提交任务-文件转换为pdf
+        future = async_convert_to_pdf(filename)
+        dest_pdf_path = future.result()
+        # 解析pdf
+        return chunk(dest_pdf_path, binary=None, lang=lang, callback=callback, vision_model=vision_model, **kwargs)

    elif re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", filename, re.IGNORECASE):
        if not binary:
@@ -818,14 +822,14 @@ if __name__ == "__main__":
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/1.txt"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/2.md"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/3.md" # 带图url
-    file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
+    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/4.doc"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/5.json"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/6.html"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/7.xlsx"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/8.pdf"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/9.pptx"
-    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
+    file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/11.mp4"
    # file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/12.mp3"
    res = chunk(filename=file_path,
@@ -834,7 +838,8 @@ if __name__ == "__main__":
                callback=progress_callback,
                vision_model=vision_model,
                parser_config={
-                    "layout_recognize": "DeepDOC",
+                    # "layout_recognize": "DeepDOC",
+                    "layout_recognize": "TextLn",
                    "chunk_token_num": 128,
                    "delimiter": "\n",
                    "analyze_hyperlink": True,
--- a/api/app/core/rag/app/presentation.py
+++ b/api/app/core/rag/app/presentation.py
@@ -5,7 +5,8 @@ from PIL import Image

 from app.core.rag.nlp import tokenize, is_english
 from app.core.rag.nlp import rag_tokenizer
-from app.core.rag.deepdoc.parser import PdfParser, PptParser, PlainParser
+from app.core.rag.deepdoc.parser import PdfParser, PlainParser
+from app.core.rag.deepdoc.parser.ppt_parser import RAGPptParser as PptParser
 from PyPDF2 import PdfReader as pdf2_read
 from app.core.rag.app.naive import by_plaintext, PARSERS

--- a/api/app/core/rag/app/textin_parser.py
+++ b/api/app/core/rag/app/textin_parser.py
@@ -0,0 +1,217 @@
+import json
+import os
+import re
+import sys
+import threading
+from io import BytesIO
+from os import PathLike
+from typing import Any, Callable, Optional
+import numpy as np
+import pdfplumber
+from functools import reduce
+import requests
+import logging
+from PIL import Image
+
+from app.core.rag.nlp import concat_img
+from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser
+
+# Key under which a shared pdfplumber lock is published in sys.modules.
+LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
+# setdefault keeps a lock that is already registered and stores a new one
+# only when the key is absent.
+sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
+
+
+class TextLnParser:
+    """Client for an HTTP document-to-markdown recognition service.
+
+    `parse_pdf` uploads a document, stores the raw JSON response and the
+    extracted markdown next to the source file, and chunks that markdown with
+    the project's `Markdown` parser. `__images__` renders page images that
+    `crop` uses to cut out regions referenced by position tags.
+    """
+
+    # (connect, read) timeout in seconds for the recognition request. requests
+    # waits indefinitely when no timeout is given, which would hang the caller.
+    REQUEST_TIMEOUT = (10, 600)
+
+    def __init__(self, textln_api: str, app_id: str, secret_code: str):
+        """Store the endpoint URL and the credentials sent with every request."""
+        self.textln_api = textln_api
+        self.app_id = app_id
+        self.secret_code = secret_code
+        # Filled in by __images__; initialised so crop() can tell that no page
+        # images are available yet.
+        self.page_images = None
+        self.page_from = 0
+
+    def recognize(self, file_content: bytes, options: dict) -> str:
+        """POST the document bytes to the service and return the response body.
+
+        Args:
+            file_content: Raw bytes of the document, sent as the request body.
+            options: Query parameters; every value is sent as `str(value)`.
+
+        Returns:
+            The response body as text.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+            requests.Timeout: If the service exceeds `REQUEST_TIMEOUT`.
+        """
+        params = {key: str(value) for key, value in options.items()}
+
+        headers = {
+            "x-ti-app-id": self.app_id,
+            "x-ti-secret-code": self.secret_code,
+            "Content-Type": "application/octet-stream"
+        }
+
+        response = requests.post(
+            url=self.textln_api,
+            params=params,
+            headers=headers,
+            data=file_content,
+            timeout=self.REQUEST_TIMEOUT
+        )
+
+        response.raise_for_status()
+        return response.text
+
+    def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
+        """Render pages `page_from:page_to` of a PDF into `self.page_images`.
+
+        Args:
+            fnm: Path to the PDF, or its raw bytes.
+            zoomin: Scale factor; pages are rendered with resolution `72 * zoomin`.
+            page_from: Index of the first page to render.
+            page_to: Index one past the last page to render.
+            callback: Unused.
+
+        Any failure is logged and leaves `self.page_images` set to None.
+        """
+        self.page_from = page_from
+        self.page_to = page_to
+        try:
+            source = fnm if isinstance(fnm, (str, PathLike)) else BytesIO(fnm)
+            with pdfplumber.open(source) as pdf:
+                self.pdf = pdf
+                self.page_images = [
+                    page.to_image(resolution=72 * zoomin, antialias=True).original
+                    for page in pdf.pages[page_from:page_to]
+                ]
+        except Exception as e:
+            self.page_images = None
+            logging.exception(e)
+
+    def parse_pdf(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes,
+        callback: Optional[Callable] = None,
+        vision_model=None,
+        lang: Optional[str] = None,
+        **kwargs
+    ):
+        """Recognise a document through the service and chunk the markdown.
+
+        The markdown is cached as `<base>_result.md` next to `filepath`; when
+        that file already exists the service is not called again.
+
+        Args:
+            filepath: Path of the document on disk; it is read and uploaded.
+            binary: Forwarded unchanged to the markdown parser (see note below).
+            callback: Optional progress reporter. Progress updates are passed
+                positionally; failures are reported as `prog=-1, msg=...`.
+            vision_model: Accepted but currently unused.
+            lang: Accepted but currently unused.
+            **kwargs: May contain `parser_config` with `chunk_token_num` and
+                `delimiter`.
+
+        Returns:
+            `(sections, tables)` from the markdown parser, or `(None, None)`
+            when recognition or parsing fails.
+        """
+        # callback is optional: fall back to a no-op so that neither progress
+        # reporting nor the error handler below can fail on None.
+        report = callback if callback is not None else (lambda *_args, **_kwargs: None)
+        try:
+            report(0.15, "USE [Textln] to recognize the file")
+            self.__images__(filepath, zoomin=1)
+            base_name, _ = os.path.splitext(filepath)
+            markdown_path = f"{base_name}_result.md"
+            if not os.path.exists(markdown_path):
+                with open(filepath, "rb") as f:
+                    file_content = f.read()
+                options = dict(
+                    dpi=144,
+                    get_image="objects",
+                    markdown_details=1,
+                    page_count=1000,  # for PDF uploads: number of pages to parse; at most 1000, which is also the default
+                    parse_mode="auto",
+                    table_flavor="md"
+                )
+                response = self.recognize(file_content, options)
+                # Keep the complete JSON response next to the source file.
+                with open(f"{base_name}_result.json", "w", encoding="utf-8") as f:
+                    f.write(response)
+                # Extract the markdown content from the JSON response.
+                json_response = json.loads(response)
+                result = json_response.get("result")
+                if not isinstance(result, dict) or "markdown" not in result:
+                    # Report the service's own message when it sent one.
+                    report(prog=-1, msg=json_response.get("message") or "[Textln] response contains no markdown result")
+                    # Two values, matching every other return path and the
+                    # two-name unpacking done by the caller.
+                    return None, None
+                with open(markdown_path, "w", encoding="utf-8") as f:
+                    f.write(result["markdown"])
+            report(0.75, f"[Textln] respond md: {base_name}_result.md")
+
+            # Imported lazily: app.core.rag.app.naive imports this module at load time.
+            from app.core.rag.app.naive import Markdown
+            parser_config = kwargs.get(
+                "parser_config", {
+                    "layout_recognize": "TextLn", "chunk_token_num": 512, "delimiter": "\n!?。；！？",
+                    "analyze_hyperlink": True})
+            markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+            # NOTE(review): `binary` holds the source document, not the generated
+            # markdown; confirm that Markdown ignores it in favour of the path.
+            sections, tables = markdown_parser(markdown_path, binary, separate_tables=False,
+                                               delimiter=parser_config.get("delimiter", "\n!?;。；！？"))
+            return sections, tables
+        except Exception as e:
+            # logging.exception keeps the traceback that a plain warning drops.
+            logging.exception("Error: %s", e)
+            report(prog=-1, msg=str(e))
+            return None, None
+
+    @staticmethod
+    def extract_positions(txt: str):
+        """Parse `@@<pages>\\t<left>\\t<right>\\t<top>\\t<bottom>##` tags from `txt`.
+
+        Returns:
+            A list of `(page_indexes, left, right, top, bottom)` where
+            `page_indexes` are the dash-separated page numbers minus one.
+        """
+        poss = []
+        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
+            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
+            left, right, top, bottom = float(left), float(right), float(top), float(bottom)
+            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
+        return poss
+
+    def crop(self, text, ZM=1, need_position=False):
+        """Cut the regions referenced by position tags in `text` out of the page images.
+
+        Args:
+            text: Text containing position tags (see `extract_positions`).
+            ZM: Unused.
+            need_position: When true, also return the cropped positions.
+
+        Returns:
+            The stitched image, or `(image, positions)` when `need_position` is
+            true. When `text` has no tags or no page images are available the
+            result is None, or `(None, None)` when `need_position` is true.
+        """
+        nothing = (None, None) if need_position else None
+        imgs = []
+        poss = self.extract_positions(text)
+        # Without rendered pages (rendering failed or never ran) nothing can be cropped.
+        if not poss or not self.page_images:
+            return nothing
+
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
+        GAP = 6
+        # Add a context strip of up to 120 px above the first region ...
+        pos = poss[0]
+        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        # ... and one below the last region, clamped to the page height.
+        pos = poss[-1]
+        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
+
+        positions = []
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            right = left + max_width
+
+            if bottom <= top:
+                bottom = top + 2
+
+            for pn in pns[1:]:
+                bottom += self.page_images[pn - 1].size[1]
+
+            img0 = self.page_images[pns[0]]
+            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
+            crop0 = img0.crop((x0, y0, x1, y1))
+            imgs.append(crop0)
+            # The two added context strips are not reported as positions.
+            if 0 < ii < len(poss) - 1:
+                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
+
+            bottom -= img0.size[1]
+            for pn in pns[1:]:
+                page = self.page_images[pn]
+                x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
+                cimgp = page.crop((x0, y0, x1, y1))
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn + self.page_from, x0, x1, y0, y1))
+                bottom -= page.size[1]
+
+        if not imgs:
+            return nothing
+
+        height = int(sum(img.size[1] + GAP for img in imgs))
+        width = int(np.max([i.size[0] for i in imgs]))
+        pic = Image.new("RGB", (width, height), (245, 245, 245))
+        height = 0
+        for ii, img in enumerate(imgs):
+            # Blend the first and last image with a half-transparent black overlay.
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(height)))
+            height += img.size[1] + GAP
+
+        if need_position:
+            return pic, positions
+        return pic
+
+    @staticmethod
+    def remove_tag(txt):
+        """Return `txt` with every `@@...##` position tag removed."""
+        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
+
+