[MODIFY] Code optimization

This commit is contained in:
Mark
2025-12-15 14:09:43 +08:00
parent d2a630addb
commit a4e276ab27
157 changed files with 15976 additions and 3601 deletions

View File

@@ -15,13 +15,15 @@ import copy
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
from app.core.rag.common.file_utils import get_project_base_directory
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
from app.core.rag.deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
from app.core.rag.deepdoc.parser.pdf_parser import PlainParser, VisionParser
from app.core.rag.deepdoc.parser.mineru_parser import MinerUParser
from app.core.rag.app.textin_parser import TextLnParser
from app.core.rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, tokenize, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
@@ -39,7 +41,7 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
@@ -59,23 +61,19 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
return sections, tables, pdf_parser
def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
    """Parse a document with the TextLn (TextIn) PDF-to-Markdown service.

    Configuration is read from the environment:
      * TEXTLN_APISERVER   - service endpoint (defaults to the public TextIn URL)
      * TEXTLN_APP_ID      - application id sent with every request
      * TEXTLN_SECRET_CODE - secret code sent with every request

    Args:
        filename: Path of the document to parse.
        binary: Optional raw content of the document.
        from_page, to_page, pdf_cls: Accepted for signature parity with the
            other ``by_*`` parsers; not used here.
        lang: Language hint forwarded to the parser.
        callback: Optional progress callback, called as ``callback(prog, msg)``.
        vision_model: Optional vision model forwarded to the parser.
        **kwargs: Forwarded unchanged to ``TextLnParser.parse_pdf``.

    Returns:
        ``(sections, tables, pdf_parser)``. ``sections`` and ``tables`` are
        ``None`` when the credentials are missing or parsing failed.
    """
    textln_api = os.environ.get("TEXTLN_APISERVER", "https://api.textin.com/ai/service/v1/pdf_to_markdown")
    # SECURITY: credentials must come from the environment only. Real-looking
    # keys were previously committed here as fallback defaults; they must never
    # live in source control and should be rotated.
    app_id = os.environ.get("TEXTLN_APP_ID", "")
    secret_code = os.environ.get("TEXTLN_SECRET_CODE", "")
    pdf_parser = TextLnParser(textln_api=textln_api, app_id=app_id, secret_code=secret_code)
    if not app_id or not secret_code:
        # Fail fast with a clear message instead of sending an unauthenticated request.
        if callback:
            callback(-1, "TextLn credentials not configured: set TEXTLN_APP_ID and TEXTLN_SECRET_CODE.")
        return None, None, pdf_parser

    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        vision_model=vision_model,
        lang=lang,
        **kwargs
    )
    return sections, tables, pdf_parser
@@ -605,7 +603,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tables, pdf_parser = parser(
sections, tables, pdf_parser= parser(
filename=filename,
binary=binary,
from_page=from_page,
@@ -626,24 +624,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.pptx?$", filename, re.IGNORECASE):
if not binary:
with open(filename, "rb") as f:
binary = f.read()
from app.core.rag.app.presentation import Ppt
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, to_page, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["doc_type_kwd"] = "image"
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, is_english)
res.append(d)
return res
elif re.search(r"\.(pptx|ppt?)$", filename, re.IGNORECASE):
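# Approach 1: Aspose.Slides is a commercial library; its core features (slide creation, animation handling, format conversion, etc.) require a paid licence. A free licence is available on application for qualifying open-source projects, but closed-source commercial projects must purchase one.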
# if not binary:
# with open(filename, "rb") as f:
# binary = f.read()
# from app.core.rag.app.presentation import Ppt
# ppt_parser = Ppt()
# for pn, (txt, img) in enumerate(ppt_parser(
# filename if not binary else binary, from_page, to_page, callback)):
# d = copy.deepcopy(doc)
# pn += from_page
# d["image"] = img
# d["doc_type_kwd"] = "image"
# d["page_num_int"] = [pn + 1]
# d["top_int"] = [0]
# d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
# tokenize(d, txt, is_english)
# res.append(d)
# return res
# Approach 2: submit a conversion job that turns the file into a PDF
future = async_convert_to_pdf(filename)
dest_pdf_path = future.result()
# Parse the resulting PDF
return chunk(dest_pdf_path, binary=None, lang=lang, callback=callback, vision_model=vision_model, **kwargs)
elif re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", filename, re.IGNORECASE):
if not binary:
@@ -818,14 +822,14 @@ if __name__ == "__main__":
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/1.txt"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/2.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/3.md" # 带图url
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/4.doc"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/5.json"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/6.html"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/7.xlsx"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/8.pdf"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/9.pptx"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/11.mp4"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/12.mp3"
res = chunk(filename=file_path,
@@ -834,7 +838,8 @@ if __name__ == "__main__":
callback=progress_callback,
vision_model=vision_model,
parser_config={
"layout_recognize": "DeepDOC",
# "layout_recognize": "DeepDOC",
"layout_recognize": "TextLn",
"chunk_token_num": 128,
"delimiter": "\n",
"analyze_hyperlink": True,

View File

@@ -5,7 +5,8 @@ from PIL import Image
from app.core.rag.nlp import tokenize, is_english
from app.core.rag.nlp import rag_tokenizer
from app.core.rag.deepdoc.parser import PdfParser, PptParser, PlainParser
from app.core.rag.deepdoc.parser import PdfParser, PlainParser
from app.core.rag.deepdoc.parser.ppt_parser import RAGPptParser as PptParser
from PyPDF2 import PdfReader as pdf2_read
from app.core.rag.app.naive import by_plaintext, PARSERS

View File

@@ -0,0 +1,217 @@
import json
import os
import re
import sys
import threading
from io import BytesIO
from os import PathLike
from typing import Any, Callable, Optional
import numpy as np
import pdfplumber
from functools import reduce
import requests
import logging
from PIL import Image
from app.core.rag.nlp import concat_img
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
class TextLnParser:
def __init__(self, textln_api: str, app_id: str, secret_code: str):
self.textln_api = textln_api
self.app_id = app_id
self.secret_code = secret_code
def recognize(self, file_content: bytes, options: dict) -> str:
params = {}
for key, value in options.items():
params[key] = str(value)
headers = {
"x-ti-app-id": self.app_id,
"x-ti-secret-code": self.secret_code,
"Content-Type": "application/octet-stream"
}
response = requests.post(
url=self.textln_api,
params=params,
headers=headers,
data=file_content
)
response.raise_for_status()
return response.text
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from
self.page_to = page_to
try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e:
self.page_images = None
logging.exception(e)
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
vision_model=None,
lang: Optional[str] = None,
**kwargs
):
try:
callback(0.15, "USE [Textln] to recognize the file")
self.__images__(filepath, zoomin=1)
base_name, ext = os.path.splitext(filepath)
if not os.path.exists(f"{base_name}_result.md"):
with open(filepath, "rb") as f:
file_content = f.read()
options = dict(
dpi=144,
get_image="objects",
markdown_details=1,
page_count=1000, # 当上传的是pdf时表示要进行解析的pdf页数。总页数不得超过1000页默认为1000页
parse_mode="auto",
table_flavor="md"
)
response = self.recognize(file_content, options)
# 保存完整的JSON响应到result.json文件
with open(f"{base_name}_result.json", "w", encoding="utf-8") as f:
f.write(response)
# 解析JSON响应以提取markdown内容
json_response = json.loads(response)
if "result" in json_response and "markdown" in json_response["result"]:
markdown_content = json_response["result"]["markdown"]
with open(f"{base_name}_result.md", "w", encoding="utf-8") as f:
f.write(markdown_content)
else:
callback(prog=-1, msg=json_response["message"])
return None, None, None
callback(0.75, f"[Textln] respond md: {base_name}_result.md")
from app.core.rag.app.naive import Markdown
parser_config = kwargs.get(
"parser_config", {
"layout_recognize": "TextLn", "chunk_token_num": 512, "delimiter": "\n!?。;!?",
"analyze_hyperlink": True})
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(f"{base_name}_result.md", binary, separate_tables=False,
delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
return sections, tables
# # Process images for each section
# section_images = []
# if vision_model:
# for idx, (section_text, _) in enumerate(sections):
# images = markdown_parser.get_pictures(section_text) if section_text else None
#
# if images:
# # If multiple images found, combine them using concat_img
# combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
# section_images.append(combined_image)
# markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
# ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
# boosted_figures = markdown_vision_parser(callback=callback)
# sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1][0] for fig in boosted_figures]),
# sections[idx][1])
# else:
# section_images.append(None)
#
# else:
# logging.warning("No visual model detected. Skipping figure parsing enhancement.")
# return sections, tables, section_images
except Exception as e:
logging.warning(f"Error: {e}")
callback(prog=-1, msg=str(e))
return None, None
@staticmethod
def extract_positions(txt: str):
poss = []
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss
def crop(self, text, ZM=1, need_position=False):
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if bottom <= top:
bottom = top + 2
for pn in pns[1:]:
bottom += self.page_images[pn - 1].size[1]
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
crop0 = img0.crop((x0, y0, x1, y1))
imgs.append(crop0)
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
bottom -= img0.size[1]
for pn in pns[1:]:
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))
imgs.append(cimgp)
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]
if not imgs:
if need_position:
return None, None
return
height = 0
for img in imgs:
height += img.size[1] + GAP
height = int(height)
width = int(np.max([i.size[0] for i in imgs]))
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
img = Image.alpha_composite(img, overlay).convert("RGB")
pic.paste(img, (0, int(height)))
height += img.size[1] + GAP
if need_position:
return pic, positions
return pic
@staticmethod
def remove_tag(txt):
return re.sub(r"@@[\t0-9.-]+?##", "", txt)