[MODIFY] Code optimization

This commit is contained in:
Mark
2025-12-15 14:09:43 +08:00
parent d2a630addb
commit a4e276ab27
157 changed files with 15976 additions and 3601 deletions

View File

@@ -15,13 +15,15 @@ import copy
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
from app.core.rag.common.file_utils import get_project_base_directory
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
from app.core.rag.deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
from app.core.rag.deepdoc.parser.pdf_parser import PlainParser, VisionParser
from app.core.rag.deepdoc.parser.mineru_parser import MinerUParser
from app.core.rag.app.textin_parser import TextLnParser
from app.core.rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, tokenize, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
@@ -39,7 +41,7 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
@@ -59,23 +61,19 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
return sections, tables, pdf_parser
def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
    """Parse a document through the TextLn pdf-to-markdown service.

    Connection settings are read from the environment:
    ``TEXTLN_APISERVER`` (endpoint URL), ``TEXTLN_APP_ID`` and
    ``TEXTLN_SECRET_CODE`` (credentials).

    ``from_page``, ``to_page`` and ``pdf_cls`` are accepted so the signature
    matches the other ``by_*`` parsers; this function does not use them.

    Returns:
        ``(sections, tables, pdf_parser)``. ``sections`` and ``tables`` are
        ``None`` when the underlying parser reports a failure.
    """
    textln_api = os.environ.get("TEXTLN_APISERVER", "https://api.textin.com/ai/service/v1/pdf_to_markdown")
    # SECURITY: credentials must be supplied through the environment. Real keys
    # must never be committed as fallback defaults; an empty default simply makes
    # the service reject the request, which the parser reports via `callback`.
    app_id = os.environ.get("TEXTLN_APP_ID", "")
    secret_code = os.environ.get("TEXTLN_SECRET_CODE", "")
    pdf_parser = TextLnParser(textln_api=textln_api, app_id=app_id, secret_code=secret_code)
    parsed = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        vision_model=vision_model,
        lang=lang,
        **kwargs
    )
    # Take only the first two items so an error path that returns extra None
    # values cannot break the unpacking.
    sections, tables = parsed[0], parsed[1]
    return sections, tables, pdf_parser
@@ -605,7 +603,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tables, pdf_parser = parser(
sections, tables, pdf_parser= parser(
filename=filename,
binary=binary,
from_page=from_page,
@@ -626,24 +624,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.pptx?$", filename, re.IGNORECASE):
if not binary:
with open(filename, "rb") as f:
binary = f.read()
from app.core.rag.app.presentation import Ppt
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, to_page, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["doc_type_kwd"] = "image"
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, is_english)
res.append(d)
return res
elif re.search(r"\.(pptx|ppt?)$", filename, re.IGNORECASE):
# 方法1.Aspose.Slides是商业级库其核心功能如幻灯片创建、动画处理、格式转换等需通过付费许可证使用。尽管它为符合条件的开源项目提供免费许可证需申请但商业闭源项目必须购买授权
# if not binary:
# with open(filename, "rb") as f:
# binary = f.read()
# from app.core.rag.app.presentation import Ppt
# ppt_parser = Ppt()
# for pn, (txt, img) in enumerate(ppt_parser(
# filename if not binary else binary, from_page, to_page, callback)):
# d = copy.deepcopy(doc)
# pn += from_page
# d["image"] = img
# d["doc_type_kwd"] = "image"
# d["page_num_int"] = [pn + 1]
# d["top_int"] = [0]
# d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
# tokenize(d, txt, is_english)
# res.append(d)
# return res
# 方法2.提交任务-文件转换为pdf
future = async_convert_to_pdf(filename)
dest_pdf_path = future.result()
# 解析pdf
return chunk(dest_pdf_path, binary=None, lang=lang, callback=callback, vision_model=vision_model, **kwargs)
elif re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", filename, re.IGNORECASE):
if not binary:
@@ -818,14 +822,14 @@ if __name__ == "__main__":
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/1.txt"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/2.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/3.md" # 带图url
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/4.doc"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/5.json"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/6.html"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/7.xlsx"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/8.pdf"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/9.pptx"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/11.mp4"
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/12.mp3"
res = chunk(filename=file_path,
@@ -834,7 +838,8 @@ if __name__ == "__main__":
callback=progress_callback,
vision_model=vision_model,
parser_config={
"layout_recognize": "DeepDOC",
# "layout_recognize": "DeepDOC",
"layout_recognize": "TextLn",
"chunk_token_num": 128,
"delimiter": "\n",
"analyze_hyperlink": True,

View File

@@ -5,7 +5,8 @@ from PIL import Image
from app.core.rag.nlp import tokenize, is_english
from app.core.rag.nlp import rag_tokenizer
from app.core.rag.deepdoc.parser import PdfParser, PptParser, PlainParser
from app.core.rag.deepdoc.parser import PdfParser, PlainParser
from app.core.rag.deepdoc.parser.ppt_parser import RAGPptParser as PptParser
from PyPDF2 import PdfReader as pdf2_read
from app.core.rag.app.naive import by_plaintext, PARSERS

View File

@@ -0,0 +1,217 @@
import json
import os
import re
import sys
import threading
from io import BytesIO
from os import PathLike
from typing import Any, Callable, Optional
import numpy as np
import pdfplumber
from functools import reduce
import requests
import logging
from PIL import Image
from app.core.rag.nlp import concat_img
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser
# A single threading.Lock is parked in sys.modules under a string key, so any
# module that looks up the same key gets the same lock object (presumably to
# serialise pdfplumber access across parsers - TODO confirm).
# NOTE(review): nothing in the visible part of this module acquires the lock;
# confirm whether __images__ should hold it around pdfplumber.open().
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
# setdefault only registers the new lock when the key is not present yet.
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
class TextLnParser:
def __init__(self, textln_api: str, app_id: str, secret_code: str):
self.textln_api = textln_api
self.app_id = app_id
self.secret_code = secret_code
def recognize(self, file_content: bytes, options: dict) -> str:
params = {}
for key, value in options.items():
params[key] = str(value)
headers = {
"x-ti-app-id": self.app_id,
"x-ti-secret-code": self.secret_code,
"Content-Type": "application/octet-stream"
}
response = requests.post(
url=self.textln_api,
params=params,
headers=headers,
data=file_content
)
response.raise_for_status()
return response.text
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from
self.page_to = page_to
try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e:
self.page_images = None
logging.exception(e)
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
vision_model=None,
lang: Optional[str] = None,
**kwargs
):
try:
callback(0.15, "USE [Textln] to recognize the file")
self.__images__(filepath, zoomin=1)
base_name, ext = os.path.splitext(filepath)
if not os.path.exists(f"{base_name}_result.md"):
with open(filepath, "rb") as f:
file_content = f.read()
options = dict(
dpi=144,
get_image="objects",
markdown_details=1,
page_count=1000, # 当上传的是pdf时表示要进行解析的pdf页数。总页数不得超过1000页默认为1000页
parse_mode="auto",
table_flavor="md"
)
response = self.recognize(file_content, options)
# 保存完整的JSON响应到result.json文件
with open(f"{base_name}_result.json", "w", encoding="utf-8") as f:
f.write(response)
# 解析JSON响应以提取markdown内容
json_response = json.loads(response)
if "result" in json_response and "markdown" in json_response["result"]:
markdown_content = json_response["result"]["markdown"]
with open(f"{base_name}_result.md", "w", encoding="utf-8") as f:
f.write(markdown_content)
else:
callback(prog=-1, msg=json_response["message"])
return None, None, None
callback(0.75, f"[Textln] respond md: {base_name}_result.md")
from app.core.rag.app.naive import Markdown
parser_config = kwargs.get(
"parser_config", {
"layout_recognize": "TextLn", "chunk_token_num": 512, "delimiter": "\n!?。;!?",
"analyze_hyperlink": True})
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(f"{base_name}_result.md", binary, separate_tables=False,
delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
return sections, tables
# # Process images for each section
# section_images = []
# if vision_model:
# for idx, (section_text, _) in enumerate(sections):
# images = markdown_parser.get_pictures(section_text) if section_text else None
#
# if images:
# # If multiple images found, combine them using concat_img
# combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
# section_images.append(combined_image)
# markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
# ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
# boosted_figures = markdown_vision_parser(callback=callback)
# sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1][0] for fig in boosted_figures]),
# sections[idx][1])
# else:
# section_images.append(None)
#
# else:
# logging.warning("No visual model detected. Skipping figure parsing enhancement.")
# return sections, tables, section_images
except Exception as e:
logging.warning(f"Error: {e}")
callback(prog=-1, msg=str(e))
return None, None
@staticmethod
def extract_positions(txt: str):
poss = []
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss
def crop(self, text, ZM=1, need_position=False):
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if bottom <= top:
bottom = top + 2
for pn in pns[1:]:
bottom += self.page_images[pn - 1].size[1]
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
crop0 = img0.crop((x0, y0, x1, y1))
imgs.append(crop0)
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
bottom -= img0.size[1]
for pn in pns[1:]:
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))
imgs.append(cimgp)
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]
if not imgs:
if need_position:
return None, None
return
height = 0
for img in imgs:
height += img.size[1] + GAP
height = int(height)
width = int(np.max([i.size[0] for i in imgs]))
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
img = Image.alpha_composite(img, overlay).convert("RGB")
pic.paste(img, (0, int(height)))
height += img.size[1] + GAP
if need_position:
return pic, positions
return pic
@staticmethod
def remove_tag(txt):
return re.sub(r"@@[\t0-9.-]+?##", "", txt)

View File

@@ -6,7 +6,7 @@ from .markdown_parser import MarkdownElementExtractor
from .markdown_parser import RAGMarkdownParser as MarkdownParser
from .pdf_parser import PlainParser
from .pdf_parser import RAGPdfParser as PdfParser
from .ppt_parser import RAGPptParser as PptParser
# from .ppt_parser import RAGPptParser as PptParser
from .txt_parser import RAGTxtParser as TxtParser
__all__ = [
@@ -14,7 +14,7 @@ __all__ = [
"PlainParser",
"DocxParser",
"ExcelParser",
"PptParser",
# "PptParser",
"HtmlParser",
"JsonParser",
"MarkdownParser",

View File

@@ -134,7 +134,7 @@ def question_proposal(chat_mdl, content, topn=3):
rendered_prompt = template.render(content=content, topn=topn)
msg = [{"role": "system", "content": rendered_prompt}, {"role": "user", "content": "Output: "}]
_, msg = message_fit_in(msg, chat_mdl.max_length)
_, msg = message_fit_in(msg, getattr(chat_mdl, 'max_length', 8096))
kwd = chat_mdl.chat(rendered_prompt, msg[1:], {"temperature": 0.2})
if isinstance(kwd, tuple):
kwd = kwd[0]

View File

@@ -0,0 +1,62 @@
import subprocess
import os
from concurrent.futures import ThreadPoolExecutor
from fastapi import HTTPException, status
# Size the shared pool from the CPU count: twice the number of cores, or 4
# workers when os.cpu_count() cannot determine it (returns None).
MAX_WORKERS = (os.cpu_count() or 2) * 2
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
# Convert a DOC/DOCX/PPT/PPTX file to PDF
def convert_to_pdf(src_path):
    """Convert an office document to PDF with headless LibreOffice.

    The PDF is written next to the source file, using the source name with
    its extension replaced by ``.pdf``.

    Args:
        src_path: Path of the document to convert.

    Returns:
        Path of the generated PDF.

    Raises:
        HTTPException: 500 when LibreOffice is not installed, the conversion
            fails or times out, or the expected PDF is missing afterwards.
    """
    import shutil  # local import: only needed for the PATH lookup fallback

    try:
        print("开始使用LibreOffice将DOC/DOCX/PPT/PPTX转换为PDF...")
        # dirname() is "" for a bare file name; LibreOffice needs a real
        # directory for --outdir, so fall back to the current directory.
        output_dir = os.path.dirname(src_path) or "."

        # Try the usual Linux location first, then the macOS one, and finally
        # whatever `soffice` / `libreoffice` is on PATH.
        libreoffice_path = "/usr/bin/soffice"
        if not os.path.exists(libreoffice_path):
            libreoffice_path = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
        if not os.path.exists(libreoffice_path):
            libreoffice_path = shutil.which("soffice") or shutil.which("libreoffice")
        if not libreoffice_path:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="LibreOffice未安装或路径不正确请确认安装。"
            )

        # The timeout keeps a hung LibreOffice process from blocking the worker.
        subprocess.run([
            libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            src_path
        ], check=True, timeout=120)

        # LibreOffice can exit with 0 without producing a file, so verify it.
        dest_path = os.path.splitext(src_path)[0] + '.pdf'
        if not os.path.exists(dest_path):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"PDF文件未生成在 {dest_path}"
            )

        print(f"PDF已保存至 {dest_path}")
        return dest_path
    except subprocess.TimeoutExpired as e:
        # subprocess.run(timeout=...) raises TimeoutExpired, which is not a
        # CalledProcessError; report it the same way as other failures.
        print(f"转换超时: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"转换超时: {e}"
        ) from e
    except subprocess.CalledProcessError as e:
        print(f"转换过程中出错: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"转换过程中出错: {e}"
        ) from e
    except FileNotFoundError as e:
        print(f"文件错误: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"文件错误: {e}"
        ) from e
def async_convert_to_pdf(src_path):
    """Queue ``convert_to_pdf(src_path)`` on the shared thread pool.

    Returns:
        The ``Future`` for the submitted job. ``future.result()`` yields the
        PDF path or re-raises whatever the conversion raised.
    """
    return executor.submit(convert_to_pdf, src_path)