[MODIFY] Code optimization
This commit is contained in:
@@ -15,13 +15,15 @@ import copy
|
||||
from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
|
||||
from app.core.rag.common.file_utils import get_project_base_directory
|
||||
from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
||||
from app.core.rag.utils.libre_office import convert_to_pdf, async_convert_to_pdf
|
||||
from app.core.rag.deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
|
||||
from app.core.rag.deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from app.core.rag.deepdoc.parser.mineru_parser import MinerUParser
|
||||
from app.core.rag.app.textin_parser import TextLnParser
|
||||
from app.core.rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, tokenize, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||
|
||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
|
||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
|
||||
callback = callback
|
||||
binary = binary
|
||||
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
||||
@@ -39,7 +41,7 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None ,**kwargs):
|
||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls = None, **kwargs):
|
||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
||||
@@ -59,23 +61,19 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls=None, **kwargs):
    """Parse a PDF with the TextLn service and return its sections and tables.

    Args:
        filename: Path of the PDF file; forwarded to ``TextLnParser.parse_pdf``.
        binary: Raw file content, forwarded unchanged to the parser.
        from_page, to_page, pdf_cls: Accepted for signature parity with the
            other ``by_*`` parsers; not used by this function.
        lang: Language hint forwarded to the parser.
        callback: Optional progress reporter called as ``callback(prog, msg)``.
        vision_model: Forwarded to the parser.
        **kwargs: Extra options (e.g. ``parser_config``) forwarded to the parser.

    Returns:
        ``(sections, tables, pdf_parser)``. ``sections`` and ``tables`` are
        ``None`` when the credentials are missing or parsing failed.
    """
    textln_api = os.environ.get("TEXTLN_APISERVER", "https://api.textin.com/ai/service/v1/pdf_to_markdown")
    # Credentials are read from the environment only. Secrets must never be
    # committed as fallback defaults; any value that was committed earlier
    # should be treated as leaked and rotated.
    app_id = os.environ.get("TEXTLN_APP_ID", "")
    secret_code = os.environ.get("TEXTLN_SECRET_CODE", "")
    pdf_parser = TextLnParser(textln_api=textln_api, app_id=app_id, secret_code=secret_code)

    if not app_id or not secret_code:
        # Fail fast with a clear message instead of sending an unauthenticated request.
        if callback:
            callback(-1, "TextLn credentials not found: set TEXTLN_APP_ID and TEXTLN_SECRET_CODE.")
        return None, None, pdf_parser

    # The MINERU_* output/backend settings are deliberately not forwarded:
    # TextLnParser.parse_pdf only reads ``parser_config`` from its extra kwargs.
    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        vision_model=vision_model,
        lang=lang,
        **kwargs
    )
    return sections, tables, pdf_parser
|
||||
|
||||
@@ -605,7 +603,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tables, pdf_parser = parser(
|
||||
sections, tables, pdf_parser= parser(
|
||||
filename=filename,
|
||||
binary=binary,
|
||||
from_page=from_page,
|
||||
@@ -626,24 +624,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||
if not binary:
|
||||
with open(filename, "rb") as f:
|
||||
binary = f.read()
|
||||
from app.core.rag.app.presentation import Ppt
|
||||
ppt_parser = Ppt()
|
||||
for pn, (txt, img) in enumerate(ppt_parser(
|
||||
filename if not binary else binary, from_page, to_page, callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
pn += from_page
|
||||
d["image"] = img
|
||||
d["doc_type_kwd"] = "image"
|
||||
d["page_num_int"] = [pn + 1]
|
||||
d["top_int"] = [0]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
||||
tokenize(d, txt, is_english)
|
||||
res.append(d)
|
||||
return res
|
||||
elif re.search(r"\.(pptx|ppt?)$", filename, re.IGNORECASE):
|
||||
# 方法1.Aspose.Slides是商业级库,其核心功能(如幻灯片创建、动画处理、格式转换等)需通过付费许可证使用。尽管它为符合条件的开源项目提供免费许可证(需申请),但商业闭源项目必须购买授权
|
||||
# if not binary:
|
||||
# with open(filename, "rb") as f:
|
||||
# binary = f.read()
|
||||
# from app.core.rag.app.presentation import Ppt
|
||||
# ppt_parser = Ppt()
|
||||
# for pn, (txt, img) in enumerate(ppt_parser(
|
||||
# filename if not binary else binary, from_page, to_page, callback)):
|
||||
# d = copy.deepcopy(doc)
|
||||
# pn += from_page
|
||||
# d["image"] = img
|
||||
# d["doc_type_kwd"] = "image"
|
||||
# d["page_num_int"] = [pn + 1]
|
||||
# d["top_int"] = [0]
|
||||
# d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
||||
# tokenize(d, txt, is_english)
|
||||
# res.append(d)
|
||||
# return res
|
||||
# 方法2.提交任务-文件转换为pdf
|
||||
future = async_convert_to_pdf(filename)
|
||||
dest_pdf_path = future.result()
|
||||
# 解析pdf
|
||||
return chunk(dest_pdf_path, binary=None, lang=lang, callback=callback, vision_model=vision_model, **kwargs)
|
||||
|
||||
elif re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", filename, re.IGNORECASE):
|
||||
if not binary:
|
||||
@@ -818,14 +822,14 @@ if __name__ == "__main__":
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/1.txt"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/2.md"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/3.md" # 带图url
|
||||
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/4.doc"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/5.json"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/6.html"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/7.xlsx"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/8.pdf"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/9.pptx"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
|
||||
file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/10.png"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/11.mp4"
|
||||
# file_path = "/Users/sbtjfdn/Downloads/记忆科学/files/12.mp3"
|
||||
res = chunk(filename=file_path,
|
||||
@@ -834,7 +838,8 @@ if __name__ == "__main__":
|
||||
callback=progress_callback,
|
||||
vision_model=vision_model,
|
||||
parser_config={
|
||||
"layout_recognize": "DeepDOC",
|
||||
# "layout_recognize": "DeepDOC",
|
||||
"layout_recognize": "TextLn",
|
||||
"chunk_token_num": 128,
|
||||
"delimiter": "\n",
|
||||
"analyze_hyperlink": True,
|
||||
|
||||
@@ -5,7 +5,8 @@ from PIL import Image
|
||||
|
||||
from app.core.rag.nlp import tokenize, is_english
|
||||
from app.core.rag.nlp import rag_tokenizer
|
||||
from app.core.rag.deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||
from app.core.rag.deepdoc.parser import PdfParser, PlainParser
|
||||
from app.core.rag.deepdoc.parser.ppt_parser import RAGPptParser as PptParser
|
||||
from PyPDF2 import PdfReader as pdf2_read
|
||||
from app.core.rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
|
||||
217
api/app/core/rag/app/textin_parser.py
Normal file
217
api/app/core/rag/app/textin_parser.py
Normal file
@@ -0,0 +1,217 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
from io import BytesIO
|
||||
from os import PathLike
|
||||
from typing import Any, Callable, Optional
|
||||
import numpy as np
|
||||
import pdfplumber
|
||||
from functools import reduce
|
||||
import requests
|
||||
import logging
|
||||
from PIL import Image
|
||||
|
||||
from app.core.rag.nlp import concat_img
|
||||
from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser
|
||||
|
||||
# Register a single threading.Lock in sys.modules under a fixed key, unless one
# is already registered there (presumably so that other modules using the same
# key share this lock -- confirm against the other parsers).
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
||||
|
||||
|
||||
class TextLnParser:
|
||||
def __init__(self, textln_api: str, app_id: str, secret_code: str):
|
||||
self.textln_api = textln_api
|
||||
self.app_id = app_id
|
||||
self.secret_code = secret_code
|
||||
|
||||
def recognize(self, file_content: bytes, options: dict) -> str:
|
||||
params = {}
|
||||
for key, value in options.items():
|
||||
params[key] = str(value)
|
||||
|
||||
headers = {
|
||||
"x-ti-app-id": self.app_id,
|
||||
"x-ti-secret-code": self.secret_code,
|
||||
"Content-Type": "application/octet-stream"
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
url=self.textln_api,
|
||||
params=params,
|
||||
headers=headers,
|
||||
data=file_content
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
try:
|
||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||
except Exception as e:
|
||||
self.page_images = None
|
||||
logging.exception(e)
|
||||
|
||||
|
||||
def parse_pdf(
|
||||
self,
|
||||
filepath: str | PathLike[str],
|
||||
binary: BytesIO | bytes,
|
||||
callback: Optional[Callable] = None,
|
||||
vision_model=None,
|
||||
lang: Optional[str] = None,
|
||||
**kwargs
|
||||
):
|
||||
try:
|
||||
callback(0.15, "USE [Textln] to recognize the file")
|
||||
self.__images__(filepath, zoomin=1)
|
||||
base_name, ext = os.path.splitext(filepath)
|
||||
if not os.path.exists(f"{base_name}_result.md"):
|
||||
with open(filepath, "rb") as f:
|
||||
file_content = f.read()
|
||||
options = dict(
|
||||
dpi=144,
|
||||
get_image="objects",
|
||||
markdown_details=1,
|
||||
page_count=1000, # 当上传的是pdf时,表示要进行解析的pdf页数。总页数不得超过1000页,默认为1000页
|
||||
parse_mode="auto",
|
||||
table_flavor="md"
|
||||
)
|
||||
response = self.recognize(file_content, options)
|
||||
# 保存完整的JSON响应到result.json文件
|
||||
with open(f"{base_name}_result.json", "w", encoding="utf-8") as f:
|
||||
f.write(response)
|
||||
# 解析JSON响应以提取markdown内容
|
||||
json_response = json.loads(response)
|
||||
if "result" in json_response and "markdown" in json_response["result"]:
|
||||
markdown_content = json_response["result"]["markdown"]
|
||||
with open(f"{base_name}_result.md", "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
else:
|
||||
callback(prog=-1, msg=json_response["message"])
|
||||
return None, None, None
|
||||
callback(0.75, f"[Textln] respond md: {base_name}_result.md")
|
||||
|
||||
from app.core.rag.app.naive import Markdown
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"layout_recognize": "TextLn", "chunk_token_num": 512, "delimiter": "\n!?。;!?",
|
||||
"analyze_hyperlink": True})
|
||||
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
||||
sections, tables = markdown_parser(f"{base_name}_result.md", binary, separate_tables=False,
|
||||
delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
return sections, tables
|
||||
# # Process images for each section
|
||||
# section_images = []
|
||||
# if vision_model:
|
||||
# for idx, (section_text, _) in enumerate(sections):
|
||||
# images = markdown_parser.get_pictures(section_text) if section_text else None
|
||||
#
|
||||
# if images:
|
||||
# # If multiple images found, combine them using concat_img
|
||||
# combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
|
||||
# section_images.append(combined_image)
|
||||
# markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
|
||||
# ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
|
||||
# boosted_figures = markdown_vision_parser(callback=callback)
|
||||
# sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1][0] for fig in boosted_figures]),
|
||||
# sections[idx][1])
|
||||
# else:
|
||||
# section_images.append(None)
|
||||
#
|
||||
# else:
|
||||
# logging.warning("No visual model detected. Skipping figure parsing enhancement.")
|
||||
# return sections, tables, section_images
|
||||
except Exception as e:
|
||||
logging.warning(f"Error: {e}")
|
||||
callback(prog=-1, msg=str(e))
|
||||
return None, None
|
||||
|
||||
@staticmethod
|
||||
def extract_positions(txt: str):
|
||||
poss = []
|
||||
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
||||
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
||||
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||
return poss
|
||||
|
||||
def crop(self, text, ZM=1, need_position=False):
|
||||
imgs = []
|
||||
poss = self.extract_positions(text)
|
||||
if not poss:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||||
GAP = 6
|
||||
pos = poss[0]
|
||||
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
pos = poss[-1]
|
||||
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
right = left + max_width
|
||||
|
||||
if bottom <= top:
|
||||
bottom = top + 2
|
||||
|
||||
for pn in pns[1:]:
|
||||
bottom += self.page_images[pn - 1].size[1]
|
||||
|
||||
img0 = self.page_images[pns[0]]
|
||||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
||||
crop0 = img0.crop((x0, y0, x1, y1))
|
||||
imgs.append(crop0)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
||||
|
||||
bottom -= img0.size[1]
|
||||
for pn in pns[1:]:
|
||||
page = self.page_images[pn]
|
||||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
||||
cimgp = page.crop((x0, y0, x1, y1))
|
||||
imgs.append(cimgp)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
||||
bottom -= page.size[1]
|
||||
|
||||
if not imgs:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
height = 0
|
||||
for img in imgs:
|
||||
height += img.size[1] + GAP
|
||||
height = int(height)
|
||||
width = int(np.max([i.size[0] for i in imgs]))
|
||||
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
||||
height = 0
|
||||
for ii, img in enumerate(imgs):
|
||||
if ii == 0 or ii + 1 == len(imgs):
|
||||
img = img.convert("RGBA")
|
||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||||
overlay.putalpha(128)
|
||||
img = Image.alpha_composite(img, overlay).convert("RGB")
|
||||
pic.paste(img, (0, int(height)))
|
||||
height += img.size[1] + GAP
|
||||
|
||||
if need_position:
|
||||
return pic, positions
|
||||
return pic
|
||||
|
||||
@staticmethod
|
||||
def remove_tag(txt):
|
||||
return re.sub(r"@@[\t0-9.-]+?##", "", txt)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from .markdown_parser import MarkdownElementExtractor
|
||||
from .markdown_parser import RAGMarkdownParser as MarkdownParser
|
||||
from .pdf_parser import PlainParser
|
||||
from .pdf_parser import RAGPdfParser as PdfParser
|
||||
from .ppt_parser import RAGPptParser as PptParser
|
||||
# from .ppt_parser import RAGPptParser as PptParser
|
||||
from .txt_parser import RAGTxtParser as TxtParser
|
||||
|
||||
__all__ = [
|
||||
@@ -14,7 +14,7 @@ __all__ = [
|
||||
"PlainParser",
|
||||
"DocxParser",
|
||||
"ExcelParser",
|
||||
"PptParser",
|
||||
# "PptParser",
|
||||
"HtmlParser",
|
||||
"JsonParser",
|
||||
"MarkdownParser",
|
||||
|
||||
@@ -134,7 +134,7 @@ def question_proposal(chat_mdl, content, topn=3):
|
||||
rendered_prompt = template.render(content=content, topn=topn)
|
||||
|
||||
msg = [{"role": "system", "content": rendered_prompt}, {"role": "user", "content": "Output: "}]
|
||||
_, msg = message_fit_in(msg, chat_mdl.max_length)
|
||||
_, msg = message_fit_in(msg, getattr(chat_mdl, 'max_length', 8096))
|
||||
kwd = chat_mdl.chat(rendered_prompt, msg[1:], {"temperature": 0.2})
|
||||
if isinstance(kwd, tuple):
|
||||
kwd = kwd[0]
|
||||
|
||||
62
api/app/core/rag/utils/libre_office.py
Normal file
62
api/app/core/rag/utils/libre_office.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import subprocess
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from fastapi import HTTPException, status
|
||||
|
||||
# Size the shared pool from the CPU count (conservative policy: cores * 2);
# when the count is unavailable the pool falls back to 4 workers.
MAX_WORKERS = (os.cpu_count() or 2) * 2
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
|
||||
|
||||
# 将DOCX/PPT/PPTX文件转换为PDF
|
||||
def convert_to_pdf(src_path):
    """Convert an office document to PDF with a headless LibreOffice run.

    The PDF is written next to the source file, using the source file name
    with a ``.pdf`` extension.

    Args:
        src_path: Path of the DOC/DOCX/PPT/PPTX file to convert.

    Returns:
        Path of the generated PDF.

    Raises:
        HTTPException: (status 500) when LibreOffice is not found at one of
            the known locations, the conversion fails or times out, or the
            PDF is missing afterwards.
    """
    try:
        print("开始使用LibreOffice将DOC/DOCX/PPT/PPTX转换为PDF...")
        # dirname() is "" for a bare file name; fall back to the current
        # directory so that --outdir never receives an empty argument.
        output_dir = os.path.dirname(src_path) or "."

        # Known soffice locations: Linux first, then macOS.
        candidates = (
            "/usr/bin/soffice",
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        )
        libreoffice_path = next((path for path in candidates if os.path.exists(path)), None)
        if libreoffice_path is None:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="LibreOffice未安装或路径不正确,请确认安装。"
            )

        # The timeout keeps a hung soffice process from blocking the worker forever.
        subprocess.run([
            libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            src_path
        ], check=True, timeout=120)

        # Verify that the PDF was actually produced.
        dest_path = os.path.splitext(src_path)[0] + '.pdf'
        if not os.path.exists(dest_path):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"PDF文件未生成在 {dest_path}"
            )

        print(f"PDF已保存至 {dest_path}")
        return dest_path
    except subprocess.TimeoutExpired as e:
        # Raised by subprocess.run when the 120 s limit is exceeded; report it
        # the same way as the other conversion failures.
        print(f"转换超时: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"转换超时: {e}"
        ) from e
    except subprocess.CalledProcessError as e:
        print(f"转换过程中出错: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"转换过程中出错: {e}"
        ) from e
    except FileNotFoundError as e:
        print(f"文件错误: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"文件错误: {e}"
        ) from e
|
||||
|
||||
def async_convert_to_pdf(src_path):
    """Schedule ``convert_to_pdf(src_path)`` on the module's thread pool.

    Returns:
        The ``Future`` of the submitted task; callers use it to obtain the
        result or to handle an exception raised by the conversion.
    """
    return executor.submit(convert_to_pdf, src_path)
|
||||
Reference in New Issue
Block a user