Initial commit

This commit is contained in:
Ke Sun
2025-11-30 18:22:17 +08:00
commit aea2fe391e
449 changed files with 83030 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
from .docx_parser import RAGDocxParser as DocxParser
from .excel_parser import RAGExcelParser as ExcelParser
from .html_parser import RAGHtmlParser as HtmlParser
from .json_parser import RAGJsonParser as JsonParser
from .markdown_parser import MarkdownElementExtractor
from .markdown_parser import RAGMarkdownParser as MarkdownParser
from .pdf_parser import PlainParser
from .pdf_parser import RAGPdfParser as PdfParser
from .ppt_parser import RAGPptParser as PptParser
from .txt_parser import RAGTxtParser as TxtParser
__all__ = [
"PdfParser",
"PlainParser",
"DocxParser",
"ExcelParser",
"PptParser",
"HtmlParser",
"JsonParser",
"MarkdownParser",
"TxtParser",
"MarkdownElementExtractor",
]

View File

@@ -0,0 +1,123 @@
from docx import Document
import re
import pandas as pd
from collections import Counter
from app.core.rag.nlp import rag_tokenizer
from io import BytesIO
class RAGDocxParser:
def __extract_table_content(self, tb):
df = []
for row in tb.rows:
df.append([c.text for c in row.cells])
return self.__compose_table_content(pd.DataFrame(df))
def __compose_table_content(self, df):
def blockType(b):
pattern = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^第*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
("^[0-9.,+%/ -]+$", "Nu"),
(r"^[0-9A-Z/\._~-]+$", "Ca"),
(r"^[A-Z]*[a-z' -]+$", "En"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$", "NE"),
(r"^.{1}$", "Sg")
]
for p, n in pattern:
if re.search(p, b):
return n
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"
else:
return "Lx"
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
return "Nr"
return "Ot"
if len(df) < 2:
return []
max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
1, len(df)) for j in range(len(df.iloc[i, :]))])
max_type = max(max_type.items(), key=lambda x: x[1])[0]
colnm = len(df.iloc[0, :])
hdrows = [0] # header is not necessarily appear in the first line
if max_type == "Nu":
for r in range(1, len(df)):
tys = Counter([blockType(str(df.iloc[r, j]))
for j in range(len(df.iloc[r, :]))])
tys = max(tys.items(), key=lambda x: x[1])[0]
if tys != max_type:
hdrows.append(r)
lines = []
for i in range(1, len(df)):
if i in hdrows:
continue
hr = [r - i for r in hdrows]
hr = [r for r in hr if r < 0]
t = len(hr) - 1
while t > 0:
if hr[t] - hr[t - 1] > 1:
hr = hr[t:]
break
t -= 1
headers = []
for j in range(len(df.iloc[i, :])):
t = []
for h in hr:
x = str(df.iloc[i + h, j]).strip()
if x in t:
continue
t.append(x)
t = ",".join(t)
if t:
t += ": "
headers.append(t)
cells = []
for j in range(len(df.iloc[i, :])):
if not str(df.iloc[i, j]):
continue
cells.append(headers[j] + str(df.iloc[i, j]))
lines.append(";".join(cells))
if colnm > 3:
return lines
return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000000):
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0 # parsed page
secs = [] # parsed contents
for p in self.doc.paragraphs:
if pn > to_page:
break
runs_within_single_paragraph = [] # save runs within the range of pages
for run in p.runs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
runs_within_single_paragraph.append(run.text) # append run.text first
# wrap page break checker into a static method
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
return secs, tbls

View File

@@ -0,0 +1,210 @@
import logging
import re
import sys
from io import BytesIO
import pandas as pd
from openpyxl import Workbook, load_workbook
from app.core.rag.nlp import find_codec
# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
class RAGExcelParser:
@staticmethod
def _load_excel_to_workbook(file_like_object):
if isinstance(file_like_object, bytes):
file_like_object = BytesIO(file_like_object)
# Read first 4 bytes to determine file type
file_like_object.seek(0)
file_head = file_like_object.read(4)
file_like_object.seek(0)
if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
logging.info("Not an Excel file, converting CSV to Excel Workbook")
try:
file_like_object.seek(0)
df = pd.read_csv(file_like_object)
return RAGExcelParser._dataframe_to_workbook(df)
except Exception as e_csv:
raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")
try:
return load_workbook(file_like_object, data_only=True)
except Exception as e:
logging.info(f"openpyxl load error: {e}, try pandas instead")
try:
file_like_object.seek(0)
try:
dfs = pd.read_excel(file_like_object, sheet_name=None)
return RAGExcelParser._dataframe_to_workbook(dfs)
except Exception as ex:
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
file_like_object.seek(0)
df = pd.read_excel(file_like_object, engine="calamine")
return RAGExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas:
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@staticmethod
def _clean_dataframe(df: pd.DataFrame):
def clean_string(s):
if isinstance(s, str):
return ILLEGAL_CHARACTERS_RE.sub(" ", s)
return s
return df.apply(lambda col: col.map(clean_string))
@staticmethod
def _dataframe_to_workbook(df):
# if contains multiple sheets use _dataframes_to_workbook
if isinstance(df, dict) and len(df) > 1:
return RAGExcelParser._dataframes_to_workbook(df)
df = RAGExcelParser._clean_dataframe(df)
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
@staticmethod
def _dataframes_to_workbook(dfs: dict):
wb = Workbook()
default_sheet = wb.active
wb.remove(default_sheet)
for sheet_name, df in dfs.items():
df = RAGExcelParser._clean_dataframe(df)
ws = wb.create_sheet(title=sheet_name)
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
def html(self, fnm, chunk_rows=256):
from html import escape
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
def _fmt(v):
if v is None:
return ""
return str(v).strip()
for sheetname in wb.sheetnames:
ws = wb[sheetname]
try:
rows = list(ws.rows)
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
if not rows:
continue
tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
tb_rows_0 += "</tr>"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{escape(_fmt(c.value))}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)
return tb_chunks
def markdown(self, fnm):
import pandas as pd
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
try:
file_like_object.seek(0)
df = pd.read_excel(file_like_object)
except Exception as e:
logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
file_like_object.seek(0)
df = pd.read_csv(file_like_object)
df = df.replace(r"^\s*$", "", regex=True)
return df.to_markdown(index=False)
def __call__(self, fnm):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGExcelParser._load_excel_to_workbook(file_like_object)
res = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
try:
rows = list(ws.rows)
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
if not rows:
continue
ti = list(rows[0])
for r in list(rows[1:]):
fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)
return res
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = RAGExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
try:
ws = wb[sheetname]
total += len(list(ws.rows))
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
return len(txt.split("\n"))
if __name__ == "__main__":
psr = RAGExcelParser()
psr(sys.argv[1])

View File

@@ -0,0 +1,118 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from app.core.rag.common.constants import LLMType
from app.core.rag.common.connection_utils import timeout
from app.core.rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from app.core.rag.prompts.generator import vision_llm_figure_describe_prompt
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
return [
(
(figure_data[1], [figure_data[0]]),
[(0, 0, 0, 0, 0)],
)
for figure_data in figures_data_without_positions
if isinstance(figure_data[1], Image.Image)
]
def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,vision_model=None,**kwargs):
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
def vision_figure_parser_pdf_wrapper(tbls,callback=None,vision_model=None,**kwargs):
if vision_model:
def is_figure_item(item):
return (
isinstance(item[0][0], Image.Image) and
isinstance(item[0][1], list)
)
figures_data = [item for item in tbls if is_figure_item(item)]
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls = [item for item in tbls if not is_figure_item(item)]
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
shared_executor = ThreadPoolExecutor(max_workers=10)
class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs):
self.vision_model = vision_model
self._extract_figures_info(figures_data)
assert len(self.figures) == len(self.descriptions)
assert not self.positions or (len(self.figures) == len(self.positions))
def _extract_figures_info(self, figures_data):
self.figures = []
self.descriptions = []
self.positions = []
for item in figures_data:
# position
if len(item) == 2 and isinstance(item[0], tuple) and len(item[0]) == 2 and isinstance(item[1], list) and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
img_desc = item[0]
assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
self.figures.append(img_desc[0])
self.descriptions.append(img_desc[1])
self.positions.append(item[1])
else:
assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
self.figures.append(item[0])
self.descriptions.append(item[1])
def _assemble(self):
self.assembled = []
self.has_positions = len(self.positions) != 0
for i in range(len(self.figures)):
figure = self.figures[i]
desc = self.descriptions[i]
pos = self.positions[i] if self.has_positions else None
figure_desc = (figure, desc)
if pos is not None:
self.assembled.append((figure_desc, pos))
else:
self.assembled.append((figure_desc,))
return self.assembled
def __call__(self, **kwargs):
callback = kwargs.get("callback", lambda prog, msg: None)
@timeout(30, 3)
def process(figure_idx, figure_binary):
description_text = picture_vision_llm_chunk(
binary=figure_binary,
vision_model=self.vision_model,
prompt=vision_llm_figure_describe_prompt(),
callback=callback,
)
return figure_idx, description_text
futures = []
for idx, img_binary in enumerate(self.figures or []):
futures.append(shared_executor.submit(process, idx, img_binary))
for future in as_completed(futures):
figure_num, txt = future.result()
if txt:
self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])
self._assemble()
return self.assembled

View File

@@ -0,0 +1,197 @@
from app.core.rag.nlp import find_codec, rag_tokenizer
import uuid
import chardet
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
import html
def get_encoding(file):
with open(file,'rb') as f:
tmp = chardet.detect(f.read())
return tmp['encoding']
BLOCK_TAGS = [
"h1", "h2", "h3", "h4", "h5", "h6",
"p", "div", "article", "section", "aside",
"ul", "ol", "li",
"table", "pre", "code", "blockquote",
"figure", "figcaption"
]
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "#####", "h5": "#####", "h6": "######"}
class RAGHtmlParser:
def __call__(self, fnm, binary=None, chunk_token_num=512):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
txt = f.read()
return self.parser_txt(txt, chunk_token_num)
@classmethod
def parser_txt(cls, txt, chunk_token_num):
if not isinstance(txt, str):
raise TypeError("txt type should be string!")
temp_sections = []
soup = BeautifulSoup(txt, "html5lib")
# delete <style> tag
for style_tag in soup.find_all(["style", "script"]):
style_tag.decompose()
# delete <script> tag in <div>
for div_tag in soup.find_all("div"):
for script_tag in div_tag.find_all("script"):
script_tag.decompose()
# delete inline style
for tag in soup.find_all(True):
if 'style' in tag.attrs:
del tag.attrs['style']
# delete HTML comment
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
block_txt_list, table_list = cls.merge_block_text(temp_sections)
sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
for table in table_list:
sections.append(table.get("content", ""))
return sections
@classmethod
def split_table(cls, html_table, chunk_token_num=512):
soup = BeautifulSoup(html_table, "html.parser")
rows = soup.find_all("tr")
tables = []
current_table = []
current_count = 0
table_str_list = []
for row in rows:
tks_str = rag_tokenizer.tokenize(str(row))
token_count = len(tks_str.split(" ")) if tks_str else 0
if current_count + token_count > chunk_token_num:
tables.append(current_table)
current_table = []
current_count = 0
current_table.append(row)
current_count += token_count
if current_table:
tables.append(current_table)
for table_rows in tables:
new_table = soup.new_tag("table")
for row in table_rows:
new_table.append(row)
table_str_list.append(str(new_table))
return table_str_list
@classmethod
def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
if isinstance(element, NavigableString):
content = element.strip()
def is_valid_html(content):
try:
soup = BeautifulSoup(content, "html.parser")
return bool(soup.find())
except Exception:
return False
return_info = []
if content:
if is_valid_html(content):
soup = BeautifulSoup(content, "html.parser")
child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
parser_result.extend(child_info)
else:
info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}}
if parent_name:
info["tag_name"] = parent_name
return_info.append(info)
return return_info
elif isinstance(element, Tag):
if str.lower(element.name) == "table":
table_info_list = []
table_id = str(uuid.uuid1())
table_list = [html.unescape(str(element))]
for t in table_list:
table_info_list.append({"content": t, "tag_name": "table",
"metadata": {"table_id": table_id, "index": table_list.index(t)}})
return table_info_list
else:
block_id = None
if str.lower(element.name) in BLOCK_TAGS:
block_id = str(uuid.uuid1())
for child in element.children:
child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
block_id)
parser_result.extend(child_info)
return []
@classmethod
def merge_block_text(cls, parser_result):
block_content = []
current_content = ""
table_info_list = []
lask_block_id = None
for item in parser_result:
content = item.get("content")
tag_name = item.get("tag_name")
title_flag = tag_name in TITLE_TAGS
block_id = item.get("metadata", {}).get("block_id")
if block_id:
if title_flag:
content = f"{TITLE_TAGS[tag_name]} {content}"
if lask_block_id != block_id:
if lask_block_id is not None:
block_content.append(current_content)
current_content = content
lask_block_id = block_id
else:
current_content += (" " if current_content else "") + content
else:
if tag_name == "table":
table_info_list.append(item)
else:
current_content += (" " if current_content else "" + content)
if current_content:
block_content.append(current_content)
return block_content, table_info_list
@classmethod
def chunk_block(cls, block_txt_list, chunk_token_num=512):
chunks = []
current_block = ""
current_token_count = 0
for block in block_txt_list:
tks_str = rag_tokenizer.tokenize(block)
block_token_count = len(tks_str.split(" ")) if tks_str else 0
if block_token_count > chunk_token_num:
if current_block:
chunks.append(current_block)
start = 0
tokens = tks_str.split(" ")
while start < len(tokens):
end = start + chunk_token_num
split_tokens = tokens[start:end]
chunks.append(" ".join(split_tokens))
start = end
current_block = ""
current_token_count = 0
else:
if current_token_count + block_token_count <= chunk_token_num:
current_block += ("\n" if current_block else "") + block
current_token_count += block_token_count
else:
chunks.append(current_block)
current_block = block
current_token_count = block_token_count
if current_block:
chunks.append(current_block)
return chunks

View File

@@ -0,0 +1,159 @@
import json
from typing import Any
from app.core.rag.nlp import find_codec
class RAGJsonParser:
def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
super().__init__()
self.max_chunk_size = max_chunk_size * 2
self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)
def __call__(self, filename):
with open(filename, "r") as f:
txt = f.read()
if self.is_jsonl_format(txt):
sections = self._parse_jsonl(txt)
else:
sections = self._parse_json(txt)
return sections
@staticmethod
def _json_size(data: dict) -> int:
"""Calculate the size of the serialized JSON object."""
return len(json.dumps(data, ensure_ascii=False))
@staticmethod
def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
"""Set a value in a nested dictionary based on the given path."""
for key in path[:-1]:
d = d.setdefault(key, {})
d[path[-1]] = value
def _list_to_dict_preprocessing(self, data: Any) -> Any:
if isinstance(data, dict):
# Process each key-value pair in the dictionary
return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
elif isinstance(data, list):
# Convert the list to a dictionary with index-based keys
return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
else:
# Base case: the item is neither a dict nor a list, so return it unchanged
return data
def _json_split(
self,
data,
current_path: list[str] | None,
chunks: list[dict] | None,
) -> list[dict]:
"""
Split json into maximum size dictionaries while preserving structure.
"""
current_path = current_path or []
chunks = chunks or [{}]
if isinstance(data, dict):
for key, value in data.items():
new_path = current_path + [key]
chunk_size = self._json_size(chunks[-1])
size = self._json_size({key: value})
remaining = self.max_chunk_size - chunk_size
if size < remaining:
# Add item to current chunk
self._set_nested_dict(chunks[-1], new_path, value)
else:
if chunk_size >= self.min_chunk_size:
# Chunk is big enough, start a new chunk
chunks.append({})
# Iterate
self._json_split(value, new_path, chunks)
else:
# handle single item
self._set_nested_dict(chunks[-1], current_path, data)
return chunks
def split_json(
self,
json_data,
convert_lists: bool = False,
) -> list[dict]:
"""Splits JSON into a list of JSON chunks"""
if convert_lists:
preprocessed_data = self._list_to_dict_preprocessing(json_data)
chunks = self._json_split(preprocessed_data, None, None)
else:
chunks = self._json_split(json_data, None, None)
# Remove the last chunk if it's empty
if not chunks[-1]:
chunks.pop()
return chunks
def split_text(
self,
json_data: dict[str, Any],
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> list[str]:
"""Splits JSON into a list of JSON formatted strings"""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
# Convert to string
return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
def _parse_json(self, content: str) -> list[str]:
sections = []
try:
json_data = json.loads(content)
chunks = self.split_json(json_data, True)
sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
except json.JSONDecodeError:
pass
return sections
def _parse_jsonl(self, content: str) -> list[str]:
lines = content.strip().splitlines()
all_chunks = []
for line in lines:
if not line.strip():
continue
try:
data = json.loads(line)
chunks = self.split_json(data, convert_lists=True)
all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
except json.JSONDecodeError:
continue
return all_chunks
def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
if not lines:
return False
try:
json.loads(txt)
return False
except json.JSONDecodeError:
pass
sample_limit = min(len(lines), sample_limit)
sample_lines = lines[:sample_limit]
valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
if not valid_lines:
return False
return (valid_lines / len(sample_lines)) >= threshold
def _is_valid_json(self, line: str) -> bool:
try:
json.loads(line)
return True
except json.JSONDecodeError:
return False

View File

@@ -0,0 +1,277 @@
import re
from markdown import markdown
class RAGMarkdownParser:
def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num)
def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
tables = []
working_text = markdown_text
def replace_tables_with_rendered_html(pattern, table_list, render=True):
new_text = ""
last_end = 0
for match in pattern.finditer(working_text):
raw_table = match.group()
table_list.append(raw_table)
if separate_tables:
# Skip this match (i.e., remove it)
new_text += working_text[last_end : match.start()] + "\n\n"
else:
# Replace with rendered HTML
html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
new_text += working_text[last_end : match.start()] + html_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
return new_text
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r"""
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
""",
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
# Borderless Markdown table
no_border_table_pattern = re.compile(
r"""
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
""",
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
# Replace any TAGS e.g. <table ...> to <table>
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
table_with_attributes_pattern = re.compile(
rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
)
def replace_tag(m):
tag_name = re.match(r"<(\w+)", m.group()).group(1)
return "<{}>".format(tag_name)
working_text = re.sub(table_with_attributes_pattern, replace_tag, working_text)
if "<table>" in working_text.lower(): # for optimize performance
# HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r"""
(?:\n|^)
\s*
(?:
# case1: <html><body><table>...</table></body></html>
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
# case2: <body><table>...</table></body>
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
# case3: only<table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
""",
re.VERBOSE | re.DOTALL | re.IGNORECASE,
)
def replace_html_tables():
nonlocal working_text
new_text = ""
last_end = 0
for match in html_table_pattern.finditer(working_text):
raw_table = match.group()
tables.append(raw_table)
if separate_tables:
new_text += working_text[last_end : match.start()] + "\n\n"
else:
new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
working_text = new_text
replace_html_tables()
return working_text, tables
class MarkdownElementExtractor:
def __init__(self, markdown_content):
self.markdown_content = markdown_content
self.lines = markdown_content.split("\n")
def get_delimiters(self,delimiters):
toks = re.findall(r"`([^`]+)`", delimiters)
toks = sorted(set(toks), key=lambda x: -len(x))
return "|".join(re.escape(t) for t in toks if t)
def extract_elements(self,delimiter=None):
"""Extract individual elements (headers, code blocks, lists, etc.)"""
sections = []
i = 0
dels=""
if delimiter:
dels = self.get_delimiters(delimiter)
if len(dels) > 0:
text = "\n".join(self.lines)
parts = re.split(dels, text)
sections = [p.strip() for p in parts if p and p.strip()]
return sections
while i < len(self.lines):
line = self.lines[i]
if re.match(r"^#{1,6}\s+.*$", line):
# header
element = self._extract_header(i)
sections.append(element["content"])
i = element["end_line"] + 1
elif line.strip().startswith("```"):
# code block
element = self._extract_code_block(i)
sections.append(element["content"])
i = element["end_line"] + 1
elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
# list block
element = self._extract_list_block(i)
sections.append(element["content"])
i = element["end_line"] + 1
elif line.strip().startswith(">"):
# blockquote
element = self._extract_blockquote(i)
sections.append(element["content"])
i = element["end_line"] + 1
elif line.strip():
# text block (paragraphs and inline elements until next block element)
element = self._extract_text_block(i)
sections.append(element["content"])
i = element["end_line"] + 1
else:
i += 1
sections = [section for section in sections if section.strip()]
return sections
def _extract_header(self, start_pos):
return {
"type": "header",
"content": self.lines[start_pos],
"start_line": start_pos,
"end_line": start_pos,
}
def _extract_code_block(self, start_pos):
end_pos = start_pos
content_lines = [self.lines[start_pos]]
# Find the end of the code block
for i in range(start_pos + 1, len(self.lines)):
content_lines.append(self.lines[i])
end_pos = i
if self.lines[i].strip().startswith("```"):
break
return {
"type": "code_block",
"content": "\n".join(content_lines),
"start_line": start_pos,
"end_line": end_pos,
}
def _extract_list_block(self, start_pos):
end_pos = start_pos
content_lines = []
i = start_pos
while i < len(self.lines):
line = self.lines[i]
# check if this line is a list item or continuation of a list
if (
re.match(r"^\s*[-*+]\s+.*$", line)
or re.match(r"^\s*\d+\.\s+.*$", line)
or (i > start_pos and not line.strip())
or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
or (i > start_pos and re.match(r"^\s+\w+.*$", line))
):
content_lines.append(line)
end_pos = i
i += 1
else:
break
return {
"type": "list_block",
"content": "\n".join(content_lines),
"start_line": start_pos,
"end_line": end_pos,
}
def _extract_blockquote(self, start_pos):
end_pos = start_pos
content_lines = []
i = start_pos
while i < len(self.lines):
line = self.lines[i]
if line.strip().startswith(">") or (i > start_pos and not line.strip()):
content_lines.append(line)
end_pos = i
i += 1
else:
break
return {
"type": "blockquote",
"content": "\n".join(content_lines),
"start_line": start_pos,
"end_line": end_pos,
}
def _extract_text_block(self, start_pos):
"""Extract a text block (paragraphs, inline elements) until next block element"""
end_pos = start_pos
content_lines = [self.lines[start_pos]]
i = start_pos + 1
while i < len(self.lines):
line = self.lines[i]
# stop if we encounter a block element
if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
break
elif not line.strip():
# check if the next line is a block element
if i + 1 < len(self.lines) and (
re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
or self.lines[i + 1].strip().startswith("```")
or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
or self.lines[i + 1].strip().startswith(">")
):
break
else:
content_lines.append(line)
end_pos = i
i += 1
else:
content_lines.append(line)
end_pos = i
i += 1
return {
"type": "text_block",
"content": "\n".join(content_lines),
"start_line": start_pos,
"end_line": end_pos,
}

View File

@@ -0,0 +1,524 @@
import json
import logging
import os
import platform
import re
import subprocess
import sys
import tempfile
import threading
import time
import zipfile
from io import BytesIO
from os import PathLike
from pathlib import Path
from queue import Empty, Queue
from typing import Any, Callable, Optional
import numpy as np
import pdfplumber
import requests
from PIL import Image
from strenum import StrEnum
from .pdf_parser import RAGPdfParser
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
class MinerUContentType(StrEnum):
IMAGE = "image"
TABLE = "table"
TEXT = "text"
EQUATION = "equation"
CODE = "code"
LIST = "list"
DISCARDED = "discarded"
class MinerUParser(RAGPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path)
self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/")
self.using_api = False
self.logger = logging.getLogger(self.__class__.__name__)
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
with zipfile.ZipFile(zip_path, "r") as zip_ref:
if not root_dir:
files = zip_ref.namelist()
if files and files[0].endswith("/"):
root_dir = files[0]
else:
root_dir = None
if not root_dir or not root_dir.endswith("/"):
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
zip_ref.extractall(extract_to)
return
root_len = len(root_dir)
for member in zip_ref.infolist():
filename = member.filename
if filename == root_dir:
self.logger.info("[MinerU] Ignore root folder...")
continue
path = filename
if path.startswith(root_dir):
path = path[root_len:]
full_path = os.path.join(extract_to, path)
if member.is_dir():
os.makedirs(full_path, exist_ok=True)
else:
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "wb") as f:
f.write(zip_ref.read(filename))
def _is_http_endpoint_valid(self, url, timeout=5):
try:
response = requests.head(url, timeout=timeout, allow_redirects=True)
return response.status_code in [200, 301, 302, 307, 308]
except Exception:
return False
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
reason = ""
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
if backend not in valid_backends:
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
logging.warning(reason)
return False, reason
subprocess_kwargs = {
"capture_output": True,
"text": True,
"check": True,
"encoding": "utf-8",
"errors": "ignore",
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
if server_url is None:
server_url = self.mineru_server_url
if backend == "vlm-http-client" and server_url:
try:
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
if server_accessible:
self.using_api = False # We are using http client, not API
return True, reason
else:
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
return False, reason
except Exception as e:
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try:
response = requests.get(server_url, timeout=5)
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False
return True, reason
except Exception as e:
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
return False, reason
try:
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
version_info = result.stdout.strip()
if version_info:
logging.info(f"[MinerU] Detected version: {version_info}")
else:
logging.info("[MinerU] Detected MinerU, but version info is empty.")
return True, reason
except subprocess.CalledProcessError as e:
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
except FileNotFoundError:
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
except Exception as e:
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
# If executable check fails, try API check
try:
if self.mineru_api:
# check openapi.json
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
if not openapi_exists:
reason = "[MinerU] Failed to detect vaild MinerU API server"
return openapi_exists, reason
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
self.using_api = openapi_exists
return openapi_exists, reason
else:
logging.info("[MinerU] api not exists.")
except Exception as e:
reason = f"[MinerU] Unexpected error during api check: {e}"
logging.error(f"[MinerU] Unexpected error during api check: {e}")
return False, reason
def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path)
if not os.path.exists(pdf_file_path):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method)
os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = {
"output_dir": "./output",
"lang_list": lang,
"backend": backend,
"parse_method": method,
"formula_enable": True,
"table_enable": True,
"server_url": None,
"return_md": True,
"return_middle_json": True,
"return_model_output": True,
"return_content_list": True,
"return_images": True,
"response_format_zip": True,
"start_page_id": 0,
"end_page_id": 99999,
}
headers = {"Accept": "application/json"}
try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
response.raise_for_status()
if response.headers.get("Content-Type") == "application/zip":
self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
if callback:
callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
with open(OUTPUT_ZIP_PATH, "wb") as f:
f.write(response.content)
self.logger.info(f"[MinerU] Unzip to {output_path}...")
self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
self.logger.warning("[MinerU] not zip returned from api%s " % response.headers.get("Content-Type"))
except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.")
def _run_mineru_executable(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
if backend:
cmd.extend(["-b", backend])
if lang:
cmd.extend(["-l", lang])
if server_url and backend == "vlm-http-client":
cmd.extend(["-u", server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
subprocess_kwargs = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"text": True,
"encoding": "utf-8",
"errors": "ignore",
"bufsize": 1,
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
process = subprocess.Popen(cmd, **subprocess_kwargs)
stdout_queue, stderr_queue = Queue(), Queue()
def enqueue_output(pipe, queue, prefix):
for line in iter(pipe.readline, ""):
if line.strip():
queue.put((prefix, line.strip()))
pipe.close()
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
while process.poll() is None:
for q in (stdout_queue, stderr_queue):
try:
while True:
prefix, line = q.get_nowait()
if prefix == "STDOUT":
self.logger.info(f"[MinerU] {line}")
else:
self.logger.warning(f"[MinerU] {line}")
except Empty:
pass
time.sleep(0.1)
return_code = process.wait()
if return_code != 0:
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
self.logger.info("[MinerU] Command completed successfully.")
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from
self.page_to = page_to
try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e:
self.page_images = None
self.total_page = 0
logging.exception(e)
def _line_tag(self, bx):
pn = [bx["page_idx"] + 1]
positions = bx["bbox"]
x0, top, x1, bott = positions
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
page_width, page_height = self.page_images[bx["page_idx"]].size
x0 = (x0 / 1000.0) * page_width
x1 = (x1 / 1000.0) * page_width
top = (top / 1000.0) * page_height
bott = (bott / 1000.0) * page_height
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
def crop(self, text, ZM=1, need_position=False):
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if bottom <= top:
bottom = top + 2
for pn in pns[1:]:
bottom += self.page_images[pn - 1].size[1]
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
crop0 = img0.crop((x0, y0, x1, y1))
imgs.append(crop0)
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
bottom -= img0.size[1]
for pn in pns[1:]:
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))
imgs.append(cimgp)
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]
if not imgs:
if need_position:
return None, None
return
height = 0
for img in imgs:
height += img.size[1] + GAP
height = int(height)
width = int(np.max([i.size[0] for i in imgs]))
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
img = Image.alpha_composite(img, overlay).convert("RGB")
pic.paste(img, (0, int(height)))
height += img.size[1] + GAP
if need_position:
return pic, positions
return pic
@staticmethod
def extract_positions(txt: str):
poss = []
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
subdir = output_dir / file_stem / method
if backend.startswith("vlm-"):
subdir = output_dir / file_stem / "vlm"
json_file = subdir / f"{file_stem}_content_list.json"
if not json_file.exists():
raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
for item in data:
for key in ("img_path", "table_img_path", "equation_img_path"):
if key in item and item[key]:
item[key] = str((subdir / item[key]).resolve())
return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
sections = []
for output in outputs:
match output["type"]:
case MinerUContentType.TEXT:
section = output["text"]
case MinerUContentType.TABLE:
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
if not section.strip():
section = "FAILED TO PARSE TABLE"
case MinerUContentType.IMAGE:
section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"])
case MinerUContentType.EQUATION:
section = output["text"]
case MinerUContentType.CODE:
section = output["code_body"] + "\n".join(output.get("code_caption", []))
case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", []))
case MinerUContentType.DISCARDED:
pass
if section:
sections.append((section, self._line_tag(output)))
return sections
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
return []
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
) -> tuple:
import shutil
temp_pdf = None
created_tmp_dir = False
# remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath)
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)
if binary:
temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
temp_pdf = temp_dir / pdf_file_name
with open(temp_pdf, "wb") as f:
f.write(binary)
pdf = temp_pdf
self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
if callback:
callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
else:
if pdf_file_path_valid != filepath:
self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
shutil.move(filepath, pdf_file_path_valid)
pdf = Path(pdf_file_path_valid)
if not pdf.exists():
if callback:
callback(-1, f"[MinerU] PDF not found: {pdf}")
raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")
if output_dir:
out_dir = Path(output_dir)
out_dir.mkdir(parents=True, exist_ok=True)
else:
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
created_tmp_dir = True
self.logger.info(f"[MinerU] Output directory: {out_dir}")
if callback:
callback(0.15, f"[MinerU] Output directory: {out_dir}")
self.__images__(pdf, zoomin=1)
try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():
try:
temp_pdf.unlink()
temp_pdf.parent.rmdir()
except Exception:
pass
if delete_output and created_tmp_dir and out_dir.exists():
try:
shutil.rmtree(out_dir)
except Exception:
pass
if __name__ == "__main__":
parser = MinerUParser("mineru")
ok, reason = parser.check_installation()
print("MinerU available:", ok)
filepath = ""
with open(filepath, "rb") as file:
outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
for output in outputs:
print(output)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,83 @@
import logging
from io import BytesIO
from pptx import Presentation
class RAGPptParser:
def __init__(self):
super().__init__()
def __get_bulleted_text(self, paragraph):
is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
if is_bulleted:
return f"{' '* paragraph.level}.{paragraph.text}"
else:
return paragraph.text
def __extract(self, shape):
try:
# First try to get text content
if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
text_frame = shape.text_frame
texts = []
for paragraph in text_frame.paragraphs:
if paragraph.text.strip():
texts.append(self.__get_bulleted_text(paragraph))
return "\n".join(texts)
# Safely get shape_type
try:
shape_type = shape.shape_type
except NotImplementedError:
# If shape_type is not available, try to get text content
if hasattr(shape, 'text'):
return shape.text.strip()
return ""
# Handle table
if shape_type == 19:
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
# Handle group shape
if shape_type == 6:
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract(p)
if t:
texts.append(t)
return "\n".join(texts)
return ""
except Exception as e:
logging.error(f"Error processing shape: {str(e)}")
return ""
def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(
fnm, str) else Presentation(
BytesIO(fnm))
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides):
if i < from_page:
continue
if i >= to_page:
break
texts = []
for shape in sorted(
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
try:
txt = self.__extract(shape)
if txt:
texts.append(txt)
except Exception as e:
logging.exception(e)
txts.append("\n".join(texts))
return txts

View File

@@ -0,0 +1,48 @@
import re
from .utils import get_text
from app.core.rag.common.token_utils import num_tokens_from_string
class RAGTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
txt = get_text(fnm, binary)
return self.parser_txt(txt, chunk_token_num, delimiter)
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
cks = [""]
tk_nums = [0]
delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
def add_chunk(t):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tk_nums[-1] > chunk_token_num:
cks.append(t)
tk_nums.append(tnum)
else:
cks[-1] += t
tk_nums[-1] += tnum
dels = []
s = 0
for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
f, t = m.span()
dels.append(m.group(1))
dels.extend(list(delimiter[s: f]))
s = t
if s < len(delimiter):
dels.extend(list(delimiter[s:]))
dels = [re.escape(d) for d in dels if d]
dels = [d for d in dels if d]
dels = "|".join(dels)
secs = re.split(r"(%s)" % dels, txt)
for sec in secs:
if re.match(f"^{dels}$", sec):
continue
add_chunk(sec)
return [[c, ""] for c in cks]

View File

@@ -0,0 +1,16 @@
from app.core.rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt