Initial commit
This commit is contained in:
24
app/core/rag/deepdoc/parser/__init__.py
Normal file
24
app/core/rag/deepdoc/parser/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from .docx_parser import RAGDocxParser as DocxParser
|
||||
from .excel_parser import RAGExcelParser as ExcelParser
|
||||
from .html_parser import RAGHtmlParser as HtmlParser
|
||||
from .json_parser import RAGJsonParser as JsonParser
|
||||
from .markdown_parser import MarkdownElementExtractor
|
||||
from .markdown_parser import RAGMarkdownParser as MarkdownParser
|
||||
from .pdf_parser import PlainParser
|
||||
from .pdf_parser import RAGPdfParser as PdfParser
|
||||
from .ppt_parser import RAGPptParser as PptParser
|
||||
from .txt_parser import RAGTxtParser as TxtParser
|
||||
|
||||
__all__ = [
|
||||
"PdfParser",
|
||||
"PlainParser",
|
||||
"DocxParser",
|
||||
"ExcelParser",
|
||||
"PptParser",
|
||||
"HtmlParser",
|
||||
"JsonParser",
|
||||
"MarkdownParser",
|
||||
"TxtParser",
|
||||
"MarkdownElementExtractor",
|
||||
]
|
||||
|
||||
123
app/core/rag/deepdoc/parser/docx_parser.py
Normal file
123
app/core/rag/deepdoc/parser/docx_parser.py
Normal file
@@ -0,0 +1,123 @@
|
||||
from docx import Document
|
||||
import re
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from app.core.rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class RAGDocxParser:
|
||||
|
||||
def __extract_table_content(self, tb):
|
||||
df = []
|
||||
for row in tb.rows:
|
||||
df.append([c.text for c in row.cells])
|
||||
return self.__compose_table_content(pd.DataFrame(df))
|
||||
|
||||
def __compose_table_content(self, df):
|
||||
|
||||
def blockType(b):
|
||||
pattern = [
|
||||
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
||||
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
||||
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
|
||||
("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
||||
(r"^第*[一二三四1-4]季度$", "Dt"),
|
||||
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
|
||||
(r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
|
||||
("^[0-9.,+%/ -]+$", "Nu"),
|
||||
(r"^[0-9A-Z/\._~-]+$", "Ca"),
|
||||
(r"^[A-Z]*[a-z' -]+$", "En"),
|
||||
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
|
||||
(r"^.{1}$", "Sg")
|
||||
]
|
||||
for p, n in pattern:
|
||||
if re.search(p, b):
|
||||
return n
|
||||
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
|
||||
if len(tks) > 3:
|
||||
if len(tks) < 12:
|
||||
return "Tx"
|
||||
else:
|
||||
return "Lx"
|
||||
|
||||
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
||||
return "Nr"
|
||||
|
||||
return "Ot"
|
||||
|
||||
if len(df) < 2:
|
||||
return []
|
||||
max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
|
||||
1, len(df)) for j in range(len(df.iloc[i, :]))])
|
||||
max_type = max(max_type.items(), key=lambda x: x[1])[0]
|
||||
|
||||
colnm = len(df.iloc[0, :])
|
||||
hdrows = [0] # header is not necessarily appear in the first line
|
||||
if max_type == "Nu":
|
||||
for r in range(1, len(df)):
|
||||
tys = Counter([blockType(str(df.iloc[r, j]))
|
||||
for j in range(len(df.iloc[r, :]))])
|
||||
tys = max(tys.items(), key=lambda x: x[1])[0]
|
||||
if tys != max_type:
|
||||
hdrows.append(r)
|
||||
|
||||
lines = []
|
||||
for i in range(1, len(df)):
|
||||
if i in hdrows:
|
||||
continue
|
||||
hr = [r - i for r in hdrows]
|
||||
hr = [r for r in hr if r < 0]
|
||||
t = len(hr) - 1
|
||||
while t > 0:
|
||||
if hr[t] - hr[t - 1] > 1:
|
||||
hr = hr[t:]
|
||||
break
|
||||
t -= 1
|
||||
headers = []
|
||||
for j in range(len(df.iloc[i, :])):
|
||||
t = []
|
||||
for h in hr:
|
||||
x = str(df.iloc[i + h, j]).strip()
|
||||
if x in t:
|
||||
continue
|
||||
t.append(x)
|
||||
t = ",".join(t)
|
||||
if t:
|
||||
t += ": "
|
||||
headers.append(t)
|
||||
cells = []
|
||||
for j in range(len(df.iloc[i, :])):
|
||||
if not str(df.iloc[i, j]):
|
||||
continue
|
||||
cells.append(headers[j] + str(df.iloc[i, j]))
|
||||
lines.append(";".join(cells))
|
||||
|
||||
if colnm > 3:
|
||||
return lines
|
||||
return ["\n".join(lines)]
|
||||
|
||||
def __call__(self, fnm, from_page=0, to_page=100000000):
|
||||
self.doc = Document(fnm) if isinstance(
|
||||
fnm, str) else Document(BytesIO(fnm))
|
||||
pn = 0 # parsed page
|
||||
secs = [] # parsed contents
|
||||
for p in self.doc.paragraphs:
|
||||
if pn > to_page:
|
||||
break
|
||||
|
||||
runs_within_single_paragraph = [] # save runs within the range of pages
|
||||
for run in p.runs:
|
||||
if pn > to_page:
|
||||
break
|
||||
if from_page <= pn < to_page and p.text.strip():
|
||||
runs_within_single_paragraph.append(run.text) # append run.text first
|
||||
|
||||
# wrap page break checker into a static method
|
||||
if 'lastRenderedPageBreak' in run._element.xml:
|
||||
pn += 1
|
||||
|
||||
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
|
||||
|
||||
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
||||
return secs, tbls
|
||||
210
app/core/rag/deepdoc/parser/excel_parser.py
Normal file
210
app/core/rag/deepdoc/parser/excel_parser.py
Normal file
@@ -0,0 +1,210 @@
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook, load_workbook
|
||||
|
||||
from app.core.rag.nlp import find_codec
|
||||
|
||||
# copied from `/openpyxl/cell/cell.py`
|
||||
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
|
||||
|
||||
|
||||
class RAGExcelParser:
|
||||
@staticmethod
|
||||
def _load_excel_to_workbook(file_like_object):
|
||||
if isinstance(file_like_object, bytes):
|
||||
file_like_object = BytesIO(file_like_object)
|
||||
|
||||
# Read first 4 bytes to determine file type
|
||||
file_like_object.seek(0)
|
||||
file_head = file_like_object.read(4)
|
||||
file_like_object.seek(0)
|
||||
|
||||
if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
|
||||
logging.info("Not an Excel file, converting CSV to Excel Workbook")
|
||||
|
||||
try:
|
||||
file_like_object.seek(0)
|
||||
df = pd.read_csv(file_like_object)
|
||||
return RAGExcelParser._dataframe_to_workbook(df)
|
||||
|
||||
except Exception as e_csv:
|
||||
raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")
|
||||
|
||||
try:
|
||||
return load_workbook(file_like_object, data_only=True)
|
||||
except Exception as e:
|
||||
logging.info(f"openpyxl load error: {e}, try pandas instead")
|
||||
try:
|
||||
file_like_object.seek(0)
|
||||
try:
|
||||
dfs = pd.read_excel(file_like_object, sheet_name=None)
|
||||
return RAGExcelParser._dataframe_to_workbook(dfs)
|
||||
except Exception as ex:
|
||||
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
|
||||
file_like_object.seek(0)
|
||||
df = pd.read_excel(file_like_object, engine="calamine")
|
||||
return RAGExcelParser._dataframe_to_workbook(df)
|
||||
except Exception as e_pandas:
|
||||
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
||||
|
||||
@staticmethod
|
||||
def _clean_dataframe(df: pd.DataFrame):
|
||||
def clean_string(s):
|
||||
if isinstance(s, str):
|
||||
return ILLEGAL_CHARACTERS_RE.sub(" ", s)
|
||||
return s
|
||||
|
||||
return df.apply(lambda col: col.map(clean_string))
|
||||
|
||||
@staticmethod
|
||||
def _dataframe_to_workbook(df):
|
||||
# if contains multiple sheets use _dataframes_to_workbook
|
||||
if isinstance(df, dict) and len(df) > 1:
|
||||
return RAGExcelParser._dataframes_to_workbook(df)
|
||||
|
||||
df = RAGExcelParser._clean_dataframe(df)
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Data"
|
||||
|
||||
for col_num, column_name in enumerate(df.columns, 1):
|
||||
ws.cell(row=1, column=col_num, value=column_name)
|
||||
|
||||
for row_num, row in enumerate(df.values, 2):
|
||||
for col_num, value in enumerate(row, 1):
|
||||
ws.cell(row=row_num, column=col_num, value=value)
|
||||
|
||||
return wb
|
||||
|
||||
@staticmethod
|
||||
def _dataframes_to_workbook(dfs: dict):
|
||||
wb = Workbook()
|
||||
default_sheet = wb.active
|
||||
wb.remove(default_sheet)
|
||||
|
||||
for sheet_name, df in dfs.items():
|
||||
df = RAGExcelParser._clean_dataframe(df)
|
||||
ws = wb.create_sheet(title=sheet_name)
|
||||
for col_num, column_name in enumerate(df.columns, 1):
|
||||
ws.cell(row=1, column=col_num, value=column_name)
|
||||
for row_num, row in enumerate(df.values, 2):
|
||||
for col_num, value in enumerate(row, 1):
|
||||
ws.cell(row=row_num, column=col_num, value=value)
|
||||
return wb
|
||||
|
||||
def html(self, fnm, chunk_rows=256):
|
||||
from html import escape
|
||||
|
||||
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||
wb = RAGExcelParser._load_excel_to_workbook(file_like_object)
|
||||
tb_chunks = []
|
||||
|
||||
def _fmt(v):
|
||||
if v is None:
|
||||
return ""
|
||||
return str(v).strip()
|
||||
|
||||
for sheetname in wb.sheetnames:
|
||||
ws = wb[sheetname]
|
||||
try:
|
||||
rows = list(ws.rows)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
|
||||
if not rows:
|
||||
continue
|
||||
|
||||
tb_rows_0 = "<tr>"
|
||||
for t in list(rows[0]):
|
||||
tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
|
||||
tb_rows_0 += "</tr>"
|
||||
|
||||
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
|
||||
tb = ""
|
||||
tb += f"<table><caption>{sheetname}</caption>"
|
||||
tb += tb_rows_0
|
||||
for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
|
||||
tb += "<tr>"
|
||||
for i, c in enumerate(r):
|
||||
if c.value is None:
|
||||
tb += "<td></td>"
|
||||
else:
|
||||
tb += f"<td>{escape(_fmt(c.value))}</td>"
|
||||
tb += "</tr>"
|
||||
tb += "</table>\n"
|
||||
tb_chunks.append(tb)
|
||||
|
||||
return tb_chunks
|
||||
|
||||
def markdown(self, fnm):
|
||||
import pandas as pd
|
||||
|
||||
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||
try:
|
||||
file_like_object.seek(0)
|
||||
df = pd.read_excel(file_like_object)
|
||||
except Exception as e:
|
||||
logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
|
||||
file_like_object.seek(0)
|
||||
df = pd.read_csv(file_like_object)
|
||||
df = df.replace(r"^\s*$", "", regex=True)
|
||||
return df.to_markdown(index=False)
|
||||
|
||||
def __call__(self, fnm):
|
||||
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||
wb = RAGExcelParser._load_excel_to_workbook(file_like_object)
|
||||
|
||||
res = []
|
||||
for sheetname in wb.sheetnames:
|
||||
ws = wb[sheetname]
|
||||
try:
|
||||
rows = list(ws.rows)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
if not rows:
|
||||
continue
|
||||
ti = list(rows[0])
|
||||
for r in list(rows[1:]):
|
||||
fields = []
|
||||
for i, c in enumerate(r):
|
||||
if not c.value:
|
||||
continue
|
||||
t = str(ti[i].value) if i < len(ti) else ""
|
||||
t += (":" if t else "") + str(c.value)
|
||||
fields.append(t)
|
||||
line = "; ".join(fields)
|
||||
if sheetname.lower().find("sheet") < 0:
|
||||
line += " ——" + sheetname
|
||||
res.append(line)
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def row_number(fnm, binary):
|
||||
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
||||
wb = RAGExcelParser._load_excel_to_workbook(BytesIO(binary))
|
||||
total = 0
|
||||
|
||||
for sheetname in wb.sheetnames:
|
||||
try:
|
||||
ws = wb[sheetname]
|
||||
total += len(list(ws.rows))
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
return total
|
||||
|
||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
return len(txt.split("\n"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
psr = RAGExcelParser()
|
||||
psr(sys.argv[1])
|
||||
118
app/core/rag/deepdoc/parser/figure_parser.py
Normal file
118
app/core/rag/deepdoc/parser/figure_parser.py
Normal file
@@ -0,0 +1,118 @@
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from app.core.rag.common.constants import LLMType
|
||||
from app.core.rag.common.connection_utils import timeout
|
||||
from app.core.rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
|
||||
from app.core.rag.prompts.generator import vision_llm_figure_describe_prompt
|
||||
|
||||
|
||||
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
|
||||
return [
|
||||
(
|
||||
(figure_data[1], [figure_data[0]]),
|
||||
[(0, 0, 0, 0, 0)],
|
||||
)
|
||||
for figure_data in figures_data_without_positions
|
||||
if isinstance(figure_data[1], Image.Image)
|
||||
]
|
||||
|
||||
def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,vision_model=None,**kwargs):
|
||||
if vision_model:
|
||||
figures_data = vision_figure_parser_figure_data_wrapper(sections)
|
||||
try:
|
||||
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
|
||||
boosted_figures = docx_vision_parser(callback=callback)
|
||||
tbls.extend(boosted_figures)
|
||||
except Exception as e:
|
||||
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
|
||||
return tbls
|
||||
|
||||
def vision_figure_parser_pdf_wrapper(tbls,callback=None,vision_model=None,**kwargs):
|
||||
if vision_model:
|
||||
def is_figure_item(item):
|
||||
return (
|
||||
isinstance(item[0][0], Image.Image) and
|
||||
isinstance(item[0][1], list)
|
||||
)
|
||||
figures_data = [item for item in tbls if is_figure_item(item)]
|
||||
try:
|
||||
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
|
||||
boosted_figures = docx_vision_parser(callback=callback)
|
||||
tbls = [item for item in tbls if not is_figure_item(item)]
|
||||
tbls.extend(boosted_figures)
|
||||
except Exception as e:
|
||||
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
|
||||
return tbls
|
||||
|
||||
shared_executor = ThreadPoolExecutor(max_workers=10)
|
||||
|
||||
|
||||
class VisionFigureParser:
|
||||
def __init__(self, vision_model, figures_data, *args, **kwargs):
|
||||
self.vision_model = vision_model
|
||||
self._extract_figures_info(figures_data)
|
||||
assert len(self.figures) == len(self.descriptions)
|
||||
assert not self.positions or (len(self.figures) == len(self.positions))
|
||||
|
||||
def _extract_figures_info(self, figures_data):
|
||||
self.figures = []
|
||||
self.descriptions = []
|
||||
self.positions = []
|
||||
|
||||
for item in figures_data:
|
||||
# position
|
||||
if len(item) == 2 and isinstance(item[0], tuple) and len(item[0]) == 2 and isinstance(item[1], list) and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
|
||||
img_desc = item[0]
|
||||
assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
|
||||
self.figures.append(img_desc[0])
|
||||
self.descriptions.append(img_desc[1])
|
||||
self.positions.append(item[1])
|
||||
else:
|
||||
assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
|
||||
self.figures.append(item[0])
|
||||
self.descriptions.append(item[1])
|
||||
|
||||
def _assemble(self):
|
||||
self.assembled = []
|
||||
self.has_positions = len(self.positions) != 0
|
||||
for i in range(len(self.figures)):
|
||||
figure = self.figures[i]
|
||||
desc = self.descriptions[i]
|
||||
pos = self.positions[i] if self.has_positions else None
|
||||
|
||||
figure_desc = (figure, desc)
|
||||
|
||||
if pos is not None:
|
||||
self.assembled.append((figure_desc, pos))
|
||||
else:
|
||||
self.assembled.append((figure_desc,))
|
||||
|
||||
return self.assembled
|
||||
|
||||
def __call__(self, **kwargs):
|
||||
callback = kwargs.get("callback", lambda prog, msg: None)
|
||||
|
||||
@timeout(30, 3)
|
||||
def process(figure_idx, figure_binary):
|
||||
description_text = picture_vision_llm_chunk(
|
||||
binary=figure_binary,
|
||||
vision_model=self.vision_model,
|
||||
prompt=vision_llm_figure_describe_prompt(),
|
||||
callback=callback,
|
||||
)
|
||||
return figure_idx, description_text
|
||||
|
||||
futures = []
|
||||
for idx, img_binary in enumerate(self.figures or []):
|
||||
futures.append(shared_executor.submit(process, idx, img_binary))
|
||||
|
||||
for future in as_completed(futures):
|
||||
figure_num, txt = future.result()
|
||||
if txt:
|
||||
self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])
|
||||
|
||||
self._assemble()
|
||||
|
||||
return self.assembled
|
||||
197
app/core/rag/deepdoc/parser/html_parser.py
Normal file
197
app/core/rag/deepdoc/parser/html_parser.py
Normal file
@@ -0,0 +1,197 @@
|
||||
from app.core.rag.nlp import find_codec, rag_tokenizer
|
||||
import uuid
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
import html
|
||||
|
||||
def get_encoding(file):
|
||||
with open(file,'rb') as f:
|
||||
tmp = chardet.detect(f.read())
|
||||
return tmp['encoding']
|
||||
|
||||
BLOCK_TAGS = [
|
||||
"h1", "h2", "h3", "h4", "h5", "h6",
|
||||
"p", "div", "article", "section", "aside",
|
||||
"ul", "ol", "li",
|
||||
"table", "pre", "code", "blockquote",
|
||||
"figure", "figcaption"
|
||||
]
|
||||
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "#####", "h5": "#####", "h6": "######"}
|
||||
|
||||
|
||||
class RAGHtmlParser:
|
||||
def __call__(self, fnm, binary=None, chunk_token_num=512):
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
|
||||
txt = f.read()
|
||||
return self.parser_txt(txt, chunk_token_num)
|
||||
|
||||
@classmethod
|
||||
def parser_txt(cls, txt, chunk_token_num):
|
||||
if not isinstance(txt, str):
|
||||
raise TypeError("txt type should be string!")
|
||||
|
||||
temp_sections = []
|
||||
soup = BeautifulSoup(txt, "html5lib")
|
||||
# delete <style> tag
|
||||
for style_tag in soup.find_all(["style", "script"]):
|
||||
style_tag.decompose()
|
||||
# delete <script> tag in <div>
|
||||
for div_tag in soup.find_all("div"):
|
||||
for script_tag in div_tag.find_all("script"):
|
||||
script_tag.decompose()
|
||||
# delete inline style
|
||||
for tag in soup.find_all(True):
|
||||
if 'style' in tag.attrs:
|
||||
del tag.attrs['style']
|
||||
# delete HTML comment
|
||||
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
||||
comment.extract()
|
||||
|
||||
cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
|
||||
block_txt_list, table_list = cls.merge_block_text(temp_sections)
|
||||
sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
|
||||
for table in table_list:
|
||||
sections.append(table.get("content", ""))
|
||||
return sections
|
||||
|
||||
@classmethod
|
||||
def split_table(cls, html_table, chunk_token_num=512):
|
||||
soup = BeautifulSoup(html_table, "html.parser")
|
||||
rows = soup.find_all("tr")
|
||||
tables = []
|
||||
current_table = []
|
||||
current_count = 0
|
||||
table_str_list = []
|
||||
for row in rows:
|
||||
tks_str = rag_tokenizer.tokenize(str(row))
|
||||
token_count = len(tks_str.split(" ")) if tks_str else 0
|
||||
if current_count + token_count > chunk_token_num:
|
||||
tables.append(current_table)
|
||||
current_table = []
|
||||
current_count = 0
|
||||
current_table.append(row)
|
||||
current_count += token_count
|
||||
if current_table:
|
||||
tables.append(current_table)
|
||||
|
||||
for table_rows in tables:
|
||||
new_table = soup.new_tag("table")
|
||||
for row in table_rows:
|
||||
new_table.append(row)
|
||||
table_str_list.append(str(new_table))
|
||||
|
||||
return table_str_list
|
||||
|
||||
@classmethod
|
||||
def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
|
||||
if isinstance(element, NavigableString):
|
||||
content = element.strip()
|
||||
|
||||
def is_valid_html(content):
|
||||
try:
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
return bool(soup.find())
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
return_info = []
|
||||
if content:
|
||||
if is_valid_html(content):
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
|
||||
parser_result.extend(child_info)
|
||||
else:
|
||||
info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}}
|
||||
if parent_name:
|
||||
info["tag_name"] = parent_name
|
||||
return_info.append(info)
|
||||
return return_info
|
||||
elif isinstance(element, Tag):
|
||||
|
||||
if str.lower(element.name) == "table":
|
||||
table_info_list = []
|
||||
table_id = str(uuid.uuid1())
|
||||
table_list = [html.unescape(str(element))]
|
||||
for t in table_list:
|
||||
table_info_list.append({"content": t, "tag_name": "table",
|
||||
"metadata": {"table_id": table_id, "index": table_list.index(t)}})
|
||||
return table_info_list
|
||||
else:
|
||||
block_id = None
|
||||
if str.lower(element.name) in BLOCK_TAGS:
|
||||
block_id = str(uuid.uuid1())
|
||||
for child in element.children:
|
||||
child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
|
||||
block_id)
|
||||
parser_result.extend(child_info)
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def merge_block_text(cls, parser_result):
|
||||
block_content = []
|
||||
current_content = ""
|
||||
table_info_list = []
|
||||
lask_block_id = None
|
||||
for item in parser_result:
|
||||
content = item.get("content")
|
||||
tag_name = item.get("tag_name")
|
||||
title_flag = tag_name in TITLE_TAGS
|
||||
block_id = item.get("metadata", {}).get("block_id")
|
||||
if block_id:
|
||||
if title_flag:
|
||||
content = f"{TITLE_TAGS[tag_name]} {content}"
|
||||
if lask_block_id != block_id:
|
||||
if lask_block_id is not None:
|
||||
block_content.append(current_content)
|
||||
current_content = content
|
||||
lask_block_id = block_id
|
||||
else:
|
||||
current_content += (" " if current_content else "") + content
|
||||
else:
|
||||
if tag_name == "table":
|
||||
table_info_list.append(item)
|
||||
else:
|
||||
current_content += (" " if current_content else "" + content)
|
||||
if current_content:
|
||||
block_content.append(current_content)
|
||||
return block_content, table_info_list
|
||||
|
||||
@classmethod
|
||||
def chunk_block(cls, block_txt_list, chunk_token_num=512):
|
||||
chunks = []
|
||||
current_block = ""
|
||||
current_token_count = 0
|
||||
|
||||
for block in block_txt_list:
|
||||
tks_str = rag_tokenizer.tokenize(block)
|
||||
block_token_count = len(tks_str.split(" ")) if tks_str else 0
|
||||
if block_token_count > chunk_token_num:
|
||||
if current_block:
|
||||
chunks.append(current_block)
|
||||
start = 0
|
||||
tokens = tks_str.split(" ")
|
||||
while start < len(tokens):
|
||||
end = start + chunk_token_num
|
||||
split_tokens = tokens[start:end]
|
||||
chunks.append(" ".join(split_tokens))
|
||||
start = end
|
||||
current_block = ""
|
||||
current_token_count = 0
|
||||
else:
|
||||
if current_token_count + block_token_count <= chunk_token_num:
|
||||
current_block += ("\n" if current_block else "") + block
|
||||
current_token_count += block_token_count
|
||||
else:
|
||||
chunks.append(current_block)
|
||||
current_block = block
|
||||
current_token_count = block_token_count
|
||||
|
||||
if current_block:
|
||||
chunks.append(current_block)
|
||||
|
||||
return chunks
|
||||
|
||||
159
app/core/rag/deepdoc/parser/json_parser.py
Normal file
159
app/core/rag/deepdoc/parser/json_parser.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from app.core.rag.nlp import find_codec
|
||||
|
||||
|
||||
class RAGJsonParser:
|
||||
def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
|
||||
super().__init__()
|
||||
self.max_chunk_size = max_chunk_size * 2
|
||||
self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)
|
||||
|
||||
def __call__(self, filename):
|
||||
with open(filename, "r") as f:
|
||||
txt = f.read()
|
||||
|
||||
if self.is_jsonl_format(txt):
|
||||
sections = self._parse_jsonl(txt)
|
||||
else:
|
||||
sections = self._parse_json(txt)
|
||||
return sections
|
||||
|
||||
@staticmethod
|
||||
def _json_size(data: dict) -> int:
|
||||
"""Calculate the size of the serialized JSON object."""
|
||||
return len(json.dumps(data, ensure_ascii=False))
|
||||
|
||||
@staticmethod
|
||||
def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
|
||||
"""Set a value in a nested dictionary based on the given path."""
|
||||
for key in path[:-1]:
|
||||
d = d.setdefault(key, {})
|
||||
d[path[-1]] = value
|
||||
|
||||
def _list_to_dict_preprocessing(self, data: Any) -> Any:
|
||||
if isinstance(data, dict):
|
||||
# Process each key-value pair in the dictionary
|
||||
return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
|
||||
elif isinstance(data, list):
|
||||
# Convert the list to a dictionary with index-based keys
|
||||
return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
|
||||
else:
|
||||
# Base case: the item is neither a dict nor a list, so return it unchanged
|
||||
return data
|
||||
|
||||
def _json_split(
|
||||
self,
|
||||
data,
|
||||
current_path: list[str] | None,
|
||||
chunks: list[dict] | None,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Split json into maximum size dictionaries while preserving structure.
|
||||
"""
|
||||
current_path = current_path or []
|
||||
chunks = chunks or [{}]
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
new_path = current_path + [key]
|
||||
chunk_size = self._json_size(chunks[-1])
|
||||
size = self._json_size({key: value})
|
||||
remaining = self.max_chunk_size - chunk_size
|
||||
|
||||
if size < remaining:
|
||||
# Add item to current chunk
|
||||
self._set_nested_dict(chunks[-1], new_path, value)
|
||||
else:
|
||||
if chunk_size >= self.min_chunk_size:
|
||||
# Chunk is big enough, start a new chunk
|
||||
chunks.append({})
|
||||
|
||||
# Iterate
|
||||
self._json_split(value, new_path, chunks)
|
||||
else:
|
||||
# handle single item
|
||||
self._set_nested_dict(chunks[-1], current_path, data)
|
||||
return chunks
|
||||
|
||||
def split_json(
|
||||
self,
|
||||
json_data,
|
||||
convert_lists: bool = False,
|
||||
) -> list[dict]:
|
||||
"""Splits JSON into a list of JSON chunks"""
|
||||
|
||||
if convert_lists:
|
||||
preprocessed_data = self._list_to_dict_preprocessing(json_data)
|
||||
chunks = self._json_split(preprocessed_data, None, None)
|
||||
else:
|
||||
chunks = self._json_split(json_data, None, None)
|
||||
|
||||
# Remove the last chunk if it's empty
|
||||
if not chunks[-1]:
|
||||
chunks.pop()
|
||||
return chunks
|
||||
|
||||
def split_text(
|
||||
self,
|
||||
json_data: dict[str, Any],
|
||||
convert_lists: bool = False,
|
||||
ensure_ascii: bool = True,
|
||||
) -> list[str]:
|
||||
"""Splits JSON into a list of JSON formatted strings"""
|
||||
|
||||
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
|
||||
|
||||
# Convert to string
|
||||
return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
|
||||
|
||||
def _parse_json(self, content: str) -> list[str]:
|
||||
sections = []
|
||||
try:
|
||||
json_data = json.loads(content)
|
||||
chunks = self.split_json(json_data, True)
|
||||
sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return sections
|
||||
|
||||
def _parse_jsonl(self, content: str) -> list[str]:
|
||||
lines = content.strip().splitlines()
|
||||
all_chunks = []
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
chunks = self.split_json(data, convert_lists=True)
|
||||
all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return all_chunks
|
||||
|
||||
def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
|
||||
lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
|
||||
if not lines:
|
||||
return False
|
||||
|
||||
try:
|
||||
json.loads(txt)
|
||||
return False
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
sample_limit = min(len(lines), sample_limit)
|
||||
sample_lines = lines[:sample_limit]
|
||||
valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
|
||||
|
||||
if not valid_lines:
|
||||
return False
|
||||
|
||||
return (valid_lines / len(sample_lines)) >= threshold
|
||||
|
||||
def _is_valid_json(self, line: str) -> bool:
|
||||
try:
|
||||
json.loads(line)
|
||||
return True
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
277
app/core/rag/deepdoc/parser/markdown_parser.py
Normal file
277
app/core/rag/deepdoc/parser/markdown_parser.py
Normal file
@@ -0,0 +1,277 @@
|
||||
import re
|
||||
|
||||
from markdown import markdown
|
||||
|
||||
|
||||
class RAGMarkdownParser:
|
||||
def __init__(self, chunk_token_num=128):
|
||||
self.chunk_token_num = int(chunk_token_num)
|
||||
|
||||
def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
|
||||
tables = []
|
||||
working_text = markdown_text
|
||||
|
||||
def replace_tables_with_rendered_html(pattern, table_list, render=True):
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
table_list.append(raw_table)
|
||||
if separate_tables:
|
||||
# Skip this match (i.e., remove it)
|
||||
new_text += working_text[last_end : match.start()] + "\n\n"
|
||||
else:
|
||||
# Replace with rendered HTML
|
||||
html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
|
||||
new_text += working_text[last_end : match.start()] + html_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
return new_text
|
||||
|
||||
if "|" in markdown_text: # for optimize performance
|
||||
# Standard Markdown table
|
||||
border_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
(?:\|.*?\|.*?\|.*?\n)
|
||||
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
||||
(?:\|.*?\|.*?\|.*?\n)+
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
|
||||
|
||||
# Borderless Markdown table
|
||||
no_border_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
(?:\S.*?\|.*?\n)
|
||||
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
||||
(?:\S.*?\|.*?\n)+
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
|
||||
|
||||
# Replace any TAGS e.g. <table ...> to <table>
|
||||
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
|
||||
table_with_attributes_pattern = re.compile(
|
||||
rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
|
||||
)
|
||||
def replace_tag(m):
|
||||
tag_name = re.match(r"<(\w+)", m.group()).group(1)
|
||||
return "<{}>".format(tag_name)
|
||||
|
||||
working_text = re.sub(table_with_attributes_pattern, replace_tag, working_text)
|
||||
|
||||
if "<table>" in working_text.lower(): # for optimize performance
|
||||
# HTML table extraction - handle possible html/body wrapper tags
|
||||
html_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
\s*
|
||||
(?:
|
||||
# case1: <html><body><table>...</table></body></html>
|
||||
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
||||
|
|
||||
# case2: <body><table>...</table></body>
|
||||
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
||||
|
|
||||
# case3: only<table>...</table>
|
||||
(?:<table[^>]*>.*?</table>)
|
||||
)
|
||||
\s*
|
||||
(?=\n|$)
|
||||
""",
|
||||
re.VERBOSE | re.DOTALL | re.IGNORECASE,
|
||||
)
|
||||
|
||||
def replace_html_tables():
|
||||
nonlocal working_text
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in html_table_pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
tables.append(raw_table)
|
||||
if separate_tables:
|
||||
new_text += working_text[last_end : match.start()] + "\n\n"
|
||||
else:
|
||||
new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
working_text = new_text
|
||||
|
||||
replace_html_tables()
|
||||
|
||||
return working_text, tables
|
||||
|
||||
|
||||
class MarkdownElementExtractor:
|
||||
def __init__(self, markdown_content):
|
||||
self.markdown_content = markdown_content
|
||||
self.lines = markdown_content.split("\n")
|
||||
|
||||
def get_delimiters(self,delimiters):
|
||||
toks = re.findall(r"`([^`]+)`", delimiters)
|
||||
toks = sorted(set(toks), key=lambda x: -len(x))
|
||||
return "|".join(re.escape(t) for t in toks if t)
|
||||
|
||||
def extract_elements(self,delimiter=None):
|
||||
"""Extract individual elements (headers, code blocks, lists, etc.)"""
|
||||
sections = []
|
||||
|
||||
i = 0
|
||||
dels=""
|
||||
if delimiter:
|
||||
dels = self.get_delimiters(delimiter)
|
||||
if len(dels) > 0:
|
||||
text = "\n".join(self.lines)
|
||||
parts = re.split(dels, text)
|
||||
sections = [p.strip() for p in parts if p and p.strip()]
|
||||
return sections
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
|
||||
if re.match(r"^#{1,6}\s+.*$", line):
|
||||
# header
|
||||
element = self._extract_header(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip().startswith("```"):
|
||||
# code block
|
||||
element = self._extract_code_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
|
||||
# list block
|
||||
element = self._extract_list_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip().startswith(">"):
|
||||
# blockquote
|
||||
element = self._extract_blockquote(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip():
|
||||
# text block (paragraphs and inline elements until next block element)
|
||||
element = self._extract_text_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
else:
|
||||
i += 1
|
||||
|
||||
sections = [section for section in sections if section.strip()]
|
||||
return sections
|
||||
|
||||
def _extract_header(self, start_pos):
|
||||
return {
|
||||
"type": "header",
|
||||
"content": self.lines[start_pos],
|
||||
"start_line": start_pos,
|
||||
"end_line": start_pos,
|
||||
}
|
||||
|
||||
def _extract_code_block(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = [self.lines[start_pos]]
|
||||
|
||||
# Find the end of the code block
|
||||
for i in range(start_pos + 1, len(self.lines)):
|
||||
content_lines.append(self.lines[i])
|
||||
end_pos = i
|
||||
if self.lines[i].strip().startswith("```"):
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "code_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_list_block(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = []
|
||||
|
||||
i = start_pos
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
# check if this line is a list item or continuation of a list
|
||||
if (
|
||||
re.match(r"^\s*[-*+]\s+.*$", line)
|
||||
or re.match(r"^\s*\d+\.\s+.*$", line)
|
||||
or (i > start_pos and not line.strip())
|
||||
or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
|
||||
or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
|
||||
or (i > start_pos and re.match(r"^\s+\w+.*$", line))
|
||||
):
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "list_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_blockquote(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = []
|
||||
|
||||
i = start_pos
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
if line.strip().startswith(">") or (i > start_pos and not line.strip()):
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "blockquote",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_text_block(self, start_pos):
|
||||
"""Extract a text block (paragraphs, inline elements) until next block element"""
|
||||
end_pos = start_pos
|
||||
content_lines = [self.lines[start_pos]]
|
||||
|
||||
i = start_pos + 1
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
# stop if we encounter a block element
|
||||
if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
|
||||
break
|
||||
elif not line.strip():
|
||||
# check if the next line is a block element
|
||||
if i + 1 < len(self.lines) and (
|
||||
re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
|
||||
or self.lines[i + 1].strip().startswith("```")
|
||||
or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
|
||||
or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
|
||||
or self.lines[i + 1].strip().startswith(">")
|
||||
):
|
||||
break
|
||||
else:
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
|
||||
return {
|
||||
"type": "text_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
524
app/core/rag/deepdoc/parser/mineru_parser.py
Normal file
524
app/core/rag/deepdoc/parser/mineru_parser.py
Normal file
@@ -0,0 +1,524 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from queue import Empty, Queue
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
from strenum import StrEnum
|
||||
|
||||
from .pdf_parser import RAGPdfParser
|
||||
|
||||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||||
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
||||
|
||||
|
||||
class MinerUContentType(StrEnum):
|
||||
IMAGE = "image"
|
||||
TABLE = "table"
|
||||
TEXT = "text"
|
||||
EQUATION = "equation"
|
||||
CODE = "code"
|
||||
LIST = "list"
|
||||
DISCARDED = "discarded"
|
||||
|
||||
|
||||
class MinerUParser(RAGPdfParser):
|
||||
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
|
||||
self.mineru_path = Path(mineru_path)
|
||||
self.mineru_api = mineru_api.rstrip("/")
|
||||
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||||
self.using_api = False
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
|
||||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
if not root_dir:
|
||||
files = zip_ref.namelist()
|
||||
if files and files[0].endswith("/"):
|
||||
root_dir = files[0]
|
||||
else:
|
||||
root_dir = None
|
||||
|
||||
if not root_dir or not root_dir.endswith("/"):
|
||||
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
|
||||
zip_ref.extractall(extract_to)
|
||||
return
|
||||
|
||||
root_len = len(root_dir)
|
||||
for member in zip_ref.infolist():
|
||||
filename = member.filename
|
||||
if filename == root_dir:
|
||||
self.logger.info("[MinerU] Ignore root folder...")
|
||||
continue
|
||||
|
||||
path = filename
|
||||
if path.startswith(root_dir):
|
||||
path = path[root_len:]
|
||||
|
||||
full_path = os.path.join(extract_to, path)
|
||||
if member.is_dir():
|
||||
os.makedirs(full_path, exist_ok=True)
|
||||
else:
|
||||
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||||
with open(full_path, "wb") as f:
|
||||
f.write(zip_ref.read(filename))
|
||||
|
||||
def _is_http_endpoint_valid(self, url, timeout=5):
|
||||
try:
|
||||
response = requests.head(url, timeout=timeout, allow_redirects=True)
|
||||
return response.status_code in [200, 301, 302, 307, 308]
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
reason = ""
|
||||
|
||||
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
|
||||
if backend not in valid_backends:
|
||||
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||||
logging.warning(reason)
|
||||
return False, reason
|
||||
|
||||
subprocess_kwargs = {
|
||||
"capture_output": True,
|
||||
"text": True,
|
||||
"check": True,
|
||||
"encoding": "utf-8",
|
||||
"errors": "ignore",
|
||||
}
|
||||
|
||||
if platform.system() == "Windows":
|
||||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||||
|
||||
if server_url is None:
|
||||
server_url = self.mineru_server_url
|
||||
|
||||
if backend == "vlm-http-client" and server_url:
|
||||
try:
|
||||
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
||||
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
||||
if server_accessible:
|
||||
self.using_api = False # We are using http client, not API
|
||||
return True, reason
|
||||
else:
|
||||
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
||||
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
||||
return False, reason
|
||||
except Exception as e:
|
||||
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
||||
try:
|
||||
response = requests.get(server_url, timeout=5)
|
||||
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
||||
self.using_api = False
|
||||
return True, reason
|
||||
except Exception as e:
|
||||
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
||||
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
||||
return False, reason
|
||||
|
||||
try:
|
||||
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
||||
version_info = result.stdout.strip()
|
||||
if version_info:
|
||||
logging.info(f"[MinerU] Detected version: {version_info}")
|
||||
else:
|
||||
logging.info("[MinerU] Detected MinerU, but version info is empty.")
|
||||
return True, reason
|
||||
except subprocess.CalledProcessError as e:
|
||||
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
||||
except FileNotFoundError:
|
||||
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
||||
except Exception as e:
|
||||
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
|
||||
|
||||
# If executable check fails, try API check
|
||||
try:
|
||||
if self.mineru_api:
|
||||
# check openapi.json
|
||||
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
||||
if not openapi_exists:
|
||||
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
||||
return openapi_exists, reason
|
||||
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
||||
self.using_api = openapi_exists
|
||||
return openapi_exists, reason
|
||||
else:
|
||||
logging.info("[MinerU] api not exists.")
|
||||
except Exception as e:
|
||||
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||||
logging.error(f"[MinerU] Unexpected error during api check: {e}")
|
||||
return False, reason
|
||||
|
||||
def _run_mineru(
|
||||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||
):
|
||||
if self.using_api:
|
||||
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
||||
else:
|
||||
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
||||
|
||||
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
||||
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
|
||||
|
||||
pdf_file_path = str(input_path)
|
||||
|
||||
if not os.path.exists(pdf_file_path):
|
||||
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
||||
|
||||
pdf_file_name = Path(pdf_file_path).stem.strip()
|
||||
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
|
||||
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||||
|
||||
data = {
|
||||
"output_dir": "./output",
|
||||
"lang_list": lang,
|
||||
"backend": backend,
|
||||
"parse_method": method,
|
||||
"formula_enable": True,
|
||||
"table_enable": True,
|
||||
"server_url": None,
|
||||
"return_md": True,
|
||||
"return_middle_json": True,
|
||||
"return_model_output": True,
|
||||
"return_content_list": True,
|
||||
"return_images": True,
|
||||
"response_format_zip": True,
|
||||
"start_page_id": 0,
|
||||
"end_page_id": 99999,
|
||||
}
|
||||
|
||||
headers = {"Accept": "application/json"}
|
||||
try:
|
||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||
if callback:
|
||||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
|
||||
|
||||
response.raise_for_status()
|
||||
if response.headers.get("Content-Type") == "application/zip":
|
||||
self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
|
||||
|
||||
if callback:
|
||||
callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
|
||||
|
||||
with open(OUTPUT_ZIP_PATH, "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
||||
self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
|
||||
|
||||
if callback:
|
||||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||||
else:
|
||||
self.logger.warning("[MinerU] not zip returned from api:%s " % response.headers.get("Content-Type"))
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||
self.logger.info("[MinerU] Api completed successfully.")
|
||||
|
||||
def _run_mineru_executable(
|
||||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||
):
|
||||
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
||||
if backend:
|
||||
cmd.extend(["-b", backend])
|
||||
if lang:
|
||||
cmd.extend(["-l", lang])
|
||||
if server_url and backend == "vlm-http-client":
|
||||
cmd.extend(["-u", server_url])
|
||||
|
||||
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
|
||||
|
||||
subprocess_kwargs = {
|
||||
"stdout": subprocess.PIPE,
|
||||
"stderr": subprocess.PIPE,
|
||||
"text": True,
|
||||
"encoding": "utf-8",
|
||||
"errors": "ignore",
|
||||
"bufsize": 1,
|
||||
}
|
||||
|
||||
if platform.system() == "Windows":
|
||||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||||
|
||||
process = subprocess.Popen(cmd, **subprocess_kwargs)
|
||||
stdout_queue, stderr_queue = Queue(), Queue()
|
||||
|
||||
def enqueue_output(pipe, queue, prefix):
|
||||
for line in iter(pipe.readline, ""):
|
||||
if line.strip():
|
||||
queue.put((prefix, line.strip()))
|
||||
pipe.close()
|
||||
|
||||
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
|
||||
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
|
||||
|
||||
while process.poll() is None:
|
||||
for q in (stdout_queue, stderr_queue):
|
||||
try:
|
||||
while True:
|
||||
prefix, line = q.get_nowait()
|
||||
if prefix == "STDOUT":
|
||||
self.logger.info(f"[MinerU] {line}")
|
||||
else:
|
||||
self.logger.warning(f"[MinerU] {line}")
|
||||
except Empty:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
return_code = process.wait()
|
||||
if return_code != 0:
|
||||
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
|
||||
self.logger.info("[MinerU] Command completed successfully.")
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
try:
|
||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||
except Exception as e:
|
||||
self.page_images = None
|
||||
self.total_page = 0
|
||||
logging.exception(e)
|
||||
|
||||
def _line_tag(self, bx):
|
||||
pn = [bx["page_idx"] + 1]
|
||||
positions = bx["bbox"]
|
||||
x0, top, x1, bott = positions
|
||||
|
||||
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
|
||||
page_width, page_height = self.page_images[bx["page_idx"]].size
|
||||
x0 = (x0 / 1000.0) * page_width
|
||||
x1 = (x1 / 1000.0) * page_width
|
||||
top = (top / 1000.0) * page_height
|
||||
bott = (bott / 1000.0) * page_height
|
||||
|
||||
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
||||
|
||||
def crop(self, text, ZM=1, need_position=False):
|
||||
imgs = []
|
||||
poss = self.extract_positions(text)
|
||||
if not poss:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||||
GAP = 6
|
||||
pos = poss[0]
|
||||
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
pos = poss[-1]
|
||||
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
right = left + max_width
|
||||
|
||||
if bottom <= top:
|
||||
bottom = top + 2
|
||||
|
||||
for pn in pns[1:]:
|
||||
bottom += self.page_images[pn - 1].size[1]
|
||||
|
||||
img0 = self.page_images[pns[0]]
|
||||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
||||
crop0 = img0.crop((x0, y0, x1, y1))
|
||||
imgs.append(crop0)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
||||
|
||||
bottom -= img0.size[1]
|
||||
for pn in pns[1:]:
|
||||
page = self.page_images[pn]
|
||||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
||||
cimgp = page.crop((x0, y0, x1, y1))
|
||||
imgs.append(cimgp)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
||||
bottom -= page.size[1]
|
||||
|
||||
if not imgs:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
height = 0
|
||||
for img in imgs:
|
||||
height += img.size[1] + GAP
|
||||
height = int(height)
|
||||
width = int(np.max([i.size[0] for i in imgs]))
|
||||
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
||||
height = 0
|
||||
for ii, img in enumerate(imgs):
|
||||
if ii == 0 or ii + 1 == len(imgs):
|
||||
img = img.convert("RGBA")
|
||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||||
overlay.putalpha(128)
|
||||
img = Image.alpha_composite(img, overlay).convert("RGB")
|
||||
pic.paste(img, (0, int(height)))
|
||||
height += img.size[1] + GAP
|
||||
|
||||
if need_position:
|
||||
return pic, positions
|
||||
return pic
|
||||
|
||||
@staticmethod
|
||||
def extract_positions(txt: str):
|
||||
poss = []
|
||||
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
||||
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
||||
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||
return poss
|
||||
|
||||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
||||
subdir = output_dir / file_stem / method
|
||||
if backend.startswith("vlm-"):
|
||||
subdir = output_dir / file_stem / "vlm"
|
||||
json_file = subdir / f"{file_stem}_content_list.json"
|
||||
|
||||
if not json_file.exists():
|
||||
raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
|
||||
|
||||
with open(json_file, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
for item in data:
|
||||
for key in ("img_path", "table_img_path", "equation_img_path"):
|
||||
if key in item and item[key]:
|
||||
item[key] = str((subdir / item[key]).resolve())
|
||||
return data
|
||||
|
||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
|
||||
sections = []
|
||||
for output in outputs:
|
||||
match output["type"]:
|
||||
case MinerUContentType.TEXT:
|
||||
section = output["text"]
|
||||
case MinerUContentType.TABLE:
|
||||
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
|
||||
if not section.strip():
|
||||
section = "FAILED TO PARSE TABLE"
|
||||
case MinerUContentType.IMAGE:
|
||||
section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"])
|
||||
case MinerUContentType.EQUATION:
|
||||
section = output["text"]
|
||||
case MinerUContentType.CODE:
|
||||
section = output["code_body"] + "\n".join(output.get("code_caption", []))
|
||||
case MinerUContentType.LIST:
|
||||
section = "\n".join(output.get("list_items", []))
|
||||
case MinerUContentType.DISCARDED:
|
||||
pass
|
||||
|
||||
if section:
|
||||
sections.append((section, self._line_tag(output)))
|
||||
return sections
|
||||
|
||||
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
|
||||
return []
|
||||
|
||||
def parse_pdf(
|
||||
self,
|
||||
filepath: str | PathLike[str],
|
||||
binary: BytesIO | bytes,
|
||||
callback: Optional[Callable] = None,
|
||||
*,
|
||||
output_dir: Optional[str] = None,
|
||||
backend: str = "pipeline",
|
||||
lang: Optional[str] = None,
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
delete_output: bool = True,
|
||||
) -> tuple:
|
||||
import shutil
|
||||
|
||||
temp_pdf = None
|
||||
created_tmp_dir = False
|
||||
|
||||
# remove spaces, or mineru crash, and _read_output fail too
|
||||
file_path = Path(filepath)
|
||||
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
|
||||
pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)
|
||||
|
||||
if binary:
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
|
||||
temp_pdf = temp_dir / pdf_file_name
|
||||
with open(temp_pdf, "wb") as f:
|
||||
f.write(binary)
|
||||
pdf = temp_pdf
|
||||
self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
|
||||
if callback:
|
||||
callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
|
||||
else:
|
||||
if pdf_file_path_valid != filepath:
|
||||
self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
|
||||
shutil.move(filepath, pdf_file_path_valid)
|
||||
pdf = Path(pdf_file_path_valid)
|
||||
if not pdf.exists():
|
||||
if callback:
|
||||
callback(-1, f"[MinerU] PDF not found: {pdf}")
|
||||
raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")
|
||||
|
||||
if output_dir:
|
||||
out_dir = Path(output_dir)
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
else:
|
||||
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
|
||||
created_tmp_dir = True
|
||||
|
||||
self.logger.info(f"[MinerU] Output directory: {out_dir}")
|
||||
if callback:
|
||||
callback(0.15, f"[MinerU] Output directory: {out_dir}")
|
||||
|
||||
self.__images__(pdf, zoomin=1)
|
||||
|
||||
try:
|
||||
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
||||
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
if callback:
|
||||
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
|
||||
finally:
|
||||
if temp_pdf and temp_pdf.exists():
|
||||
try:
|
||||
temp_pdf.unlink()
|
||||
temp_pdf.parent.rmdir()
|
||||
except Exception:
|
||||
pass
|
||||
if delete_output and created_tmp_dir and out_dir.exists():
|
||||
try:
|
||||
shutil.rmtree(out_dir)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = MinerUParser("mineru")
|
||||
ok, reason = parser.check_installation()
|
||||
print("MinerU available:", ok)
|
||||
|
||||
filepath = ""
|
||||
with open(filepath, "rb") as file:
|
||||
outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
|
||||
for output in outputs:
|
||||
print(output)
|
||||
1387
app/core/rag/deepdoc/parser/pdf_parser.py
Normal file
1387
app/core/rag/deepdoc/parser/pdf_parser.py
Normal file
File diff suppressed because it is too large
Load Diff
83
app/core/rag/deepdoc/parser/ppt_parser.py
Normal file
83
app/core/rag/deepdoc/parser/ppt_parser.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class RAGPptParser:
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __get_bulleted_text(self, paragraph):
|
||||
is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
|
||||
if is_bulleted:
|
||||
return f"{' '* paragraph.level}.{paragraph.text}"
|
||||
else:
|
||||
return paragraph.text
|
||||
|
||||
def __extract(self, shape):
|
||||
try:
|
||||
# First try to get text content
|
||||
if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
|
||||
text_frame = shape.text_frame
|
||||
texts = []
|
||||
for paragraph in text_frame.paragraphs:
|
||||
if paragraph.text.strip():
|
||||
texts.append(self.__get_bulleted_text(paragraph))
|
||||
return "\n".join(texts)
|
||||
|
||||
# Safely get shape_type
|
||||
try:
|
||||
shape_type = shape.shape_type
|
||||
except NotImplementedError:
|
||||
# If shape_type is not available, try to get text content
|
||||
if hasattr(shape, 'text'):
|
||||
return shape.text.strip()
|
||||
return ""
|
||||
|
||||
# Handle table
|
||||
if shape_type == 19:
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
# Handle group shape
|
||||
if shape_type == 6:
|
||||
texts = []
|
||||
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
||||
t = self.__extract(p)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing shape: {str(e)}")
|
||||
return ""
|
||||
|
||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||
ppt = Presentation(fnm) if isinstance(
|
||||
fnm, str) else Presentation(
|
||||
BytesIO(fnm))
|
||||
txts = []
|
||||
self.total_page = len(ppt.slides)
|
||||
for i, slide in enumerate(ppt.slides):
|
||||
if i < from_page:
|
||||
continue
|
||||
if i >= to_page:
|
||||
break
|
||||
texts = []
|
||||
for shape in sorted(
|
||||
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
|
||||
try:
|
||||
txt = self.__extract(shape)
|
||||
if txt:
|
||||
texts.append(txt)
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
txts.append("\n".join(texts))
|
||||
|
||||
return txts
|
||||
48
app/core/rag/deepdoc/parser/txt_parser.py
Normal file
48
app/core/rag/deepdoc/parser/txt_parser.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import re
|
||||
|
||||
from .utils import get_text
|
||||
from app.core.rag.common.token_utils import num_tokens_from_string
|
||||
|
||||
|
||||
class RAGTxtParser:
|
||||
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
||||
txt = get_text(fnm, binary)
|
||||
return self.parser_txt(txt, chunk_token_num, delimiter)
|
||||
|
||||
@classmethod
|
||||
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
||||
if not isinstance(txt, str):
|
||||
raise TypeError("txt type should be str!")
|
||||
cks = [""]
|
||||
tk_nums = [0]
|
||||
delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
|
||||
|
||||
def add_chunk(t):
|
||||
nonlocal cks, tk_nums, delimiter
|
||||
tnum = num_tokens_from_string(t)
|
||||
if tk_nums[-1] > chunk_token_num:
|
||||
cks.append(t)
|
||||
tk_nums.append(tnum)
|
||||
else:
|
||||
cks[-1] += t
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
dels = []
|
||||
s = 0
|
||||
for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
|
||||
f, t = m.span()
|
||||
dels.append(m.group(1))
|
||||
dels.extend(list(delimiter[s: f]))
|
||||
s = t
|
||||
if s < len(delimiter):
|
||||
dels.extend(list(delimiter[s:]))
|
||||
dels = [re.escape(d) for d in dels if d]
|
||||
dels = [d for d in dels if d]
|
||||
dels = "|".join(dels)
|
||||
secs = re.split(r"(%s)" % dels, txt)
|
||||
for sec in secs:
|
||||
if re.match(f"^{dels}$", sec):
|
||||
continue
|
||||
add_chunk(sec)
|
||||
|
||||
return [[c, ""] for c in cks]
|
||||
16
app/core/rag/deepdoc/parser/utils.py
Normal file
16
app/core/rag/deepdoc/parser/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from app.core.rag.nlp import find_codec
|
||||
|
||||
|
||||
def get_text(fnm: str, binary=None) -> str:
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(fnm, "r") as f:
|
||||
while True:
|
||||
line = f.readline()
|
||||
if not line:
|
||||
break
|
||||
txt += line
|
||||
return txt
|
||||
Reference in New Issue
Block a user