Initial commit

2025-11-30 18:22:17 +08:00
commit aea2fe391e
449 changed files with 83030 additions and 0 deletions
--- a/app/core/rag/deepdoc/parser/init.py
+++ b/app/core/rag/deepdoc/parser/init.py
@@ -0,0 +1,24 @@
+from .docx_parser import RAGDocxParser as DocxParser
+from .excel_parser import RAGExcelParser as ExcelParser
+from .html_parser import RAGHtmlParser as HtmlParser
+from .json_parser import RAGJsonParser as JsonParser
+from .markdown_parser import MarkdownElementExtractor
+from .markdown_parser import RAGMarkdownParser as MarkdownParser
+from .pdf_parser import PlainParser
+from .pdf_parser import RAGPdfParser as PdfParser
+from .ppt_parser import RAGPptParser as PptParser
+from .txt_parser import RAGTxtParser as TxtParser
+
+# Public names re-exported by this package; each entry matches a class or
+# alias bound by the imports at the top of this module.
+__all__ = [
+    "PdfParser",
+    "PlainParser",
+    "DocxParser",
+    "ExcelParser",
+    "PptParser",
+    "HtmlParser",
+    "JsonParser",
+    "MarkdownParser",
+    "TxtParser",
+    "MarkdownElementExtractor",
+]
+
--- a/app/core/rag/deepdoc/parser/docx_parser.py
+++ b/app/core/rag/deepdoc/parser/docx_parser.py
@@ -0,0 +1,123 @@
+from docx import Document
+import re
+import pandas as pd
+from collections import Counter
+from app.core.rag.nlp import rag_tokenizer
+from io import BytesIO
+
+
+class RAGDocxParser:
+    """Extract paragraph text and flattened table content from a .docx document."""
+
+    def __extract_table_content(self, tb):
+        """Collect the text of every cell of `tb`, row by row, and flatten it.
+
+        The cell texts are wrapped in a DataFrame and handed to
+        `__compose_table_content`, whose result is returned unchanged.
+        """
+        df = []
+        for row in tb.rows:
+            df.append([c.text for c in row.cells])
+        return self.__compose_table_content(pd.DataFrame(df))
+
+    def __compose_table_content(self, df):
+        """Turn a table DataFrame into "header: value" text lines.
+
+        Row 0 is always treated as a header row. When the most common cell
+        type of the remaining rows is numeric ("Nu"), every later row whose
+        own most common type differs is treated as an additional header row.
+
+        Returns:
+            [] when the table has fewer than two rows; one string per data
+            row when the table has more than 3 columns; otherwise a
+            single-element list holding all lines joined by newlines.
+        """
+
+        def blockType(b):
+            # Classify the text of one cell. Patterns are tried in order and
+            # the first match wins, so the order of this list matters.
+            # NOTE(review): the year+letter pattern returns "DT" while every
+            # other date pattern returns "Dt"; the Counter below therefore
+            # counts them as different types - confirm this is intended.
+            pattern = [
+                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
+                (r"^(20|19)[0-9]{2}年$", "Dt"),
+                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
+                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
+                (r"^第*[一二三四1-4]季度$", "Dt"),
+                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
+                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
+                ("^[0-9.,+%/ -]+$", "Nu"),
+                (r"^[0-9A-Z/\._~-]+$", "Ca"),
+                (r"^[A-Z]*[a-z' -]+$", "En"),
+                (r"^[0-9.,+-]+[0-9A-Za-z/$￥%<>（）()' -]+$", "NE"),
+                (r"^.{1}$", "Sg")
+            ]
+            for p, n in pattern:
+                if re.search(p, b):
+                    return n
+            # No pattern matched: fall back to the number of tokens longer
+            # than one character produced by rag_tokenizer.
+            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
+            if len(tks) > 3:
+                if len(tks) < 12:
+                    return "Tx"
+                else:
+                    return "Lx"
+
+            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
+                return "Nr"
+
+            return "Ot"
+
+        if len(df) < 2:
+            return []
+        # Most common cell type over all rows except row 0.
+        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
+            1, len(df)) for j in range(len(df.iloc[i, :]))])
+        max_type = max(max_type.items(), key=lambda x: x[1])[0]
+
+        colnm = len(df.iloc[0, :])
+        hdrows = [0]  # header is not necessarily appear in the first line
+        if max_type == "Nu":
+            # In a mostly numeric table, a row whose dominant type is not
+            # numeric is recorded as an extra header row.
+            for r in range(1, len(df)):
+                tys = Counter([blockType(str(df.iloc[r, j]))
+                              for j in range(len(df.iloc[r, :]))])
+                tys = max(tys.items(), key=lambda x: x[1])[0]
+                if tys != max_type:
+                    hdrows.append(r)
+
+        lines = []
+        for i in range(1, len(df)):
+            if i in hdrows:
+                continue
+            # Offsets (negative) of the header rows located above row i.
+            hr = [r - i for r in hdrows]
+            hr = [r for r in hr if r < 0]
+            # Scanning from the nearest header upwards, cut the list at the
+            # first gap so only the header rows after that gap are kept.
+            t = len(hr) - 1
+            while t > 0:
+                if hr[t] - hr[t - 1] > 1:
+                    hr = hr[t:]
+                    break
+                t -= 1
+            # Per column: distinct header texts joined by ",", followed by ": ".
+            headers = []
+            for j in range(len(df.iloc[i, :])):
+                t = []
+                for h in hr:
+                    x = str(df.iloc[i + h, j]).strip()
+                    if x in t:
+                        continue
+                    t.append(x)
+                t = ",".join(t)
+                if t:
+                    t += ": "
+                headers.append(t)
+            # Cells whose string form is empty are left out of the line.
+            cells = []
+            for j in range(len(df.iloc[i, :])):
+                if not str(df.iloc[i, j]):
+                    continue
+                cells.append(headers[j] + str(df.iloc[i, j]))
+            lines.append(";".join(cells))
+
+        if colnm > 3:
+            return lines
+        return ["\n".join(lines)]
+
+    def __call__(self, fnm, from_page=0, to_page=100000000):
+        """Parse a document into paragraph sections and table contents.
+
+        Args:
+            fnm: a path (str) passed directly to `Document`; any other value
+                is wrapped in `BytesIO` first.
+            from_page: run text is kept only while `from_page <= page < to_page`.
+            to_page: upper page bound, see `from_page`.
+
+        Returns:
+            `(secs, tbls)`. `secs` holds one `(text, style name)` tuple per
+            visited paragraph (the text is empty when nothing was kept).
+            `tbls` holds the flattened content of every table in the
+            document; tables are not filtered by page.
+        """
+        self.doc = Document(fnm) if isinstance(
+            fnm, str) else Document(BytesIO(fnm))
+        pn = 0 # parsed page
+        secs = [] # parsed contents
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+
+            runs_within_single_paragraph = [] # save runs within the range of pages
+            for run in p.runs:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text) # append run.text first
+
+                # wrap page break checker into a static method
+                # The page counter advances whenever the run's XML contains
+                # a 'lastRenderedPageBreak' marker.
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+
+            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
+
+        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
+        return secs, tbls
--- a/app/core/rag/deepdoc/parser/excel_parser.py
+++ b/app/core/rag/deepdoc/parser/excel_parser.py
@@ -0,0 +1,210 @@
+import logging
+import re
+import sys
+from io import BytesIO
+
+import pandas as pd
+from openpyxl import Workbook, load_workbook
+
+from app.core.rag.nlp import find_codec
+
+# copied from `/openpyxl/cell/cell.py`
+ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
+
+
+class RAGExcelParser:
+    """Parse spreadsheets (Excel, with a CSV fallback) into HTML tables, markdown or text lines."""
+
+    @staticmethod
+    def _as_stream(source):
+        """Return a seekable binary stream for `source`.
+
+        `source` may be a filesystem path (str), raw bytes, or an already
+        opened binary file-like object, which is returned unchanged.
+        """
+        if isinstance(source, str):
+            # A path string has no seek()/read(); load the file content once.
+            with open(source, "rb") as f:
+                return BytesIO(f.read())
+        if isinstance(source, (bytes, bytearray)):
+            return BytesIO(source)
+        return source
+
+    @staticmethod
+    def _load_excel_to_workbook(file_like_object):
+        """Load a path, bytes or binary stream into an openpyxl Workbook.
+
+        Content that does not start with a zip or OLE2 signature is read as
+        CSV. Excel content is loaded with openpyxl first, then with pandas
+        (default engine, then calamine) when openpyxl fails.
+
+        Raises:
+            Exception: when every loader fails; the message carries the
+                underlying errors.
+        """
+        file_like_object = RAGExcelParser._as_stream(file_like_object)
+
+        # Read first 4 bytes to determine file type
+        file_like_object.seek(0)
+        file_head = file_like_object.read(4)
+        file_like_object.seek(0)
+
+        # b"PK\x03\x04" is the zip signature (xlsx), b"\xd0\xcf\x11\xe0" the OLE2 one (xls).
+        if not file_head.startswith((b"PK\x03\x04", b"\xd0\xcf\x11\xe0")):
+            logging.info("Not an Excel file, converting CSV to Excel Workbook")
+
+            try:
+                df = pd.read_csv(file_like_object)
+                return RAGExcelParser._dataframe_to_workbook(df)
+            except Exception as e_csv:
+                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}") from e_csv
+
+        try:
+            return load_workbook(file_like_object, data_only=True)
+        except Exception as e:
+            logging.info(f"openpyxl load error: {e}, try pandas instead")
+            try:
+                file_like_object.seek(0)
+                try:
+                    # sheet_name=None makes pandas return {sheet name: DataFrame}.
+                    dfs = pd.read_excel(file_like_object, sheet_name=None)
+                    return RAGExcelParser._dataframe_to_workbook(dfs)
+                except Exception as ex:
+                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
+                    file_like_object.seek(0)
+                    df = pd.read_excel(file_like_object, engine="calamine")
+                    return RAGExcelParser._dataframe_to_workbook(df)
+            except Exception as e_pandas:
+                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") from e_pandas
+
+    @staticmethod
+    def _clean_dataframe(df: pd.DataFrame):
+        """Return a copy of `df` whose string cells have illegal control characters replaced by spaces."""
+
+        def clean_string(s):
+            if isinstance(s, str):
+                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
+            return s
+
+        return df.apply(lambda col: col.map(clean_string))
+
+    @staticmethod
+    def _fill_sheet(ws, df):
+        """Write `df` into worksheet `ws`: column names in row 1, values from row 2 on."""
+        for col_num, column_name in enumerate(df.columns, 1):
+            ws.cell(row=1, column=col_num, value=column_name)
+
+        for row_num, row in enumerate(df.values, 2):
+            for col_num, value in enumerate(row, 1):
+                ws.cell(row=row_num, column=col_num, value=value)
+
+    @staticmethod
+    def _dataframe_to_workbook(df):
+        """Convert a DataFrame, or a {sheet name: DataFrame} dict, to a Workbook.
+
+        A single DataFrame (or a dict holding exactly one) becomes one sheet
+        titled "Data"; any other dict is delegated to `_dataframes_to_workbook`.
+        """
+        if isinstance(df, dict):
+            if len(df) != 1:
+                return RAGExcelParser._dataframes_to_workbook(df)
+            # A one-sheet dict used to reach _clean_dataframe() and fail there
+            # because dicts have no .apply(); unwrap it instead.
+            df = next(iter(df.values()))
+
+        df = RAGExcelParser._clean_dataframe(df)
+        wb = Workbook()
+        ws = wb.active
+        ws.title = "Data"
+        RAGExcelParser._fill_sheet(ws, df)
+        return wb
+
+    @staticmethod
+    def _dataframes_to_workbook(dfs: dict):
+        """Convert a {sheet name: DataFrame} dict to a Workbook with one sheet per entry."""
+        wb = Workbook()
+        default_sheet = wb.active
+        wb.remove(default_sheet)
+
+        for sheet_name, df in dfs.items():
+            df = RAGExcelParser._clean_dataframe(df)
+            ws = wb.create_sheet(title=sheet_name)
+            RAGExcelParser._fill_sheet(ws, df)
+        return wb
+
+    def html(self, fnm, chunk_rows=256):
+        """Render every sheet as HTML tables of at most `chunk_rows` data rows each.
+
+        Args:
+            fnm: path (str), raw bytes or binary stream of the spreadsheet.
+            chunk_rows: maximum number of data rows per emitted table; the
+                first row of the sheet is repeated as header in every table.
+
+        Returns:
+            A list of `<table>` strings. Sheets without rows, or whose rows
+            cannot be read, are skipped.
+        """
+        from html import escape
+
+        wb = RAGExcelParser._load_excel_to_workbook(fnm)
+        tb_chunks = []
+
+        def _fmt(v):
+            return "" if v is None else str(v).strip()
+
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            try:
+                rows = list(ws.rows)
+            except Exception as e:
+                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                continue
+
+            if not rows:
+                continue
+
+            header = "<tr>" + "".join(f"<th>{escape(_fmt(t.value))}</th>" for t in rows[0]) + "</tr>"
+            # The sheet name is data too, so escape it exactly like the cells.
+            opening = f"<table><caption>{escape(str(sheetname))}</caption>"
+
+            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
+                start = 1 + chunk_i * chunk_rows
+                parts = [opening, header]
+                for r in rows[start : start + chunk_rows]:
+                    parts.append("<tr>" + "".join(f"<td>{escape(_fmt(c.value))}</td>" for c in r) + "</tr>")
+                parts.append("</table>\n")
+                tb_chunks.append("".join(parts))
+
+        return tb_chunks
+
+    def markdown(self, fnm):
+        """Return the spreadsheet as a markdown table.
+
+        `fnm` is read with `pd.read_excel` (its default sheet selection) and,
+        when that fails, as CSV. Cells holding only whitespace are emptied.
+        """
+        file_like_object = RAGExcelParser._as_stream(fnm)
+        try:
+            file_like_object.seek(0)
+            df = pd.read_excel(file_like_object)
+        except Exception as e:
+            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
+            file_like_object.seek(0)
+            df = pd.read_csv(file_like_object)
+        df = df.replace(r"^\s*$", "", regex=True)
+        return df.to_markdown(index=False)
+
+    def __call__(self, fnm):
+        """Flatten every data row into one "header：value; header：value" line.
+
+        Args:
+            fnm: path (str), raw bytes or binary stream of the spreadsheet.
+
+        Returns:
+            One string per data row (rows after the first one of each sheet).
+            The sheet name is appended after " ——" unless it contains "sheet"
+            (case-insensitive).
+        """
+        wb = RAGExcelParser._load_excel_to_workbook(fnm)
+
+        res = []
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            try:
+                rows = list(ws.rows)
+            except Exception as e:
+                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                continue
+            if not rows:
+                continue
+            ti = rows[0]
+            for r in rows[1:]:
+                fields = []
+                for i, c in enumerate(r):
+                    # Skip only truly empty cells; a plain truthiness test
+                    # would also drop legitimate values such as 0 or False.
+                    if c.value is None or c.value == "":
+                        continue
+                    # A missing header yields no prefix, not the text "None".
+                    header = ti[i].value if i < len(ti) else None
+                    t = "" if header is None else str(header)
+                    t += ("：" if t else "") + str(c.value)
+                    fields.append(t)
+                line = "; ".join(fields)
+                if "sheet" not in sheetname.lower():
+                    line += " ——" + sheetname
+                res.append(line)
+        return res
+
+    @staticmethod
+    def row_number(fnm, binary):
+        """Count the rows of a spreadsheet or delimited text file.
+
+        Args:
+            fnm: file name; only its extension is inspected.
+            binary: raw file content.
+
+        Returns:
+            The number of rows summed over all readable sheets when the
+            extension contains "xls"; the number of newline-separated pieces
+            for "csv" and "txt"; None for any other extension.
+        """
+        ext = fnm.split(".")[-1].lower()
+        if "xls" in ext:
+            wb = RAGExcelParser._load_excel_to_workbook(BytesIO(binary))
+            total = 0
+
+            for sheetname in wb.sheetnames:
+                try:
+                    ws = wb[sheetname]
+                    total += len(list(ws.rows))
+                except Exception as e:
+                    logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                    continue
+            return total
+
+        if ext in ("csv", "txt"):
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+            return len(txt.split("\n"))
+
+        return None
+
+
+if __name__ == "__main__":
+    # Hand the parser the file content rather than the path string: its
+    # loader seeks and reads the input, which a bare path does not support.
+    with open(sys.argv[1], "rb") as f:
+        content = f.read()
+    psr = RAGExcelParser()
+    psr(content)
--- a/app/core/rag/deepdoc/parser/figure_parser.py
+++ b/app/core/rag/deepdoc/parser/figure_parser.py
@@ -0,0 +1,118 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from PIL import Image
+
+from app.core.rag.common.constants import LLMType
+from app.core.rag.common.connection_utils import timeout
+from app.core.rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
+from app.core.rag.prompts.generator import vision_llm_figure_describe_prompt
+
+
+def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
+    """Reshape `(description, figure)` pairs into the positioned form used by VisionFigureParser.
+
+    Entries whose second element is not a PIL image are dropped. Each kept
+    entry becomes `((figure, [description]), [(0, 0, 0, 0, 0)])`, i.e. the
+    figure gets a single all-zero placeholder position.
+    """
+    wrapped = []
+    for entry in figures_data_without_positions:
+        if not isinstance(entry[1], Image.Image):
+            continue
+        figure_with_desc = (entry[1], [entry[0]])
+        placeholder_position = [(0, 0, 0, 0, 0)]
+        wrapped.append((figure_with_desc, placeholder_position))
+    return wrapped
+
+def vision_figure_parser_docx_wrapper(sections, tbls, callback=None, vision_model=None, **kwargs):
+    """Append vision-model figure descriptions for `sections` to `tbls`.
+
+    Args:
+        sections: `(description, figure)` pairs; only entries holding a PIL
+            image are described.
+        tbls: list extended in place with the described figures.
+        callback: optional `callback(progress, message)` progress reporter.
+        vision_model: model used to describe figures; when falsy nothing is done.
+        **kwargs: forwarded to `VisionFigureParser`.
+
+    Returns:
+        `tbls`. A failure of the vision model is reported through `callback`
+        and otherwise ignored, leaving `tbls` without the extra figures.
+    """
+    if not vision_model:
+        return tbls
+
+    # callback is optional; use a no-op so the error path below cannot fail
+    # with "'NoneType' object is not callable".
+    report = callback or (lambda prog, msg: None)
+    figures_data = vision_figure_parser_figure_data_wrapper(sections)
+    try:
+        docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
+        boosted_figures = docx_vision_parser(callback=report)
+        tbls.extend(boosted_figures)
+    except Exception as e:
+        report(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
+    return tbls
+
+def vision_figure_parser_pdf_wrapper(tbls, callback=None, vision_model=None, **kwargs):
+    """Replace the figure items of `tbls` with vision-model enhanced versions.
+
+    Args:
+        tbls: parsed items; an item counts as a figure when `item[0][0]` is a
+            PIL image and `item[0][1]` is a list.
+        callback: optional `callback(progress, message)` progress reporter.
+        vision_model: model used to describe figures; when falsy nothing is done.
+        **kwargs: forwarded to `VisionFigureParser`.
+
+    Returns:
+        A new list with the non-figure items followed by the enhanced
+        figures, or the original `tbls` when no model is given or the vision
+        model fails (the failure is reported through `callback`).
+    """
+    if not vision_model:
+        return tbls
+
+    def is_figure_item(item):
+        return (
+            isinstance(item[0][0], Image.Image) and
+            isinstance(item[0][1], list)
+        )
+
+    # callback is optional; use a no-op so the error path below cannot fail
+    # with "'NoneType' object is not callable".
+    report = callback or (lambda prog, msg: None)
+    figures_data = [item for item in tbls if is_figure_item(item)]
+    try:
+        docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
+        boosted_figures = docx_vision_parser(callback=report)
+        tbls = [item for item in tbls if not is_figure_item(item)]
+        tbls.extend(boosted_figures)
+    except Exception as e:
+        report(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
+    return tbls
+
+# Module-wide thread pool (at most 10 workers); VisionFigureParser submits its
+# per-figure description jobs to it.
+shared_executor = ThreadPoolExecutor(max_workers=10)
+
+
+class VisionFigureParser:
+    """Describe figures with a vision model and merge the text into their descriptions."""
+
+    def __init__(self, vision_model, figures_data, *args, **kwargs):
+        """Store the model and unpack `figures_data` into figures, descriptions and positions.
+
+        Raises:
+            AssertionError: when an item has an unexpected shape, or when
+                only some of the items carry positions.
+        """
+        self.vision_model = vision_model
+        self._extract_figures_info(figures_data)
+        assert len(self.figures) == len(self.descriptions)
+        assert not self.positions or (len(self.figures) == len(self.positions))
+
+    def _extract_figures_info(self, figures_data):
+        """Split `figures_data` into `self.figures`, `self.descriptions` and `self.positions`.
+
+        Two item shapes are accepted: `((figure, [description]), [position, ...])`
+        where each position is a 5-tuple, and `(figure, [description])`.
+        """
+        self.figures = []
+        self.descriptions = []
+        self.positions = []
+
+        for item in figures_data:
+            # An empty position list is checked before item[1][0] is read, so
+            # a malformed item reaches the descriptive assertion below
+            # instead of raising a bare IndexError.
+            has_position = (
+                len(item) == 2
+                and isinstance(item[0], tuple)
+                and len(item[0]) == 2
+                and isinstance(item[1], list)
+                and bool(item[1])
+                and isinstance(item[1][0], tuple)
+                and len(item[1][0]) == 5
+            )
+            if has_position:
+                img_desc = item[0]
+                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
+                self.figures.append(img_desc[0])
+                self.descriptions.append(img_desc[1])
+                self.positions.append(item[1])
+            else:
+                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
+                self.figures.append(item[0])
+                self.descriptions.append(item[1])
+
+    def _assemble(self):
+        """Rebuild the item list from the current figures, descriptions and positions.
+
+        Returns:
+            `self.assembled`: `((figure, description), positions)` tuples when
+            positions are known, `((figure, description),)` tuples otherwise.
+        """
+        self.has_positions = len(self.positions) != 0
+        self.assembled = []
+        for i, (figure, desc) in enumerate(zip(self.figures, self.descriptions)):
+            figure_desc = (figure, desc)
+            if self.has_positions:
+                self.assembled.append((figure_desc, self.positions[i]))
+            else:
+                self.assembled.append((figure_desc,))
+
+        return self.assembled
+
+    def __call__(self, **kwargs):
+        """Describe every figure on the shared thread pool and return the assembled items.
+
+        Keyword Args:
+            callback: optional `callback(progress, message)` forwarded to the
+                vision call; None is treated like a missing callback.
+
+        Returns:
+            The list produced by `_assemble()`. For every figure that got a
+            non-empty model text, the description is replaced by that text
+            followed by the previous description entries joined with newlines.
+
+        Raises:
+            Whatever a figure job raised; it surfaces through `future.result()`.
+        """
+        # The wrappers pass callback=None explicitly, which dict.get() would
+        # return as-is; fall back to a no-op in that case as well.
+        callback = kwargs.get("callback") or (lambda prog, msg: None)
+
+        # NOTE(review): presumably a 30 s limit with 3 attempts - confirm
+        # against connection_utils.timeout.
+        @timeout(30, 3)
+        def process(figure_idx, figure_binary):
+            description_text = picture_vision_llm_chunk(
+                binary=figure_binary,
+                vision_model=self.vision_model,
+                prompt=vision_llm_figure_describe_prompt(),
+                callback=callback,
+            )
+            return figure_idx, description_text
+
+        futures = [
+            shared_executor.submit(process, idx, img_binary)
+            for idx, img_binary in enumerate(self.figures or [])
+        ]
+
+        for future in as_completed(futures):
+            figure_num, txt = future.result()
+            if txt:
+                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])
+
+        self._assemble()
+
+        return self.assembled
--- a/app/core/rag/deepdoc/parser/html_parser.py
+++ b/app/core/rag/deepdoc/parser/html_parser.py
@@ -0,0 +1,197 @@
+from app.core.rag.nlp import find_codec, rag_tokenizer
+import uuid
+import chardet
+from bs4 import BeautifulSoup, NavigableString, Tag, Comment
+import html
+
+def get_encoding(file):
+    """Return the encoding chardet detects for the whole content of `file`."""
+    with open(file, 'rb') as handle:
+        raw = handle.read()
+        detection = chardet.detect(raw)
+        return detection['encoding']
+
+# Tags that open a new text block; text found directly inside one of them
+# shares a block id.
+BLOCK_TAGS = [
+    "h1", "h2", "h3", "h4", "h5", "h6",
+    "p", "div", "article", "section", "aside",
+    "ul", "ol", "li",
+    "table", "pre", "code", "blockquote",
+    "figure", "figcaption"
+]
+# Markdown heading prefix for each HTML heading level: hN maps to N "#"
+# characters ("h4" previously carried five, which made it render like "h5").
+TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
+
+
+class RAGHtmlParser:
+    """Convert an HTML document into text chunks followed by its raw tables."""
+
+    def __call__(self, fnm, binary=None, chunk_token_num=512):
+        """Parse HTML from `binary` when given, otherwise from the file `fnm`.
+
+        Args:
+            fnm: path of the HTML file; only read when `binary` is falsy.
+            binary: raw document bytes, decoded with the codec `find_codec` picks.
+            chunk_token_num: token budget of one text chunk.
+
+        Returns:
+            The list of sections produced by `parser_txt`.
+        """
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
+                txt = f.read()
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num):
+        """Split HTML text into sections: text chunks first, then one entry per table.
+
+        Raises:
+            TypeError: when `txt` is not a str.
+        """
+        if not isinstance(txt, str):
+            raise TypeError("txt type should be string!")
+
+        temp_sections = []
+        soup = BeautifulSoup(txt, "html5lib")
+        # delete every <style> and <script> tag, wherever it is nested
+        for style_tag in soup.find_all(["style", "script"]):
+            style_tag.decompose()
+        # delete inline style
+        for tag in soup.find_all(True):
+            if 'style' in tag.attrs:
+                del tag.attrs['style']
+        # delete HTML comment
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
+        block_txt_list, table_list = cls.merge_block_text(temp_sections)
+        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
+        for table in table_list:
+            sections.append(table.get("content", ""))
+        return sections
+
+    @classmethod
+    def split_table(cls, html_table, chunk_token_num=512):
+        """Split an HTML table into smaller tables of roughly `chunk_token_num` tokens.
+
+        Rows are never split; a single row larger than the budget gets a
+        table of its own.
+
+        Returns:
+            A list of `<table>` strings, empty when the input has no `<tr>`.
+        """
+        soup = BeautifulSoup(html_table, "html.parser")
+        rows = soup.find_all("tr")
+        tables = []
+        current_table = []
+        current_count = 0
+        for row in rows:
+            tks_str = rag_tokenizer.tokenize(str(row))
+            token_count = len(tks_str.split(" ")) if tks_str else 0
+            # Only close a group that has rows; otherwise an oversized first
+            # row would produce an empty <table></table>.
+            if current_table and current_count + token_count > chunk_token_num:
+                tables.append(current_table)
+                current_table = []
+                current_count = 0
+            current_table.append(row)
+            current_count += token_count
+        if current_table:
+            tables.append(current_table)
+
+        table_str_list = []
+        for table_rows in tables:
+            new_table = soup.new_tag("table")
+            for row in table_rows:
+                new_table.append(row)
+            table_str_list.append(str(new_table))
+
+        return table_str_list
+
+    @classmethod
+    def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
+        """Walk `element` and collect text and table records.
+
+        Records are dicts with "content", "tag_name" and "metadata" keys. For
+        a string node or a <table> the records are returned to the caller;
+        for any other tag the children's records are appended to
+        `parser_result` and an empty list is returned.
+        """
+        if isinstance(element, NavigableString):
+            content = element.strip()
+            return_info = []
+            if not content:
+                return return_info
+
+            # Text that itself parses to markup is walked like a sub-document.
+            try:
+                nested = BeautifulSoup(content, "html.parser")
+                has_markup = bool(nested.find())
+            except Exception:
+                has_markup = False
+
+            if has_markup:
+                child_info = cls.read_text_recursively(nested, parser_result, chunk_token_num, element.name, block_id)
+                parser_result.extend(child_info)
+            else:
+                info = {"content": content, "tag_name": "inner_text", "metadata": {"block_id": block_id}}
+                if parent_name:
+                    info["tag_name"] = parent_name
+                return_info.append(info)
+            return return_info
+        elif isinstance(element, Tag):
+
+            if str.lower(element.name) == "table":
+                # A table is kept whole, as unescaped HTML, under a fresh id.
+                table_id = str(uuid.uuid1())
+                return [{"content": html.unescape(str(element)), "tag_name": "table",
+                         "metadata": {"table_id": table_id, "index": 0}}]
+            else:
+                # Block-level tags start a new block id; other tags pass None
+                # down, so their text is not tied to a block.
+                block_id = None
+                if str.lower(element.name) in BLOCK_TAGS:
+                    block_id = str(uuid.uuid1())
+                for child in element.children:
+                    child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
+                                                           block_id)
+                    parser_result.extend(child_info)
+        return []
+
+    @classmethod
+    def merge_block_text(cls, parser_result):
+        """Merge consecutive text records into block strings and separate the tables.
+
+        Records sharing a block id are joined with spaces, heading records
+        get their markdown prefix, and records without a block id are
+        appended to the block being built.
+
+        Returns:
+            `(block_content, table_info_list)`: the merged block strings and
+            the untouched table records.
+        """
+        block_content = []
+        current_content = ""
+        table_info_list = []
+        last_block_id = None
+        for item in parser_result:
+            content = item.get("content")
+            tag_name = item.get("tag_name")
+            block_id = item.get("metadata", {}).get("block_id")
+            if block_id:
+                if tag_name in TITLE_TAGS:
+                    content = f"{TITLE_TAGS[tag_name]} {content}"
+                if last_block_id != block_id:
+                    # Flush whatever was collected so far, including text
+                    # seen before the first block, which used to be lost.
+                    if current_content:
+                        block_content.append(current_content)
+                    current_content = content
+                    last_block_id = block_id
+                else:
+                    current_content += (" " if current_content else "") + content
+            elif tag_name == "table":
+                table_info_list.append(item)
+            else:
+                # The parentheses must close before "+ content"; otherwise the
+                # text is dropped whenever current_content is non-empty.
+                current_content += (" " if current_content else "") + content
+        if current_content:
+            block_content.append(current_content)
+        return block_content, table_info_list
+
+    @classmethod
+    def chunk_block(cls, block_txt_list, chunk_token_num=512):
+        """Pack block strings into chunks of at most `chunk_token_num` tokens.
+
+        Blocks that fit are joined with newlines. A block larger than the
+        budget is emitted as slices of its tokenized form joined by spaces.
+        """
+        chunks = []
+        current_block = ""
+        current_token_count = 0
+
+        for block in block_txt_list:
+            tks_str = rag_tokenizer.tokenize(block)
+            tokens = tks_str.split(" ") if tks_str else []
+            block_token_count = len(tokens)
+            if block_token_count > chunk_token_num:
+                if current_block:
+                    chunks.append(current_block)
+                for start in range(0, block_token_count, chunk_token_num):
+                    chunks.append(" ".join(tokens[start:start + chunk_token_num]))
+                current_block = ""
+                current_token_count = 0
+            elif current_token_count + block_token_count <= chunk_token_num:
+                current_block += ("\n" if current_block else "") + block
+                current_token_count += block_token_count
+            else:
+                chunks.append(current_block)
+                current_block = block
+                current_token_count = block_token_count
+
+        if current_block:
+            chunks.append(current_block)
+
+        return chunks
+
--- a/app/core/rag/deepdoc/parser/json_parser.py
+++ b/app/core/rag/deepdoc/parser/json_parser.py
@@ -0,0 +1,159 @@
+import json
+from typing import Any
+
+from app.core.rag.nlp import find_codec
+
+
+class RAGJsonParser:
+    """Split JSON or JSON Lines documents into size-bounded JSON string chunks."""
+
+    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
+        """Configure the chunk size limits.
+
+        Args:
+            max_chunk_size: the limit actually applied to a serialized chunk
+                is twice this value.
+            min_chunk_size: serialized size from which a chunk may be closed;
+                defaults to `max(max_chunk_size - 200, 50)`.
+        """
+        super().__init__()
+        self.max_chunk_size = max_chunk_size * 2
+        self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)
+
+    def __call__(self, filename):
+        """Read `filename` and return its content as a list of JSON string chunks.
+
+        The file is decoded with the codec `find_codec` detects, as the other
+        parsers of this package do, instead of the platform default encoding.
+        """
+        with open(filename, "rb") as f:
+            binary = f.read()
+        txt = binary.decode(find_codec(binary), errors="ignore")
+        # json.loads rejects a leading byte order mark, so drop it.
+        txt = txt.lstrip("\ufeff")
+
+        if self.is_jsonl_format(txt):
+            sections = self._parse_jsonl(txt)
+        else:
+            sections = self._parse_json(txt)
+        return sections
+
+    @staticmethod
+    def _json_size(data: Any) -> int:
+        """Calculate the size of the serialized JSON object."""
+        return len(json.dumps(data, ensure_ascii=False))
+
+    @staticmethod
+    def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
+        """Set a value in a nested dictionary based on the given path."""
+        for key in path[:-1]:
+            d = d.setdefault(key, {})
+        d[path[-1]] = value
+
+    def _list_to_dict_preprocessing(self, data: Any) -> Any:
+        """Recursively replace every list by a dict keyed by the item index as a string."""
+        if isinstance(data, dict):
+            # Process each key-value pair in the dictionary
+            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
+        elif isinstance(data, list):
+            # Convert the list to a dictionary with index-based keys
+            return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
+        else:
+            # Base case: the item is neither a dict nor a list, so return it unchanged
+            return data
+
+    def _json_split(
+        self,
+        data,
+        current_path: list[str] | None,
+        chunks: list[dict] | None,
+    ) -> list[dict]:
+        """
+        Split json into maximum size dictionaries while preserving structure.
+
+        `current_path` is the key path of `data` inside the document and
+        `chunks` the list being filled; both start empty when None is passed.
+        Returns `chunks`.
+        """
+        current_path = current_path or []
+        chunks = chunks or [{}]
+        if isinstance(data, dict):
+            for key, value in data.items():
+                new_path = current_path + [key]
+                chunk_size = self._json_size(chunks[-1])
+                size = self._json_size({key: value})
+                remaining = self.max_chunk_size - chunk_size
+
+                if size < remaining:
+                    # Add item to current chunk
+                    self._set_nested_dict(chunks[-1], new_path, value)
+                else:
+                    if chunk_size >= self.min_chunk_size:
+                        # Chunk is big enough, start a new chunk
+                        chunks.append({})
+
+                    # Iterate
+                    self._json_split(value, new_path, chunks)
+        elif current_path:
+            # handle single item
+            self._set_nested_dict(chunks[-1], current_path, data)
+        else:
+            # The whole document is a single non-dict value: there is no key
+            # to nest it under, so it becomes the chunk itself.
+            chunks[-1] = data
+        return chunks
+
+    def split_json(
+        self,
+        json_data,
+        convert_lists: bool = False,
+    ) -> list[dict]:
+        """Splits JSON into a list of JSON chunks"""
+
+        if convert_lists:
+            preprocessed_data = self._list_to_dict_preprocessing(json_data)
+            chunks = self._json_split(preprocessed_data, None, None)
+        else:
+            chunks = self._json_split(json_data, None, None)
+
+        # Remove the last chunk if it's empty
+        if not chunks[-1]:
+            chunks.pop()
+        return chunks
+
+    def split_text(
+        self,
+        json_data: dict[str, Any],
+        convert_lists: bool = False,
+        ensure_ascii: bool = True,
+    ) -> list[str]:
+        """Splits JSON into a list of JSON formatted strings"""
+
+        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
+
+        # Convert to string
+        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
+
+    def _parse_json(self, content: str) -> list[str]:
+        """Chunk a single JSON document; content that is not valid JSON yields []."""
+        try:
+            json_data = json.loads(content)
+        except json.JSONDecodeError:
+            # Best effort by design: an unparsable document gives no sections.
+            return []
+        chunks = self.split_json(json_data, True)
+        return [json.dumps(line, ensure_ascii=False) for line in chunks if line]
+
+    def _parse_jsonl(self, content: str) -> list[str]:
+        """Chunk every line of a JSON Lines document; blank and invalid lines are skipped."""
+        all_chunks = []
+        for line in content.strip().splitlines():
+            if not line.strip():
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            chunks = self.split_json(data, convert_lists=True)
+            all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
+        return all_chunks
+
+    def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
+        """Tell whether `txt` looks like JSON Lines rather than one JSON document.
+
+        Text that parses as a whole is never JSON Lines. Otherwise the first
+        `sample_limit` non-blank lines are parsed one by one and the share of
+        valid lines must reach `threshold`.
+        """
+        lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
+        if not lines:
+            return False
+
+        if self._is_valid_json(txt):
+            return False
+
+        sample_lines = lines[:min(len(lines), sample_limit)]
+        valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
+
+        if not valid_lines:
+            return False
+
+        return (valid_lines / len(sample_lines)) >= threshold
+
+    def _is_valid_json(self, line: str) -> bool:
+        """Return True when `line` parses as JSON."""
+        try:
+            json.loads(line)
+            return True
+        except json.JSONDecodeError:
+            return False
--- a/app/core/rag/deepdoc/parser/markdown_parser.py
+++ b/app/core/rag/deepdoc/parser/markdown_parser.py
@@ -0,0 +1,277 @@
+import re
+
+from markdown import markdown
+
+
+class RAGMarkdownParser:
+    """Markdown helper that separates tables from the rest of a document."""
+
+    def __init__(self, chunk_token_num=128):
+        """Remember the chunk size.
+
+        Args:
+            chunk_token_num: Coerced with ``int()`` and stored on the instance.
+        """
+        self.chunk_token_num = int(chunk_token_num)
+
+    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
+        """Collect the tables found in ``markdown_text`` and return the remaining text.
+
+        Three kinds of tables are looked for, in this order: pipe-bordered
+        Markdown tables, borderless Markdown tables, and HTML ``<table>``
+        elements (optionally wrapped in ``<html>``/``<body>``). Before the HTML
+        pass, attributes are stripped from opening ``table``, ``td``, ``tr``,
+        ``th``, ``tbody``, ``thead`` and ``div`` tags.
+
+        Args:
+            markdown_text: Source Markdown.
+            separate_tables: When True each table is removed from the text and
+                replaced by a blank line. When False Markdown tables are
+                replaced by their rendered HTML and HTML tables are kept raw,
+                each followed by a blank line.
+
+        Returns:
+            ``(remaining_text, tables)`` where ``tables`` holds the raw matched
+            text of every table, in discovery order.
+        """
+        tables = []
+        working_text = markdown_text
+
+        def splice_tables(pattern, table_list, render=True):
+            # Rebuild the text from the gaps between matches; the pieces are
+            # joined once instead of growing a string with += in the loop.
+            pieces = []
+            last_end = 0
+            for match in pattern.finditer(working_text):
+                raw_table = match.group()
+                table_list.append(raw_table)
+                pieces.append(working_text[last_end : match.start()])
+                if not separate_tables:
+                    pieces.append(markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table)
+                pieces.append("\n\n")
+                last_end = match.end()
+            pieces.append(working_text[last_end:])
+            return "".join(pieces)
+
+        if "|" in markdown_text:  # cheap pre-check before running the table regexes
+            # Standard Markdown table
+            border_table_pattern = re.compile(
+                r"""
+                (?:\n|^)
+                (?:\|.*?\|.*?\|.*?\n)
+                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
+                (?:\|.*?\|.*?\|.*?\n)+
+            """,
+                re.VERBOSE,
+            )
+            working_text = splice_tables(border_table_pattern, tables)
+
+            # Borderless Markdown table
+            no_border_table_pattern = re.compile(
+                r"""
+                (?:\n|^)
+                (?:\S.*?\|.*?\n)
+                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
+                (?:\S.*?\|.*?\n)+
+                """,
+                re.VERBOSE,
+            )
+            working_text = splice_tables(no_border_table_pattern, tables)
+
+        # Replace any TAGS e.g. <table ...> to <table>.
+        # The \b keeps longer tag names that merely start with one of TAGS
+        # (e.g. <track ...>, <divider ...>) from losing their attributes.
+        TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
+        table_with_attributes_pattern = re.compile(
+            rf"<({'|'.join(TAGS)})\b[^>]*>", re.IGNORECASE
+        )
+        working_text = table_with_attributes_pattern.sub(lambda m: "<{}>".format(m.group(1)), working_text)
+
+        if "<table>" in working_text.lower():  # cheap pre-check before the HTML regex
+            # HTML table extraction - handle possible html/body wrapper tags
+            html_table_pattern = re.compile(
+                r"""
+            (?:\n|^)
+            \s*
+            (?:
+                # case1: <html><body><table>...</table></body></html>
+                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
+                |
+                # case2: <body><table>...</table></body>
+                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
+                |
+                # case3: only<table>...</table>
+                (?:<table[^>]*>.*?</table>)
+            )
+            \s*
+            (?=\n|$)
+            """,
+                re.VERBOSE | re.DOTALL | re.IGNORECASE,
+            )
+            # HTML tables are already HTML, so they are never re-rendered.
+            working_text = splice_tables(html_table_pattern, tables, render=False)
+
+        return working_text, tables
+
+
+class MarkdownElementExtractor:
+    """Break Markdown text into block-level sections.
+
+    Recognised blocks are ATX headers, fenced code blocks, bullet / numbered
+    lists, blockquotes and plain text paragraphs. Alternatively the text can be
+    split on caller-supplied delimiters.
+    """
+
+    # Line classifiers shared by the dispatcher and the block extractors.
+    _HEADER_RE = re.compile(r"^#{1,6}\s+.*$")
+    _BULLET_RE = re.compile(r"^\s*[-*+]\s+.*$")
+    _NUMBERED_RE = re.compile(r"^\s*\d+\.\s+.*$")
+    # Indented line starting with a word character: continuation of a list item.
+    _INDENTED_TEXT_RE = re.compile(r"^\s+\w+.*$")
+
+    def __init__(self, markdown_content):
+        """Keep the raw text and its newline-split lines."""
+        self.markdown_content = markdown_content
+        self.lines = markdown_content.split("\n")
+
+    @classmethod
+    def _is_list_item(cls, line):
+        """True for a bullet (``-``, ``*``, ``+``) or numbered (``1.``) list line."""
+        return bool(cls._BULLET_RE.match(line) or cls._NUMBERED_RE.match(line))
+
+    @staticmethod
+    def _is_fence(line):
+        """True when the stripped line starts with a code fence."""
+        return line.strip().startswith("```")
+
+    @staticmethod
+    def _is_quote(line):
+        """True when the stripped line starts with ``>``."""
+        return line.strip().startswith(">")
+
+    @classmethod
+    def _starts_block(cls, line):
+        """True when the line opens a header, code fence, list or blockquote."""
+        return bool(cls._HEADER_RE.match(line)) or cls._is_fence(line) or cls._is_list_item(line) or cls._is_quote(line)
+
+    def _pick_extractor(self, line):
+        """Return the extractor method for ``line``, or None for a blank line."""
+        if self._HEADER_RE.match(line):
+            return self._extract_header
+        if self._is_fence(line):
+            return self._extract_code_block
+        if self._is_list_item(line):
+            return self._extract_list_block
+        if self._is_quote(line):
+            return self._extract_blockquote
+        if line.strip():
+            # paragraphs and inline elements until the next block element
+            return self._extract_text_block
+        return None
+
+    def get_delimiters(self, delimiters):
+        """Build a regex alternation from the back-quoted tokens in ``delimiters``.
+
+        Tokens are de-duplicated, ordered longest first and escaped, then
+        joined with ``|``. An empty string is returned when no back-quoted
+        token is present.
+        """
+        unique_tokens = set(re.findall(r"`([^`]+)`", delimiters))
+        longest_first = sorted(unique_tokens, key=len, reverse=True)
+        return "|".join(re.escape(token) for token in longest_first if token)
+
+    def extract_elements(self, delimiter=None):
+        """Extract individual elements (headers, code blocks, lists, etc.).
+
+        When ``delimiter`` yields a non-empty pattern via ``get_delimiters``
+        the text is split on it and the stripped, non-empty parts are returned.
+        Otherwise the lines are walked block by block.
+
+        Returns:
+            The content string of every section that is not whitespace-only.
+        """
+        split_pattern = self.get_delimiters(delimiter) if delimiter else ""
+        if split_pattern:
+            fragments = re.split(split_pattern, "\n".join(self.lines))
+            return [fragment.strip() for fragment in fragments if fragment and fragment.strip()]
+
+        collected = []
+        cursor = 0
+        while cursor < len(self.lines):
+            extractor = self._pick_extractor(self.lines[cursor])
+            if extractor is None:
+                cursor += 1
+                continue
+            block = extractor(cursor)
+            collected.append(block["content"])
+            cursor = block["end_line"] + 1
+
+        return [content for content in collected if content.strip()]
+
+    def _extract_header(self, start_pos):
+        """Describe the single header line at ``start_pos``."""
+        return dict(
+            type="header",
+            content=self.lines[start_pos],
+            start_line=start_pos,
+            end_line=start_pos,
+        )
+
+    def _extract_code_block(self, start_pos):
+        """Collect a fenced code block from its opening fence through the next fence line (or the last line)."""
+        gathered = [self.lines[start_pos]]
+        last = start_pos
+
+        for idx, body_line in enumerate(self.lines[start_pos + 1 :], start=start_pos + 1):
+            gathered.append(body_line)
+            last = idx
+            if self._is_fence(body_line):
+                # closing fence reached
+                break
+
+        return dict(type="code_block", content="\n".join(gathered), start_line=start_pos, end_line=last)
+
+    def _extract_list_block(self, start_pos):
+        """Collect consecutive list lines starting at ``start_pos``.
+
+        After the first line, blank lines and indented text lines are treated
+        as part of the list as well.
+        """
+        gathered = []
+        last = start_pos
+
+        for idx in range(start_pos, len(self.lines)):
+            candidate = self.lines[idx]
+            # Indented nested items need no separate test: the list-item
+            # patterns already allow any amount of leading whitespace.
+            belongs = self._is_list_item(candidate) or (
+                idx > start_pos and (not candidate.strip() or self._INDENTED_TEXT_RE.match(candidate))
+            )
+            if not belongs:
+                break
+            gathered.append(candidate)
+            last = idx
+
+        return dict(type="list_block", content="\n".join(gathered), start_line=start_pos, end_line=last)
+
+    def _extract_blockquote(self, start_pos):
+        """Collect consecutive ``>`` lines; blank lines after the first line are kept too."""
+        gathered = []
+        last = start_pos
+
+        for idx in range(start_pos, len(self.lines)):
+            candidate = self.lines[idx]
+            if not (self._is_quote(candidate) or (idx > start_pos and not candidate.strip())):
+                break
+            gathered.append(candidate)
+            last = idx
+
+        return dict(type="blockquote", content="\n".join(gathered), start_line=start_pos, end_line=last)
+
+    def _extract_text_block(self, start_pos):
+        """Extract a text block (paragraphs, inline elements) until next block element"""
+        gathered = [self.lines[start_pos]]
+        last = start_pos
+        total = len(self.lines)
+
+        for idx in range(start_pos + 1, total):
+            candidate = self.lines[idx]
+            if self._starts_block(candidate):
+                break
+            # A blank line directly in front of a block element ends the text block.
+            if not candidate.strip() and idx + 1 < total and self._starts_block(self.lines[idx + 1]):
+                break
+            gathered.append(candidate)
+            last = idx
+
+        return dict(type="text_block", content="\n".join(gathered), start_line=start_pos, end_line=last)
--- a/app/core/rag/deepdoc/parser/mineru_parser.py
+++ b/app/core/rag/deepdoc/parser/mineru_parser.py
@@ -0,0 +1,524 @@
+import json
+import logging
+import os
+import platform
+import re
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+import zipfile
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from queue import Empty, Queue
+from typing import Any, Callable, Optional
+
+import numpy as np
+import pdfplumber
+import requests
+from PIL import Image
+from strenum import StrEnum
+
+from .pdf_parser import RAGPdfParser
+
+# Key under which a threading.Lock is stored in ``sys.modules``; the lock is
+# created only when the key is not registered yet.
+# NOTE(review): presumably meant to serialize pdfplumber access across modules;
+# nothing in this module acquires it - confirm against other users of the key.
+LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
+if LOCK_KEY_pdfplumber not in sys.modules:
+    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
+
+
+class MinerUContentType(StrEnum):
+    """Block types matched against ``output["type"]`` in ``MinerUParser._transfer_to_sections``."""
+
+    IMAGE = "image"  # section text comes from image_caption / image_footnote
+    TABLE = "table"  # section text comes from table_body / table_caption / table_footnote
+    TEXT = "text"  # section text comes from the "text" field
+    EQUATION = "equation"  # section text comes from the "text" field
+    CODE = "code"  # section text comes from code_body / code_caption
+    LIST = "list"  # section text comes from list_items
+    DISCARDED = "discarded"  # no section text is produced for this type
+
+
+class MinerUParser(RAGPdfParser):
+    def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
+        """Record how MinerU can be reached.
+
+        Args:
+            mineru_path: MinerU executable name or path, stored as a ``Path``.
+            mineru_api: Base URL of the MinerU HTTP API; trailing slashes are removed.
+            mineru_server_url: Server URL offered to the ``vlm-http-client``
+                backend; trailing slashes are removed.
+        """
+        executable = Path(mineru_path)
+        api_base = mineru_api.rstrip("/")
+        server_base = mineru_server_url.rstrip("/")
+
+        self.mineru_path = executable
+        self.mineru_api = api_base
+        self.mineru_server_url = server_base
+        # Executable mode by default; check_installation() may switch to the API.
+        self.using_api = False
+        self.logger = logging.getLogger(type(self).__name__)
+
+    def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
+        """Extract a zip archive while dropping one leading root folder from member paths.
+
+        Args:
+            zip_path: Path of the archive to read.
+            extract_to: Directory that receives the extracted files.
+            root_dir: Root folder prefix ending in "/" to strip. When falsy,
+                the first archive entry is used if it is a directory entry.
+
+        If no usable root folder is known the archive is extracted unchanged
+        with ``ZipFile.extractall``. Otherwise members are written manually;
+        any member whose resolved target would land outside ``extract_to``
+        (e.g. a name containing "..") is skipped with a warning, because the
+        manual path handling has none of ``extractall``'s sanitising.
+        """
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            if not root_dir:
+                files = zip_ref.namelist()
+                root_dir = files[0] if files and files[0].endswith("/") else None
+
+            if not root_dir or not root_dir.endswith("/"):
+                self.logger.info(f"[MinerU] No root directory found, extracting all... (root_dir={root_dir})")
+                zip_ref.extractall(extract_to)
+                return
+
+            base_dir = os.path.realpath(extract_to)
+            # join(base, "") appends exactly one separator, also for "/" itself.
+            base_prefix = os.path.join(base_dir, "")
+            root_len = len(root_dir)
+            for member in zip_ref.infolist():
+                filename = member.filename
+                if filename == root_dir:
+                    self.logger.info("[MinerU] Ignore root folder...")
+                    continue
+
+                path = filename[root_len:] if filename.startswith(root_dir) else filename
+                full_path = os.path.join(extract_to, path)
+
+                resolved = os.path.realpath(full_path)
+                if resolved != base_dir and not resolved.startswith(base_prefix):
+                    self.logger.warning(f"[MinerU] Skip unsafe zip member: {filename}")
+                    continue
+
+                if member.is_dir():
+                    os.makedirs(full_path, exist_ok=True)
+                else:
+                    os.makedirs(os.path.dirname(full_path), exist_ok=True)
+                    with open(full_path, "wb") as f:
+                        f.write(zip_ref.read(filename))
+
+    def _is_http_endpoint_valid(self, url, timeout=5):
+        """Probe ``url`` with a HEAD request (redirects followed).
+
+        Returns:
+            True when the final status code is 200, 301, 302, 307 or 308;
+            False otherwise, including when the request raises.
+        """
+        accepted = (200, 301, 302, 307, 308)
+        try:
+            status = requests.head(url, timeout=timeout, allow_redirects=True).status_code
+        except Exception:
+            # Any failure to reach the endpoint counts as "not valid".
+            return False
+        return status in accepted
+
+    def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
+        """Check whether MinerU can be used with the requested backend.
+
+        The checks run in this order:
+
+        1. ``backend`` must be one of the supported names.
+        2. For ``vlm-http-client`` with a server URL, only that server's
+           ``/openapi.json`` is probed.
+        3. The local executable is run with ``--version``.
+        4. If the executable check fails, ``<mineru_api>/openapi.json`` is
+           probed; on success ``self.using_api`` is switched on.
+
+        Args:
+            backend: MinerU backend name.
+            server_url: Server for the ``vlm-http-client`` backend; defaults
+                to ``self.mineru_server_url`` when None.
+
+        Returns:
+            ``(available, reason)``; ``reason`` is empty on success and
+            describes the failure otherwise.
+        """
+        reason = ""
+
+        valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
+        if backend not in valid_backends:
+            # Needs the f-prefix, otherwise the placeholders are reported verbatim.
+            reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
+            logging.warning(reason)
+            return False, reason
+
+        subprocess_kwargs = {
+            "capture_output": True,
+            "text": True,
+            "check": True,
+            "encoding": "utf-8",
+            "errors": "ignore",
+        }
+
+        if platform.system() == "Windows":
+            subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
+
+        if server_url is None:
+            server_url = self.mineru_server_url
+
+        if backend == "vlm-http-client" and server_url:
+            try:
+                server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
+                logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
+                if server_accessible:
+                    self.using_api = False  # We are using http client, not API
+                    return True, reason
+                reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
+                logging.warning(reason)
+                return False, reason
+            except Exception as e:
+                logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
+                # Fall back to a plain GET on the server root.
+                try:
+                    response = requests.get(server_url, timeout=5)
+                    logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
+                    self.using_api = False
+                    return True, reason
+                except Exception as e:
+                    reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
+                    logging.warning(reason)
+                    return False, reason
+
+        # Kept separately so a later successful API check still returns an empty reason.
+        executable_failure = ""
+        try:
+            result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
+            version_info = result.stdout.strip()
+            if version_info:
+                logging.info(f"[MinerU] Detected version: {version_info}")
+            else:
+                logging.info("[MinerU] Detected MinerU, but version info is empty.")
+            return True, reason
+        except subprocess.CalledProcessError as e:
+            executable_failure = f"[MinerU] Execution failed (exit code {e.returncode})."
+            logging.warning(executable_failure)
+        except FileNotFoundError:
+            executable_failure = "[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'"
+            logging.warning(executable_failure)
+        except Exception as e:
+            executable_failure = f"[MinerU] Unexpected error during installation check: {e}"
+            logging.error(executable_failure)
+
+        # If executable check fails, try API check
+        try:
+            if self.mineru_api:
+                # check openapi.json
+                openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
+                if not openapi_exists:
+                    reason = "[MinerU] Failed to detect valid MinerU API server"
+                    return openapi_exists, reason
+                logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
+                self.using_api = openapi_exists
+                return openapi_exists, reason
+            else:
+                logging.info("[MinerU] api not exists.")
+        except Exception as e:
+            reason = f"[MinerU] Unexpected error during api check: {e}"
+            logging.error(reason)
+        # Without an API-level reason, report why the executable check failed.
+        return False, reason or executable_failure
+
+    def _run_mineru(
+        self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
+    ):
+        """Run MinerU through the HTTP API or the local executable.
+
+        The route is chosen by ``self.using_api``. ``server_url`` is forwarded
+        to the executable route only.
+        """
+        if not self.using_api:
+            self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
+            return
+        self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
+    def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
+        """Parse a PDF through the MinerU HTTP API and unpack the returned zip.
+
+        The file is posted to ``<mineru_api>/file_parse``. A zip response is
+        saved as ``output.zip`` in ``output_dir`` and extracted into
+        ``output_dir/<pdf stem>/<method>`` with the archive's ``<pdf stem>/``
+        root folder removed. A non-zip response is only logged as a warning.
+
+        Args:
+            input_path: PDF to upload.
+            output_dir: Directory receiving the zip and the extracted files.
+            method: Parse method sent to the API; also the output sub-folder name.
+            backend: Backend name sent to the API.
+            lang: Language hint sent as ``lang_list``.
+            callback: Optional ``callback(progress, message)`` progress hook.
+
+        Raises:
+            RuntimeError: If the PDF does not exist, or if the request or the
+                unpacking fails (the original exception is chained as cause).
+        """
+        OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
+
+        pdf_file_path = str(input_path)
+
+        if not os.path.exists(pdf_file_path):
+            raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
+
+        pdf_file_name = Path(pdf_file_path).stem.strip()
+        output_path = os.path.join(str(output_dir), pdf_file_name, method)
+        os.makedirs(output_path, exist_ok=True)
+
+        data = {
+            "output_dir": "./output",
+            "lang_list": lang,
+            "backend": backend,
+            "parse_method": method,
+            "formula_enable": True,
+            "table_enable": True,
+            "server_url": None,
+            "return_md": True,
+            "return_middle_json": True,
+            "return_model_output": True,
+            "return_content_list": True,
+            "return_images": True,
+            "response_format_zip": True,
+            "start_page_id": 0,
+            "end_page_id": 99999,
+        }
+
+        headers = {"Accept": "application/json"}
+        # The context manager closes the upload handle on every path.
+        with open(pdf_file_path, "rb") as pdf_stream:
+            files = {"files": (pdf_file_name + ".pdf", pdf_stream, "application/pdf")}
+            try:
+                self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
+                if callback:
+                    callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
+                response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
+
+                response.raise_for_status()
+                if response.headers.get("Content-Type") == "application/zip":
+                    self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+
+                    if callback:
+                        callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+
+                    with open(OUTPUT_ZIP_PATH, "wb") as f:
+                        f.write(response.content)
+
+                    self.logger.info(f"[MinerU] Unzip to {output_path}...")
+                    self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
+
+                    if callback:
+                        callback(0.40, f"[MinerU] Unzip to {output_path}...")
+                else:
+                    self.logger.warning("[MinerU] not zip returned from api：%s " % response.headers.get("Content-Type"))
+            except Exception as e:
+                # Chain the cause so the original traceback is not lost.
+                raise RuntimeError(f"[MinerU] api failed with exception {e}") from e
+        self.logger.info("[MinerU] Api completed successfully.")
+
+    def _run_mineru_executable(
+        self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
+    ):
+        """Run the MinerU command line tool and relay its output to the logger.
+
+        stdout lines are logged at INFO level and stderr lines at WARNING
+        level while the process runs. Lines still pending when the process
+        exits are logged as well.
+
+        Args:
+            input_path: PDF handed to ``-p``.
+            output_dir: Directory handed to ``-o``.
+            method: Value for ``-m``.
+            backend: Value for ``-b`` (omitted when falsy).
+            lang: Value for ``-l`` (omitted when falsy).
+            server_url: Value for ``-u``; only used with the
+                ``vlm-http-client`` backend.
+            callback: Accepted for signature parity; not used by this method.
+
+        Raises:
+            RuntimeError: If the process exits with a non-zero code.
+        """
+        cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
+        if backend:
+            cmd.extend(["-b", backend])
+        if lang:
+            cmd.extend(["-l", lang])
+        if server_url and backend == "vlm-http-client":
+            cmd.extend(["-u", server_url])
+
+        self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
+
+        subprocess_kwargs = {
+            "stdout": subprocess.PIPE,
+            "stderr": subprocess.PIPE,
+            "text": True,
+            "encoding": "utf-8",
+            "errors": "ignore",
+            "bufsize": 1,
+        }
+
+        if platform.system() == "Windows":
+            subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
+
+        process = subprocess.Popen(cmd, **subprocess_kwargs)
+        stdout_queue, stderr_queue = Queue(), Queue()
+
+        def enqueue_output(pipe, queue, prefix):
+            for line in iter(pipe.readline, ""):
+                if line.strip():
+                    queue.put((prefix, line.strip()))
+            pipe.close()
+
+        def drain_queues():
+            # Log everything currently queued without blocking.
+            for q in (stdout_queue, stderr_queue):
+                while True:
+                    try:
+                        prefix, line = q.get_nowait()
+                    except Empty:
+                        break
+                    if prefix == "STDOUT":
+                        self.logger.info(f"[MinerU] {line}")
+                    else:
+                        self.logger.warning(f"[MinerU] {line}")
+
+        readers = [
+            threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True),
+            threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True),
+        ]
+        for reader in readers:
+            reader.start()
+
+        while process.poll() is None:
+            drain_queues()
+            time.sleep(0.1)
+
+        return_code = process.wait()
+        # Let the readers hit EOF, then flush what arrived after the last poll;
+        # the timeout keeps a pipe held open by a grandchild from blocking forever.
+        for reader in readers:
+            reader.join(timeout=5)
+        drain_queues()
+
+        if return_code != 0:
+            raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
+        self.logger.info("[MinerU] Command completed successfully.")
+
+    def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
+        """Render the requested PDF pages to images and keep them on the instance.
+
+        Args:
+            fnm: Path (``str`` / ``PathLike``) or raw PDF bytes.
+            zoomin: Multiplier applied to the base resolution of 72.
+            page_from: First page index to render.
+            page_to: End page index (exclusive).
+            callback: Not used by this method.
+
+        On success ``self.page_images`` holds one image per rendered page. On
+        any error the exception is logged, ``self.page_images`` is set to None
+        and ``self.total_page`` to 0.
+        """
+        self.page_from, self.page_to = page_from, page_to
+        try:
+            source = fnm if isinstance(fnm, (str, PathLike)) else BytesIO(fnm)
+            with pdfplumber.open(source) as pdf:
+                self.pdf = pdf
+                rendered = []
+                for page in self.pdf.pages[page_from:page_to]:
+                    rendered.append(page.to_image(resolution=72 * zoomin, antialias=True).original)
+                self.page_images = rendered
+        except Exception as e:
+            self.page_images = None
+            self.total_page = 0
+            logging.exception(e)
+
+    def _line_tag(self, bx):
+        """Build the ``@@page\\tx0\\tx1\\ttop\\tbottom##`` position tag for a block.
+
+        Args:
+            bx: Block dict with ``page_idx`` (0-based) and ``bbox`` given as
+                ``(x0, top, x1, bottom)``.
+
+        When a rendered image exists for the block's page, the bbox values are
+        divided by 1000 and multiplied by the image width / height; otherwise
+        they are emitted unchanged. The page number in the tag is 1-based.
+        """
+        page_idx = bx["page_idx"]
+        x0, top, x1, bott = bx["bbox"]
+
+        images = getattr(self, "page_images", None)
+        if images and len(images) > page_idx:
+            page_width, page_height = images[page_idx].size
+            x0, x1 = (x0 / 1000.0) * page_width, (x1 / 1000.0) * page_width
+            top, bott = (top / 1000.0) * page_height, (bott / 1000.0) * page_height
+
+        return f"@@{page_idx + 1}\t{x0:.1f}\t{x1:.1f}\t{top:.1f}\t{bott:.1f}##"
+
+    def crop(self, text, ZM=1, need_position=False):
+        """Cut the regions referenced by the position tags in ``text`` out of the page images.
+
+        The tagged regions are stacked vertically on one canvas, preceded and
+        followed by a strip of surrounding page content that is blended with a
+        half-transparent overlay.
+
+        Args:
+            text: Text containing ``@@...##`` tags as read by ``extract_positions``.
+            ZM: Not used by this implementation.
+            need_position: When True, also return the cropped rectangles.
+
+        Returns:
+            The composed image, or ``(image, positions)`` when ``need_position``
+            is True, where each position is ``(page, x0, x1, y0, y1)`` with the
+            page offset by ``self.page_from``. ``None`` (or ``(None, None)``)
+            when ``text`` holds no tags.
+        """
+        imgs = []
+        poss = self.extract_positions(text)
+        if not poss:
+            if need_position:
+                return None, None
+            return
+
+        # All crops share one width: the widest tagged box, but at least 6.
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
+        GAP = 6
+        # Context strip above the first box: up to 120 units, ending GAP above its top.
+        pos = poss[0]
+        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        # Context strip below the last box, clamped to the page image height.
+        pos = poss[-1]
+        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
+
+        positions = []
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            right = left + max_width
+
+            # Guarantee a non-empty crop height.
+            if bottom <= top:
+                bottom = top + 2
+
+            # For boxes spanning several pages, extend bottom by page heights.
+            # NOTE(review): this indexes page_images with pn - 1 while the loop
+            # below uses pn - confirm which page height is intended.
+            for pn in pns[1:]:
+                bottom += self.page_images[pn - 1].size[1]
+
+            img0 = self.page_images[pns[0]]
+            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
+            crop0 = img0.crop((x0, y0, x1, y1))
+            imgs.append(crop0)
+            # Only real boxes are reported; index 0 and the last index are the context strips.
+            if 0 < ii < len(poss) - 1:
+                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
+
+            # Remaining height carries over to the following pages, cropped from their top.
+            bottom -= img0.size[1]
+            for pn in pns[1:]:
+                page = self.page_images[pn]
+                x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
+                cimgp = page.crop((x0, y0, x1, y1))
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn + self.page_from, x0, x1, y0, y1))
+                bottom -= page.size[1]
+
+        if not imgs:
+            if need_position:
+                return None, None
+            return
+
+        # Canvas tall enough for every crop plus a GAP after each one.
+        height = 0
+        for img in imgs:
+            height += img.size[1] + GAP
+        height = int(height)
+        width = int(np.max([i.size[0] for i in imgs]))
+        pic = Image.new("RGB", (width, height), (245, 245, 245))
+        height = 0
+        for ii, img in enumerate(imgs):
+            # First and last crop are the context strips: composite an
+            # alpha-128 black overlay over them.
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(height)))
+            height += img.size[1] + GAP
+
+        if need_position:
+            return pic, positions
+        return pic
+
+    @staticmethod
+    def extract_positions(txt: str):
+        """Parse every ``@@pages\\tleft\\tright\\ttop\\tbottom##`` tag found in ``txt``.
+
+        Returns:
+            A list of ``(page_indexes, left, right, top, bottom)`` tuples where
+            ``page_indexes`` are the dash-separated page numbers of the tag
+            converted to 0-based ints and the coordinates are floats.
+        """
+        parsed = []
+        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
+            pages, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
+            coords = tuple(float(value) for value in (left, right, top, bottom))
+            page_indexes = [int(page) - 1 for page in pages.split("-")]
+            parsed.append((page_indexes, *coords))
+        return parsed
+
+    def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
+        """Load ``<file_stem>_content_list.json`` from MinerU's output folder.
+
+        The file is looked up in ``output_dir/file_stem/method``, or in
+        ``output_dir/file_stem/vlm`` when ``backend`` starts with ``"vlm-"``.
+        Non-empty ``img_path``, ``table_img_path`` and ``equation_img_path``
+        values are rewritten to resolved paths under that folder.
+
+        Raises:
+            FileNotFoundError: If the content list file does not exist.
+        """
+        stem_dir = output_dir / file_stem
+        subdir = stem_dir / method
+        if backend.startswith("vlm-"):
+            subdir = stem_dir / "vlm"
+
+        json_file = subdir / f"{file_stem}_content_list.json"
+        if not json_file.exists():
+            raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
+
+        with json_file.open("r", encoding="utf-8") as handle:
+            data = json.load(handle)
+
+        path_keys = ("img_path", "table_img_path", "equation_img_path")
+        for item in data:
+            for key in path_keys:
+                if key in item and item[key]:
+                    # Paths in the JSON are relative to the output sub-folder.
+                    item[key] = str((subdir / item[key]).resolve())
+        return data
+
+    def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
+        """Convert MinerU content blocks into ``(text, position_tag)`` sections.
+
+        Args:
+            outputs: Blocks as returned by ``_read_output``.
+
+        Returns:
+            One ``(section_text, tag)`` tuple per block that yields non-empty
+            text, where ``tag`` comes from ``_line_tag``. Discarded blocks and
+            blocks of an unknown type yield nothing.
+        """
+        sections = []
+        for output in outputs:
+            # Reset for every block: otherwise a discarded/unknown block would
+            # re-emit the previous block's text, or hit an unbound variable
+            # when it is the first block.
+            section = ""
+            match output.get("type"):
+                case MinerUContentType.TEXT | MinerUContentType.EQUATION:
+                    section = output.get("text", "")
+                case MinerUContentType.TABLE:
+                    section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
+                    if not section.strip():
+                        section = "FAILED TO PARSE TABLE"
+                case MinerUContentType.IMAGE:
+                    section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", []))
+                case MinerUContentType.CODE:
+                    section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
+                case MinerUContentType.LIST:
+                    section = "\n".join(output.get("list_items", []))
+                case MinerUContentType.DISCARDED:
+                    pass
+
+            if section:
+                sections.append((section, self._line_tag(output)))
+        return sections
+
+    def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
+        """Return the table list for ``parse_pdf``; always a new empty list."""
+        no_tables = []
+        return no_tables
+
+    def parse_pdf(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes,
+        callback: Optional[Callable] = None,
+        *,
+        output_dir: Optional[str] = None,
+        backend: str = "pipeline",
+        lang: Optional[str] = None,
+        method: str = "auto",
+        server_url: Optional[str] = None,
+        delete_output: bool = True,
+    ) -> tuple:
+        """Parse a PDF with MinerU and return its sections and tables.
+
+        Args:
+            filepath: Path of the PDF. Its stem (spaces removed) names the
+                file given to MinerU. Without ``binary`` the file itself is
+                read, and it is renamed on disk if its name contains spaces.
+            binary: PDF content as ``bytes`` or ``BytesIO``. When truthy it is
+                written to a temporary file that is removed afterwards.
+            callback: Optional ``callback(progress, message)`` progress hook;
+                called with ``-1`` when the PDF cannot be found.
+            output_dir: Directory for MinerU output; a temporary directory is
+                created when omitted.
+            backend: MinerU backend name.
+            lang: Optional language hint.
+            method: MinerU parse method.
+            server_url: Server for the ``vlm-http-client`` backend.
+            delete_output: Remove the temporary output directory afterwards
+                (a caller-supplied ``output_dir`` is never removed).
+
+        Returns:
+            ``(sections, tables)`` from ``_transfer_to_sections`` and
+            ``_transfer_to_tables``.
+
+        Raises:
+            FileNotFoundError: If no binary is given and the PDF does not
+                exist, or if MinerU produced no content list.
+        """
+        import shutil
+
+        temp_pdf = None
+        out_dir = None
+        created_tmp_dir = False
+
+        # remove spaces, or mineru crash, and _read_output fail too
+        file_path = Path(filepath)
+        pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
+        pdf_file_path_valid = file_path.parent / pdf_file_name
+
+        try:
+            if binary:
+                # The annotation allows BytesIO, which cannot be passed to write() directly.
+                payload = binary.getvalue() if isinstance(binary, BytesIO) else binary
+                temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
+                temp_pdf = temp_dir / pdf_file_name
+                with open(temp_pdf, "wb") as f:
+                    f.write(payload)
+                pdf = temp_pdf
+                self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
+                if callback:
+                    callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
+            else:
+                # Compare as Path objects so "a.pdf" and "./a.pdf" count as the
+                # same file; skip the rename when the source is already gone.
+                if pdf_file_path_valid != file_path and file_path.exists():
+                    self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
+                    shutil.move(str(file_path), str(pdf_file_path_valid))
+                pdf = pdf_file_path_valid
+                if not pdf.exists():
+                    if callback:
+                        callback(-1, f"[MinerU] PDF not found: {pdf}")
+                    raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")
+
+            if output_dir:
+                out_dir = Path(output_dir)
+                out_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
+                created_tmp_dir = True
+
+            self.logger.info(f"[MinerU] Output directory: {out_dir}")
+            if callback:
+                callback(0.15, f"[MinerU] Output directory: {out_dir}")
+
+            # Page images let _line_tag scale bounding boxes to page size.
+            self.__images__(pdf, zoomin=1)
+
+            self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
+            outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
+            self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
+            if callback:
+                callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
+            return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
+        finally:
+            # Best-effort cleanup; it also runs when setup fails half-way.
+            if temp_pdf is not None:
+                shutil.rmtree(temp_pdf.parent, ignore_errors=True)
+            if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
+                shutil.rmtree(out_dir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    # Manual smoke test; the PDF path is the first command line argument
+    # (the previous hard-coded empty path always failed in open()).
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} <pdf-path>")
+
+    parser = MinerUParser("mineru")
+    ok, reason = parser.check_installation()
+    print("MinerU available:", ok)
+    if not ok:
+        sys.exit(reason or "[MinerU] not available")
+
+    filepath = sys.argv[1]
+    with open(filepath, "rb") as file:
+        outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
+    # parse_pdf returns (sections, tables); print both parts.
+    for output in outputs:
+        print(output)
--- a/app/core/rag/deepdoc/parser/pdf_parser.py
+++ b/app/core/rag/deepdoc/parser/pdf_parser.py
--- a/app/core/rag/deepdoc/parser/ppt_parser.py
+++ b/app/core/rag/deepdoc/parser/ppt_parser.py
@@ -0,0 +1,83 @@
+import logging
+from io import BytesIO
+from pptx import Presentation
+
+
+class RAGPptParser:
+    """Extract plain text from a PowerPoint presentation, one string per slide.
+
+    Calling an instance loads the presentation, walks the shapes of every slide
+    in the requested range in top-to-bottom / left-to-right order, and returns
+    the text collected from text frames, tables and (recursively) group shapes.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    @staticmethod
+    def __position_key(shape):
+        """Return the reading-order sort key of ``shape``.
+
+        Shapes are bucketed by ``top // 10`` so that shapes on roughly the same
+        row are ordered by ``left``. A missing (``None``) coordinate is treated
+        as 0 so that shapes without a position never break the sort.
+        """
+        top = shape.top if shape.top is not None else 0
+        left = shape.left if shape.left is not None else 0
+        return (top // 10, left)
+
+    def __get_bulleted_text(self, paragraph):
+        """Return the paragraph text, prefixed as a bullet when it is bulleted.
+
+        A paragraph counts as bulleted when its properties element contains an
+        ``a:buChar``, ``a:buAutoNum`` or ``a:buBlip`` child. Bulleted text is
+        indented by two spaces per ``paragraph.level`` and prefixed with ``.``.
+        """
+        bullet_queries = (
+            "./a:pPr/a:buChar",
+            "./a:pPr/a:buAutoNum",
+            "./a:pPr/a:buBlip",
+        )
+        is_bulleted = any(paragraph._p.xpath(query) for query in bullet_queries)
+        if is_bulleted:
+            return f"{'  '* paragraph.level}.{paragraph.text}"
+        return paragraph.text
+
+    def __extract(self, shape):
+        """Return the text carried by ``shape``, or ``""`` when there is none.
+
+        Text frames are handled first; otherwise tables and group shapes are
+        handled based on ``shape.shape_type``. Any exception raised while
+        processing the shape is logged and converted into an empty string.
+        """
+        try:
+            # First try to get text content
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
+                texts = [
+                    self.__get_bulleted_text(paragraph)
+                    for paragraph in shape.text_frame.paragraphs
+                    if paragraph.text.strip()
+                ]
+                return "\n".join(texts)
+
+            # Safely get shape_type
+            try:
+                shape_type = shape.shape_type
+            except NotImplementedError:
+                # If shape_type is not available, try to get text content
+                if hasattr(shape, 'text'):
+                    return shape.text.strip()
+                return ""
+
+            # Handle table (19; presumably MSO_SHAPE_TYPE.TABLE - confirm)
+            if shape_type == 19:
+                tb = shape.table
+                rows = []
+                # Row 0 is used as the header: every later row is rendered as
+                # "header: value" pairs joined by "; ".
+                for i in range(1, len(tb.rows)):
+                    # NOTE(review): the filter tests the cell object itself,
+                    # not its text; it looks always truthy - confirm.
+                    rows.append("; ".join([tb.cell(
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                return "\n".join(rows)
+
+            # Handle group shape (6; presumably MSO_SHAPE_TYPE.GROUP - confirm)
+            if shape_type == 6:
+                texts = []
+                # Use the None-safe key: children without a position used to
+                # raise TypeError here, which discarded the whole group's text.
+                for child in sorted(shape.shapes, key=self.__position_key):
+                    t = self.__extract(child)
+                    if t:
+                        texts.append(t)
+                return "\n".join(texts)
+
+            return ""
+
+        except Exception as e:
+            logging.error(f"Error processing shape: {str(e)}")
+            return ""
+
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        """Return the text of slides ``from_page`` (inclusive) to ``to_page`` (exclusive).
+
+        Args:
+            fnm: Path of the presentation (``str``), or its raw content, which
+                is wrapped in ``BytesIO`` before loading.
+            from_page: 0-based index of the first slide to include.
+            to_page: 0-based index of the first slide to exclude.
+            callback: Accepted but not used by this method.
+
+        Returns:
+            A list with one newline-joined string per processed slide.
+
+        Side effects:
+            Sets ``self.total_page`` to the number of slides in the file.
+        """
+        ppt = Presentation(fnm) if isinstance(
+            fnm, str) else Presentation(
+            BytesIO(fnm))
+        txts = []
+        self.total_page = len(ppt.slides)
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
+            texts = []
+            for shape in sorted(slide.shapes, key=self.__position_key):
+                try:
+                    txt = self.__extract(shape)
+                    if txt:
+                        texts.append(txt)
+                except Exception as e:
+                    logging.exception(e)
+            txts.append("\n".join(texts))
+
+        return txts
--- a/app/core/rag/deepdoc/parser/txt_parser.py
+++ b/app/core/rag/deepdoc/parser/txt_parser.py
@@ -0,0 +1,48 @@
+import re
+
+from .utils import get_text
+from app.core.rag.common.token_utils import num_tokens_from_string
+
+
+class RAGTxtParser:
+    """Split plain text into chunks along configurable delimiters."""
+
+    def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。；！？"):
+        """Load the text via ``get_text(fnm, binary)`` and chunk it with ``parser_txt``."""
+        txt = get_text(fnm, binary)
+        return self.parser_txt(txt, chunk_token_num, delimiter)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。；！？"):
+        """Split ``txt`` on ``delimiter`` and merge the pieces into chunks.
+
+        Args:
+            txt: The text to split.
+            chunk_token_num: Token threshold. A new chunk is started only once
+                the current chunk already holds more than this many tokens, so
+                a chunk may exceed the threshold.
+            delimiter: Delimiter specification. Every character is a delimiter
+                on its own; a run wrapped in backticks (e.g. `` `##` ``) is one
+                multi-character delimiter. Backslash escapes written literally
+                (such as a two-character ``\\n``) are interpreted.
+
+        Returns:
+            A list of ``[chunk_text, ""]`` pairs. The delimiters themselves are
+            not included in the chunk text.
+
+        Raises:
+            TypeError: If ``txt`` is not a ``str``.
+        """
+        if not isinstance(txt, str):
+            raise TypeError("txt type should be str!")
+        cks = [""]
+        tk_nums = [0]
+        try:
+            delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
+        except UnicodeError:
+            # The round-trip above cannot represent escapes that resolve to a
+            # character outside Latin-1 (e.g. a literal "\u3002"). Fall back to
+            # a codec pair that keeps such characters instead of failing.
+            delimiter = delimiter.encode('raw_unicode_escape').decode('unicode_escape')
+
+        # Collect the delimiters: backtick-quoted runs as whole strings, every
+        # other character individually.
+        dels = []
+        s = 0
+        for m in re.finditer(r"`([^`]+)`", delimiter):
+            f, t = m.span()
+            dels.append(m.group(1))
+            dels.extend(list(delimiter[s: f]))
+            s = t
+        if s < len(delimiter):
+            dels.extend(list(delimiter[s:]))
+        dels = "|".join(re.escape(d) for d in dels if d)
+
+        # Group the alternation so that ^ and $ apply to every delimiter, not
+        # just to the first and the last alternative.
+        delimiter_only = re.compile(f"^(?:{dels})$")
+
+        # The capturing group makes re.split return the delimiters as separate
+        # items, which are then skipped.
+        for sec in re.split(r"(%s)" % dels, txt):
+            if delimiter_only.match(sec):
+                continue
+            tnum = num_tokens_from_string(sec)
+            if tk_nums[-1] > chunk_token_num:
+                cks.append(sec)
+                tk_nums.append(tnum)
+            else:
+                cks[-1] += sec
+                tk_nums[-1] += tnum
+
+        return [[c, ""] for c in cks]
--- a/app/core/rag/deepdoc/parser/utils.py
+++ b/app/core/rag/deepdoc/parser/utils.py
@@ -0,0 +1,16 @@
+from app.core.rag.nlp import find_codec
+
+
+def get_text(fnm: str, binary=None) -> str:
+    """Return the text of ``binary`` when given, otherwise of the file ``fnm``.
+
+    Args:
+        fnm: Path of a text file. It is only opened when ``binary`` is falsy
+            (``None`` or empty).
+        binary: Optional raw bytes. The codec is chosen by ``find_codec`` and
+            bytes that cannot be decoded are dropped (``errors="ignore"``).
+
+    Returns:
+        The decoded text.
+    """
+    if binary:
+        encoding = find_codec(binary)
+        return binary.decode(encoding, errors="ignore")
+    # No encoding is passed, so open() uses its default text encoding.
+    # Read in one call instead of concatenating line by line.
+    with open(fnm, "r") as f:
+        return f.read()