feat: Add base project structure with API and web components

2025-12-02 20:28:01 +08:00
parent f3de6d6cc9
commit c1adc62ec6
817 changed files with 111226 additions and 106 deletions
--- a/api/app/core/rag/app/init.py
+++ b/api/app/core/rag/app/init.py
--- a/api/app/core/rag/app/audio.py
+++ b/api/app/core/rag/app/audio.py
@@ -0,0 +1,42 @@
+import os
+import re
+import tempfile
+
+from app.core.rag.nlp import rag_tokenizer, tokenize
+
+
+def chunk(filename, binary, lang, callback=None, seq2txt_mdl=None, **kwargs):
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+    try:
+        _, ext = os.path.splitext(filename)
+        if not ext:
+            raise RuntimeError("No extension detected.")
+
+        if ext not in [".da", ".wave", ".wav", ".mp3", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".ape"]:
+            raise RuntimeError(f"Extension {ext} is not supported yet.")
+
+        tmp_path = ""
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
+            tmpf.write(binary)
+            tmpf.flush()
+            tmp_path = os.path.abspath(tmpf.name)
+
+        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
+        ans = seq2txt_mdl.transcription(tmp_path)
+        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+
+        tokenize(doc, ans, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
+    return []
--- a/api/app/core/rag/app/book.py
+++ b/api/app/core/rag/app/book.py
@@ -0,0 +1,170 @@
+import logging
+import re
+from io import BytesIO
+
+from app.core.rag.deepdoc.parser.utils import get_text
+from . import naive
+from .naive import by_plaintext, PARSERS
+from app.core.rag.nlp import bullets_category, is_english,remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
+    tokenize_chunks
+from app.core.rag.nlp import rag_tokenizer
+from app.core.rag.deepdoc.parser import PdfParser, HtmlParser
+from app.core.rag.deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
+from PIL import Image
+
+
+class Pdf(PdfParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback)
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
+        self._filter_forpages()
+        self._merge_with_same_bullet()
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
+
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
+                for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        Since a book is long and not all the parts are useful, if it's a PDF,
+        please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    pdf_parser = None
+    sections, tbls = [], []
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        doc_parser = naive.Docx()
+        # TODO: table of contents need to be removed
+        sections, tbls = doc_parser(
+            filename, binary=binary, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
+        # tbls = [((None, lns), None) for lns in tbls]
+        sections=[(item[0],item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        sections, tables, pdf_parser = parser(
+            filename = filename,
+            binary = binary,
+            from_page = from_page,
+            to_page = to_page,
+            lang = lang,
+            callback = callback,
+            pdf_cls = Pdf,
+            **kwargs
+        )
+
+        if not sections and not tables:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+        
+        callback(0.8, "Finish parsing.")
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = txt.split("\n")
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+    make_colon_as_title(sections)
+    bull = bullets_category(
+        [t for t in random_choices([t for t, _ in sections], k=100)])
+    if bull >= 0:
+        chunks = ["\n".join(ck)
+                  for ck in hierarchical_merge(bull, sections, 5)]
+    else:
+        sections = [s.split("@") for s, _ in sections]
+        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
+        chunks = naive_merge(
+            sections, kwargs.get(
+                "chunk_token_num", 256), kwargs.get(
+                "delimer", "\n。；！？"))
+
+    # is it English
+    # is_english(random_choices([t for t, _ in sections], k=218))
+    eng = lang.lower() == "english"
+
+    res = tokenize_table(tbls, doc, eng)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
--- a/api/app/core/rag/app/laws.py
+++ b/api/app/core/rag/app/laws.py
@@ -0,0 +1,219 @@
+import logging
+import re
+from io import BytesIO
+from docx import Document
+
+from app.core.rag.common.constants import ParserType
+from app.core.rag.deepdoc.parser.utils import get_text
+from app.core.rag.nlp import bullets_category, remove_contents_table, \
+    make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
+from app.core.rag.nlp import rag_tokenizer, Node
+from app.core.rag.deepdoc.parser import PdfParser, DocxParser, HtmlParser
+from app.core.rag.app.naive import by_plaintext, PARSERS
+
+
+
+
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def old_call(self, filename, binary=None, from_page=0, to_page=100000):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        return [line for line in lines if line]
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+            self.doc = Document(
+                filename) if not binary else Document(BytesIO(binary))
+            pn = 0
+            lines = []
+            level_set = set()
+            bull = bullets_category([p.text for p in self.doc.paragraphs])
+            for p in self.doc.paragraphs:
+                if pn > to_page:
+                    break
+                question_level, p_text = docx_question_level(p, bull)
+                if not p_text.strip("\n"):
+                    continue
+                lines.append((question_level, p_text))
+                level_set.add(question_level)
+                for run in p.runs:
+                    if 'lastRenderedPageBreak' in run._element.xml:
+                        pn += 1
+                        continue
+                    if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                        pn += 1
+
+            sorted_levels = sorted(level_set)
+
+            h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
+            h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level
+
+            root = Node(level=0, depth=h2_level, texts=[])
+            root.build_tree(lines)
+
+            return [element for element in root.get_tree() if element]
+
+
+    def __str__(self) -> str:
+        return f'''
+            question:{self.question},
+            answer:{self.answer},
+            level:{self.level},
+            childs:{self.childs}
+        '''
+
+
+class Pdf(PdfParser):
+    def __init__(self):
+        self.model_speciess = ParserType.LAWS.value
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts:".format(
+            ))
+        self._naive_vertical_merge()
+
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
+
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], None
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    pdf_parser = None
+    sections = []
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)
+    
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        raw_sections, tables, pdf_parser = parser(
+            filename = filename,
+            binary = binary,
+            from_page = from_page,
+            to_page = to_page,
+            lang = lang,
+            callback = callback,
+            pdf_cls = Pdf,
+            **kwargs
+        )
+
+        if not raw_sections and not tables:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+        
+        for txt, poss in raw_sections:
+            sections.append(txt + poss)
+
+        callback(0.8, "Finish parsing.")
+    elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = txt.split("\n")
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+
+    # Remove 'Contents' part
+    remove_contents_table(sections, eng)
+
+    make_colon_as_title(sections)
+    bull = bullets_category(sections)
+    res = tree_merge(bull, sections, 2)
+
+
+    if not res:
+        callback(0.99, "No chunk parsed out.")
+
+    return tokenize_chunks(res, doc, eng, pdf_parser)
+
+    # chunks = hierarchical_merge(bull, sections, 5)
+    #     return tokenize_chunks(["\n".join(ck)for ck in chunks], doc, eng, pdf_parser)
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+    chunk(sys.argv[1], callback=dummy)
--- a/api/app/core/rag/app/mail.py
+++ b/api/app/core/rag/app/mail.py
@@ -0,0 +1,114 @@
+import logging
+from email import policy
+from email.parser import BytesParser
+from .naive import chunk as naive_chunk
+import re
+from app.core.rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from app.core.rag.deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    #  get the email main info
+    def _add_content(msg, content_type):
+        def _decode_payload(payload, charset, target_list):
+            try:
+                target_list.append(payload.decode(charset))
+            except (UnicodeDecodeError, LookupError):
+                for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
+                    try:
+                        target_list.append(payload.decode(enc))
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    target_list.append(payload.decode("utf-8", errors="ignore"))
+
+        if content_type == "text/plain":
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, text_txt)
+        elif content_type == "text/html":
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, html_txt)
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=parser_config["chunk_token_num"]) if line
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。；！？"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    logging.debug("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
--- a/api/app/core/rag/app/manual.py
+++ b/api/app/core/rag/app/manual.py
@@ -0,0 +1,299 @@
+import logging
+import copy
+import re
+
+from app.core.rag.common.constants import ParserType
+from io import BytesIO
+from app.core.rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from app.core.rag.common.token_utils import num_tokens_from_string
+from app.core.rag.deepdoc.parser import PdfParser, DocxParser
+from app.core.rag.deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
+from docx import Document
+from PIL import Image
+from .naive import by_plaintext, PARSERS
+
+class Pdf(PdfParser):
+    def __init__(self):
+        self.model_speciess = ParserType.MANUAL.value
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR: {}".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._concat_downward()
+        self._filter_forpages()
+        callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())
+
+        return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
+                for i, b in enumerate(self.boxes)], tbls
+
+
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        try:
+            img = img[0]
+            embed = img.xpath('.//a:blip/@r:embed')[0]
+            related_part = document.part.related_parts[embed]
+            image = related_part.image
+            if image is not None:
+                image = Image.open(BytesIO(image.blob))
+                return image
+            elif related_part.blob is not None:
+                image = Image.open(BytesIO(related_part.blob))
+                return image
+            else:
+                return None
+        except Exception:
+            return None
+
+    def concat_img(self, img1, img2):
+        if img1 and not img2:
+            return img1
+        if not img1 and img2:
+            return img2
+        if not img1 and not img2:
+            return None
+        width1, height1 = img1.size
+        width2, height2 = img2.size
+
+        new_width = max(width1, width2)
+        new_height = height1 + height2
+        new_image = Image.new('RGB', (new_width, new_height))
+
+        new_image.paste(img1, (0, 0))
+        new_image.paste(img2, (0, height1))
+
+        return new_image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []
+        ti_list = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = self.concat_img(last_image, current_image)
+            else:   # is a question
+                if last_answer or last_image:
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                    last_answer, last_image = '', None
+
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                
+        tbls = []
+        for tb in self.doc.tables:
+            html= "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i+1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                        else:
+                            break
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return ti_list, tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Split a manual into tokenized chunks.
+
+        PDF files and files matching ".doc"/".docx" are handled; any other
+        extension raises NotImplementedError.
+
+        For a PDF, sections are grouped into section ids (from the PDF
+        outline when it is dense enough, otherwise from title frequency),
+        sorted by position and merged into chunks. For a DOCX, every
+        (text, image) item returned by the Docx parser becomes one chunk.
+
+        Returns the tokenized table chunks followed by the text chunks, or
+        [] when a PDF yields neither sections nor tables.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    pdf_parser = None
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    # is it English
+    eng = lang.lower() == "english"  # pdf_parser.is_english
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+        # A boolean setting is mapped onto the two named recognizers.
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        # `pdf_parser` first holds the parse function, then (after the call
+        # below) the parser object that function returns.
+        pdf_parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        sections, tbls, pdf_parser = pdf_parser(
+            filename = filename,
+            binary = binary,
+            from_page = from_page,
+            to_page = to_page,
+            lang = lang,
+            callback = callback,
+            pdf_cls = Pdf,
+            **kwargs
+        )
+
+        if not sections and not tbls:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+        
+        callback(0.8, "Finish parsing.")
+
+        # Use the PDF outline for levels only when it has more than 3 entries
+        # per 100 sections; otherwise fall back to title frequency.
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
+            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+            most_level = max(0, max_lvl - 1)
+            levels = []
+            for txt, _, _ in sections:
+                for t, lvl in pdf_parser.outlines:
+                    # Compare character bigrams of the outline title with
+                    # those of the section's leading characters.
+                    tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                    tks_ = set([txt[i] + txt[i + 1]
+                                for i in range(min(len(t), len(txt) - 1))])
+                    if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                        levels.append(lvl)
+                        break
+                else:
+                    # No outline entry matched: one level below the deepest.
+                    levels.append(max_lvl + 1)
+
+        else:
+            bull = bullets_category([txt for txt, _, _ in sections])
+            most_level, levels = title_frequency(
+                bull, [(txt, lvl) for txt, lvl, _ in sections])
+
+        assert len(sections) == len(levels)
+        # Start a new section id whenever a title-level line (level <=
+        # most_level) differs in level from the line before it.
+        sec_ids = []
+        sid = 0
+        for i, lvl in enumerate(levels):
+            if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+                sid += 1
+            sec_ids.append(sid)
+
+        sections = [(txt, sec_ids[i], poss)
+                    for i, (txt, _, poss) in enumerate(sections)]
+        # Tables join the sections with id -1 and page numbers shifted by
+        # 1 - from_page.
+        for (img, rows), poss in tbls:
+            if not rows:
+                continue
+            sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                            [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+
+        def tag(pn, left, right, top, bottom):
+            # An all-zero position produces no tag.
+            if pn + left + right + top + bottom == 0:
+                return ""
+            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+                .format(pn, left, right, top, bottom)
+
+        chunks = []
+        last_sid = -2
+        tk_cnt = 0
+        # Sort by fields 0, 3 and 1 of each section's first position tuple.
+        for txt, sec_id, poss in sorted(sections, key=lambda x: (
+                x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+            poss = "\t".join([tag(*pos) for pos in poss])
+            # Append to the previous chunk while it is tiny (< 32 tokens), or
+            # while it is below 1024 tokens and this piece is a table or
+            # belongs to the same section.
+            if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
+                if chunks:
+                    chunks[-1] += "\n" + txt + poss
+                    tk_cnt += num_tokens_from_string(txt)
+                    continue
+            chunks.append(txt + poss)
+            tk_cnt = num_tokens_from_string(txt)
+            if sec_id > -1:
+                last_sid = sec_id
+        tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
+        res = tokenize_table(tbls, doc, eng)
+        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        return res
+
+    elif re.search(r"\.docx?$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        # NOTE(review): the page range is hard-coded here; the from_page and
+        # to_page arguments of chunk() are not forwarded -- confirm intended.
+        ti_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        tbls=vision_figure_parser_docx_wrapper(sections=ti_list,tbls=tbls,callback=callback,**kwargs)
+        res = tokenize_table(tbls, doc, eng)
+        for text, image in ti_list:
+            # Each item gets its own copy of the base document fields.
+            d = copy.deepcopy(doc)
+            if image:
+                d['image'] = image
+                d["doc_type_kwd"] = "image"
+            tokenize(d, text, eng)
+            res.append(d)
+        return res
+    else:
+        raise NotImplementedError("file type not supported yet(pdf and docx supported)")
+    
+
+if __name__ == "__main__":
+    import sys
+
+
+    def dummy(prog=None, msg=""):
+        pass
+
+
+    chunk(sys.argv[1], callback=dummy)
--- a/api/app/core/rag/app/naive.py
+++ b/api/app/core/rag/app/naive.py
@@ -0,0 +1,849 @@
+import logging
+import re
+import os
+from functools import reduce
+from io import BytesIO
+from timeit import default_timer as timer
+from docx import Document
+from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
+from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
+from docx.opc.oxml import parse_xml
+from markdown import markdown
+from PIL import Image
+import copy
+
+from app.core.rag.llm.cv_model import AzureGptV4, QWenCV
+from app.core.rag.common.file_utils import get_project_base_directory
+from app.core.rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
+from app.core.rag.deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
+from app.core.rag.deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
+from app.core.rag.deepdoc.parser.pdf_parser import PlainParser, VisionParser
+from app.core.rag.deepdoc.parser.mineru_parser import MinerUParser
+from app.core.rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, tokenize, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
+
+def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls=None, **kwargs):
+    """Parse a PDF with ``pdf_cls`` (``Pdf`` when none is given).
+
+    The parser is run on ``binary`` when that is truthy and on ``filename``
+    otherwise; the tables it returns are then passed through
+    ``vision_figure_parser_pdf_wrapper`` together with ``vision_model`` and
+    any extra keyword arguments.
+
+    ``lang`` is accepted but not used by this function.
+
+    Returns:
+        A ``(sections, tables, pdf_parser)`` tuple.
+    """
+    parser_cls = pdf_cls if pdf_cls else Pdf
+    pdf_parser = parser_cls()
+    source = binary if binary else filename
+    sections, tables = pdf_parser(source, from_page=from_page, to_page=to_page, callback=callback)
+    tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, vision_model=vision_model, **kwargs)
+    return sections, tables, pdf_parser
+
+
+def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls=None, **kwargs):
+    """Parse a PDF with ``MinerUParser``, configured from environment variables.
+
+    Environment variables read (with their defaults):
+        MINERU_EXECUTABLE ("mineru"), MINERU_APISERVER
+        ("http://host.docker.internal:9987"), MINERU_OUTPUT_DIR (""),
+        MINERU_BACKEND ("pipeline"), MINERU_DELETE_OUTPUT (1, parsed as int
+        and then converted to bool).
+
+    ``from_page``, ``to_page``, ``lang``, ``vision_model``, ``pdf_cls`` and
+    the extra keyword arguments are accepted but not used by this function.
+
+    Returns:
+        ``(sections, tables, pdf_parser)``; ``sections`` and ``tables`` are
+        both ``None`` when ``check_installation()`` reports failure.
+    """
+    mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
+    mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
+    pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
+
+    if not pdf_parser.check_installation():
+        # callback defaults to None, so only report when one was supplied;
+        # otherwise the sentinel return below would be replaced by a TypeError.
+        if callback:
+            callback(-1, "MinerU not found.")
+        return None, None, pdf_parser
+
+    sections, tables = pdf_parser.parse_pdf(
+        filepath=filename,
+        binary=binary,
+        callback=callback,
+        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+    )
+    return sections, tables, pdf_parser
+
+
+def by_textln(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, vision_model=None, pdf_cls=None, **kwargs):
+    """Parse a PDF for the "textln" layout recognizer.
+
+    NOTE(review): this currently builds a ``MinerUParser`` with
+    ``TEXTLN_APP_ID`` as ``mineru_path`` and ``TEXTLN_APISERVER`` as
+    ``mineru_api``, and reuses the ``MINERU_*`` output/backend variables.
+    The ``TEXTLN_SECRET_CODE`` variable was read by an earlier version of
+    this function but never passed anywhere. This looks like an unfinished
+    adaptation of ``by_mineru`` -- confirm the intended TextLn integration.
+
+    ``from_page``, ``to_page``, ``lang``, ``vision_model``, ``pdf_cls`` and
+    the extra keyword arguments are accepted but not used by this function.
+
+    Returns:
+        ``(sections, tables, pdf_parser)``; ``sections`` and ``tables`` are
+        both ``None`` when ``check_installation()`` reports failure.
+    """
+    textln_app_id = os.environ.get("TEXTLN_APP_ID", "")
+    textln_api = os.environ.get("TEXTLN_APISERVER", "https://api.textin.com/ai/service/v1/pdf_to_markdown")
+    pdf_parser = MinerUParser(mineru_path=textln_app_id, mineru_api=textln_api)
+
+    if not pdf_parser.check_installation():
+        # callback defaults to None, so only report when one was supplied.
+        if callback:
+            callback(-1, "MinerU not found.")
+        return None, None, pdf_parser
+
+    sections, tables = pdf_parser.parse_pdf(
+        filepath=filename,
+        binary=binary,
+        callback=callback,
+        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+    )
+    return sections, tables, pdf_parser
+
+
+def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, vision_model=None, **kwargs):
+    """Parse a PDF with ``PlainParser`` or ``VisionParser``.
+
+    ``PlainParser`` is used when the ``layout_recognizer`` keyword argument
+    equals ``"Plain Text"``; in every other case a ``VisionParser`` is built
+    from ``vision_model`` and the extra keyword arguments. The chosen parser
+    is run on ``binary`` when that is truthy and on ``filename`` otherwise.
+
+    Returns:
+        A ``(sections, tables, pdf_parser)`` tuple.
+    """
+    wants_plain_text = kwargs.get("layout_recognizer", "") == "Plain Text"
+    pdf_parser = PlainParser() if wants_plain_text else VisionParser(vision_model=vision_model, **kwargs)
+    source = binary if binary else filename
+    sections, tables = pdf_parser(source, from_page=from_page, to_page=to_page, callback=callback)
+    return sections, tables, pdf_parser
+
+
+# Lookup table from a lower-cased layout-recognizer name to its PDF parsing function.
+PARSERS = dict(
+    deepdoc=by_deepdoc,
+    mineru=by_mineru,
+    textln=by_textln,
+    plaintext=by_plaintext,  # default
+)
+
+
+class Docx(DocxParser):
+    """DOCX parser that yields (text, image) lines plus HTML-rendered tables."""
+
+    def __init__(self):
+        pass
+
+    def get_picture(self, document, paragraph):
+        """Return the pictures embedded in *paragraph* merged into one RGB image.
+
+        Every ``pic:pic`` element with an ``r:embed`` reference is resolved
+        through ``document.part.related_parts``. Images that cannot be
+        resolved, opened or concatenated are skipped.
+
+        Returns:
+            The single image, the ``concat_img`` combination of several
+            images, or ``None`` when the paragraph yields no usable picture.
+        """
+        imgs = paragraph._element.xpath('.//pic:pic')
+        if not imgs:
+            return None
+        res_img = None
+        for img in imgs:
+            embed = img.xpath('.//a:blip/@r:embed')
+            if not embed:
+                continue
+            try:
+                related_part = document.part.related_parts[embed[0]]
+                image_blob = related_part.image.blob
+            except UnrecognizedImageError:
+                logging.info("Unrecognized image format. Skipping image.")
+                continue
+            except UnexpectedEndOfFileError:
+                logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
+                continue
+            except Exception:
+                # Covers InvalidImageStreamError, UnicodeDecodeError and any
+                # other failure; they all used this same message.
+                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
+                continue
+            try:
+                image = Image.open(BytesIO(image_blob)).convert('RGB')
+                res_img = image if res_img is None else concat_img(res_img, image)
+            except Exception:
+                continue
+
+        return res_img
+
+    def __clean(self, line):
+        """Replace ideographic spaces (U+3000) with ASCII spaces and strip the line."""
+        return re.sub(r"\u3000", " ", line).strip()
+
+    def __get_nearest_title(self, table_index, filename):
+        """Build the heading path that leads to the ``table_index``-th table.
+
+        Walks backwards from the table over the paragraphs that precede it in
+        the document body. The first non-empty paragraph styled
+        ``Heading <n>`` with ``n <= 7`` is the nearest title; each further
+        non-empty heading with a strictly smaller level is one of its
+        parents. The walk stops once a level-1 (or lower) heading is reached.
+
+        Returns:
+            ``"<document name> > <top heading> > ... > <nearest heading>"``,
+            or ``""`` when the body cannot be scanned, the table is not
+            found, or no heading precedes it.
+        """
+        from docx.text.paragraph import Paragraph
+
+        # Document name is the filename without its alphabetic extension.
+        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) or "Untitled Document"
+
+        # Collect paragraphs and tables while keeping their body order.
+        blocks = []
+        try:
+            for i, block in enumerate(self.doc._element.body):
+                if block.tag.endswith('p'):  # Paragraph
+                    blocks.append(('p', i, Paragraph(block, self.doc)))
+                elif block.tag.endswith('tbl'):  # Table (position is all we need)
+                    blocks.append(('t', i, None))
+        except Exception as e:
+            logging.error(f"Error collecting blocks: {e}")
+            return ""
+
+        # Locate the body position of the requested table.
+        target_table_pos = -1
+        table_count = 0
+        for block_type, pos, _ in blocks:
+            if block_type != 't':
+                continue
+            if table_count == table_index:
+                target_table_pos = pos
+                break
+            table_count += 1
+
+        if target_table_pos == -1:
+            return ""  # Target table not found
+
+        titles = []
+        # A heading is accepted only when its level is below this bound:
+        # 8 admits levels up to 7 for the nearest title, after which the bound
+        # drops to the level just accepted so only real parents qualify.
+        level_bound = 8
+        for block_type, pos, block in reversed(blocks):
+            if block_type != 'p' or pos >= target_table_pos:
+                continue
+
+            style = block.style
+            # style.name may be missing; re.search would raise TypeError on None.
+            if not (style and style.name and re.search(r"Heading\s*(\d+)", style.name, re.I)):
+                continue
+
+            try:
+                level_match = re.search(r"(\d+)", style.name)
+                if not level_match:
+                    continue
+                level = int(level_match.group(1))
+                if level >= level_bound:
+                    continue
+                title_text = block.text.strip()
+                if not title_text:  # Avoid empty titles
+                    continue
+                titles.append((level, title_text))
+                level_bound = level
+                if level <= 1:  # Nothing can sit above a top-level heading
+                    break
+            except Exception as e:
+                if titles:
+                    logging.error(f"Error parsing parent heading: {e}")
+                else:
+                    logging.error(f"Error parsing heading level: {e}")
+
+        if not titles:
+            return ""
+
+        # Order from the outermost heading down to the nearest one.
+        titles.sort(key=lambda x: x[0])
+        hierarchy = [doc_name] + [t[1] for t in titles]
+        return " > ".join(hierarchy)
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+        """Parse a DOCX file into text/image lines and HTML tables.
+
+        The document is loaded from ``binary`` when that is truthy and from
+        ``filename`` otherwise. A page counter is advanced on every
+        ``lastRenderedPageBreak`` or explicit page break found in a run, and
+        only paragraphs with ``from_page <= page < to_page`` are kept.
+
+        Returns:
+            ``(lines, tables)`` where ``lines`` is a list of
+            ``(text, image_or_None)`` tuples and ``tables`` is a list of
+            ``((None, html), "")`` tuples, one per table in ``self.doc.tables``.
+        """
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        last_image = None  # picture seen before any text line exists
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page:
+                if p.text.strip():
+                    if p.style and p.style.name == 'Caption':
+                        # A caption takes over the picture attached to the
+                        # previous non-caption line, or the pending one.
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
+                else:
+                    # Text-less paragraph: attach its picture to the previous
+                    # line, or hold it for the first line still to come.
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
+            for run in p.runs:
+                run_xml = run._element.xml  # read once; reused by both checks
+                if 'lastRenderedPageBreak' in run_xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run_xml and 'type="page"' in run_xml:
+                    pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
+
+        tbls = []
+        for table_index, tb in enumerate(self.doc.tables):
+            title = self.__get_nearest_title(table_index, filename)
+            html = "<table>"
+            if title:
+                html += f"<caption>Table Location: {title}</caption>"
+            for r in tb.rows:
+                html += "<tr>"
+                try:
+                    cells = r.cells  # fetched once per row instead of per access
+                    col = 0
+                    while col < len(cells):
+                        span = 1
+                        c = cells[col]
+                        cell_text = c.text
+                        # Consecutive cells with identical text are emitted
+                        # as a single cell with a colspan.
+                        for j in range(col + 1, len(cells)):
+                            if cell_text == cells[j].text:
+                                span += 1
+                                col = j
+                            else:
+                                break
+                        col += 1
+                        html += f"<td>{cell_text}</td>" if span == 1 else f"<td colspan='{span}'>{cell_text}</td>"
+                except Exception as e:
+                    logging.warning(f"Error parsing table, ignore: {e}")
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return new_line, tbls
+
+    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
+        """
+        Convert a DOCX document to markdown text.
+
+        The document is read from ``binary`` when that is truthy and from
+        ``filename`` otherwise, converted to HTML with mammoth and then to
+        markdown with markdownify. When ``inline_images`` is true, images are
+        embedded as base64 ``data:`` URLs.
+
+        This function uses mammoth, licensed under the BSD 2-Clause License.
+        """
+
+        import base64
+        import uuid
+
+        import mammoth
+        from markdownify import markdownify
+
+        docx_file = BytesIO(binary) if binary else open(filename, "rb")
+
+        def _convert_image_to_base64(image):
+            """Return the ``src``/``alt`` attributes for one embedded image."""
+            try:
+                with image.open() as image_file:
+                    image_bytes = image_file.read()
+                encoded = base64.b64encode(image_bytes).decode("utf-8")
+                base64_url = f"data:{image.content_type};base64,{encoded}"
+                alt_name = f"img_{uuid.uuid4().hex[:8]}"
+                return {"src": base64_url, "alt": alt_name}
+            except Exception as e:
+                logging.warning(f"Failed to convert image to base64: {e}")
+                return {"src": "", "alt": "image"}
+
+        try:
+            if inline_images:
+                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
+            else:
+                result = mammoth.convert_to_html(docx_file)
+
+            return markdownify(result.value)
+
+        finally:
+            # Only a handle opened here from a path needs closing.
+            if not binary:
+                docx_file.close()
+
+
+class Pdf(PdfParser):
+    """PDF parser that runs OCR, layout, table and text-merge stages in order."""
+
+    def __init__(self):
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
+        """Run the parsing stages and return the tagged text boxes.
+
+        The input is ``binary`` when that is truthy and ``filename``
+        otherwise. Progress is reported through ``callback`` after each stage.
+
+        Returns:
+            ``(boxes, tbls)`` where ``boxes`` is a list of
+            ``(text, line_tag)`` tuples, or ``(boxes, tbls, figures)`` when
+            ``separate_tables_figures`` is true.
+        """
+        first_start = start = timer()
+        callback(msg="OCR started")
+        source = binary if binary else filename
+        self.__images__(source, zoomin, from_page, to_page, callback)
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
+
+        # Each stage is timed on its own and reported with a fixed progress value.
+        stages = (
+            (0.63, "Layout analysis ({:.2f}s)", lambda: self._layouts_rec(zoomin)),
+            (0.65, "Table analysis ({:.2f}s)", lambda: self._table_transformer_job(zoomin)),
+            (0.67, "Text merged ({:.2f}s)", lambda: self._text_merge(zoomin=zoomin)),
+        )
+        for progress, message, run_stage in stages:
+            start = timer()
+            run_stage()
+            callback(progress, message.format(timer() - start))
+
+        if separate_tables_figures:
+            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
+            self._concat_downward()
+            logging.info("layouts cost: {}s".format(timer() - first_start))
+            tagged_boxes = [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
+            return tagged_boxes, tbls, figures
+
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
+        self._concat_downward()
+        self._final_reading_order_merge()
+        logging.info("layouts cost: {}s".format(timer() - first_start))
+        tagged_boxes = [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
+        return tagged_boxes, tbls
+
+
+class Markdown(MarkdownParser):
+    """Markdown parser that also extracts tables, picture URLs and hyperlinks."""
+
+    def md_to_html(self, sections):
+        """Render markdown to a ``BeautifulSoup`` tree.
+
+        ``sections`` may be a string, or a sequence whose first item is a
+        string (only that first item is rendered).
+
+        Returns:
+            The parsed soup, or ``[]`` when ``sections`` is empty or of an
+            unsupported shape.
+        """
+        if not sections:
+            return []
+        if isinstance(sections, str):
+            text = sections
+        elif isinstance(sections[0], str):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        html_content = markdown(text)
+        return BeautifulSoup(html_content, 'html.parser')
+
+    def get_picture_urls(self, soup):
+        """Return the non-empty ``src`` of every ``<img>`` in *soup*, in order."""
+        if soup:
+            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return []
+
+    def get_hyperlink_urls(self, soup):
+        """Return the set of non-empty ``href`` values of every ``<a>`` in *soup*."""
+        if soup:
+            return {a.get('href') for a in soup.find_all('a') if a.get('href')}
+        # Same type as the populated case so callers always get a set.
+        return set()
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text.
+
+        ``http(s)://`` URLs are fetched with a 30 second timeout and kept
+        only when the response is 200 with an ``image/*`` content type; any
+        other value is treated as a local file path. Failures are logged and
+        the image is skipped.
+
+        NOTE(review): URLs and local paths are taken from the markdown text
+        as-is -- confirm that the markdown comes from a trusted source, or
+        restrict which hosts/paths may be read.
+
+        Returns:
+            A list of RGB images, or ``None`` when no image could be loaded.
+        """
+        import requests
+        soup = self.md_to_html(text)
+        image_urls = self.get_picture_urls(soup)
+        images = []
+        for url in image_urls:
+            if not url:
+                continue
+            try:
+                if url.startswith(('http://', 'https://')):
+                    # The context manager closes the streamed response even
+                    # when its body is never read (non-200 or non-image).
+                    with requests.get(url, stream=True, timeout=30) as response:
+                        content_type = response.headers.get('Content-Type', '')
+                        if response.status_code == 200 and content_type and content_type.startswith('image/'):
+                            img = Image.open(BytesIO(response.content)).convert('RGB')
+                            images.append(img)
+                else:
+                    # Anything else is treated as a local file path.
+                    from pathlib import Path
+                    local_path = Path(url)
+                    if not local_path.exists():
+                        logging.warning(f"Local image file not found: {url}")
+                        continue
+                    img = Image.open(url).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+        """Split a markdown document into element sections and HTML tables.
+
+        The text comes from ``binary`` when that is truthy, otherwise from
+        the file at ``filename``; either way the bytes are decoded with the
+        codec chosen by ``find_codec``, ignoring undecodable bytes.
+
+        Returns:
+            ``(sections, tables)`` where ``sections`` is a list of
+            ``(element, "")`` tuples and ``tables`` is a list of
+            ``((None, html), "")`` tuples.
+        """
+        if not binary:
+            # Read raw bytes so a path gets the same codec detection as
+            # in-memory input, not the platform's default text encoding.
+            with open(filename, "rb") as f:
+                binary = f.read()
+        txt = binary.decode(find_codec(binary), errors="ignore") if binary else ""
+
+        _remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
+        # The extractor runs on the full text, so tables can appear both in
+        # the sections and in the table list. To eliminate duplicate tables,
+        # build the extractor from the remainder instead and have the caller
+        # pass separate_tables=True.
+        extractor = MarkdownElementExtractor(txt)
+        element_sections = extractor.extract_elements(delimiter)
+        sections = [(element, "") for element in element_sections]
+        tbls = [
+            ((None, markdown(table, extensions=['markdown.extensions.tables'])), "")
+            for table in tables
+        ]
+        return sections, tbls
+
+def load_from_xml_v2(baseURI, rels_item_xml):
+    """
+    Build a |_SerializedRelationships| collection from *rels_item_xml*.
+
+    Relationships whose target reference is ``'../NULL'`` or ``'NULL'`` are
+    left out. The collection is empty when *rels_item_xml* is |None|.
+    """
+    srels = _SerializedRelationships()
+    if rels_item_xml is None:
+        return srels
+
+    ignored_targets = ('../NULL', 'NULL')
+    for rel_elm in parse_xml(rels_item_xml).Relationship_lst:
+        if rel_elm.target_ref not in ignored_targets:
+            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
+    return srels
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, vision_model=None, **kwargs):
+    """
+        Supported file formats are docx, doc, pdf, excel, txt, markdown, html, json.
+        This method apply the naive ways to chunk files.
+        Successive text will be sliced into pieces using 'delimiter'.
+        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
+    """
+    urls = set()
+    url_res = []
+
+
+    is_english = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config", {
+            "layout_recognize": "DeepDOC", "chunk_token_num": 512, "delimiter": "\n!?。；！？", "analyze_hyperlink": True})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    res = []
+    pdf_parser = None
+    section_images = None
+
+    is_root = kwargs.get("is_root", True)
+    embed_res = []
+    if is_root:
+        # Only extract embedded files at the root call
+        embeds = []
+        if binary is not None:
+            embeds = extract_embed_file(binary)
+        else:
+            raise Exception("Embedding extraction from file path is not supported.")
+
+        # Recursively chunk each embedded file and collect results
+        for embed_filename, embed_bytes in embeds:
+            try:
+                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, vision_model=vision_model, is_root=False, **kwargs) or []
+                embed_res.extend(sub_res)
+            except Exception as e:
+                if callback:
+                    callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
+                continue
+
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_docx(binary)
+            for index, url in enumerate(urls):
+                html_bytes, metadata = extract_html(url)
+                if not html_bytes:
+                    continue
+                try:
+                    sub_url_res = chunk(url, html_bytes, lang=lang, callback=callback, vision_model=vision_model, is_root=False, **kwargs)
+                except Exception as e:
+                    logging.info(f"Failed to chunk url in registered file type {url}: {e}")
+                    sub_url_res = chunk(f"{index}.html", html_bytes, lang=lang, callback=callback, vision_model=vision_model, is_root=False, **kwargs)
+                url_res.extend(sub_url_res)
+
+        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
+        _SerializedRelationships.load_from_xml = load_from_xml_v2
+        sections, tables = Docx()(filename, binary)
+
+        tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback, vision_model=vision_model, **kwargs)
+
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
+
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。；！？"))
+
+        if kwargs.get("section_only", False):
+            chunks.extend(embed_res)
+            chunks.extend(url_res)
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        logging.info("naive_merge({}): {}".format(filename, timer() - st))
+        res.extend(embed_res)
+        res.extend(url_res)
+        return res
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_pdf(binary)
+
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        sections, tables, pdf_parser = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            vision_model=vision_model,
+            layout_recognizer=layout_recognizer,
+            **kwargs
+        )
+
+        if not sections and not tables:
+            return []
+
+        if name in ["mineru", "textln"]:
+            parser_config["chunk_token_num"] = 0
+
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.pptx?$", filename, re.IGNORECASE):
+        if not binary:
+            with open(filename, "rb") as f:
+                binary = f.read()
+        from app.core.rag.app.presentation import Ppt
+        ppt_parser = Ppt()
+        for pn, (txt, img) in enumerate(ppt_parser(
+                filename if not binary else binary, from_page, to_page, callback)):
+            d = copy.deepcopy(doc)
+            pn += from_page
+            d["image"] = img
+            d["doc_type_kwd"] = "image"
+            d["page_num_int"] = [pn + 1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, is_english)
+            res.append(d)
+        return res
+
+    elif re.search(r"\.(da|wave|wav|mp3|aac|flac|ogg|aiff|au|midi|wma|realaudio|vqf|oggvorbis|ape?)$", filename, re.IGNORECASE):
+        if not binary:
+            with open(filename, "rb") as f:
+                binary = f.read()
+        from app.core.rag.app.audio import chunk as parser
+        return parser(filename, binary, lang=lang, callback=callback, seq2txt_mdl=vision_model, **kwargs)
+
+    elif re.search(r"\.(png|jpeg|jpg|gif|bmp|svg|mp4|mov|avi|flv|mpeg|mpg|webm|wmv|3gp|3gpp|mkv?)$", filename, re.IGNORECASE):
+        if not binary:
+            with open(filename, "rb") as f:
+                binary = f.read()
+        from app.core.rag.app.picture import chunk as parser
+        return parser(filename, binary, lang=lang, callback=callback, vision_model=vision_model, **kwargs)
+
+    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        if not binary:
+            with open(filename, "rb") as f:
+                binary = f.read()
+        excel_parser = ExcelParser()
+        if parser_config.get("html4excel"):
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+        else:
+            sections = [(_, "") for _ in excel_parser(binary) if _]
+        parser_config["chunk_token_num"] = 12800
+
+    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = TxtParser()(filename, binary,
+                               parser_config.get("chunk_token_num", 128),
+                               parser_config.get("delimiter", "\n!?;。；！？"))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。；！？"))
+
+        if vision_model:
+            # Process images for each section
+            section_images = []
+            for idx, (section_text, _) in enumerate(sections):
+                images = markdown_parser.get_pictures(section_text) if section_text else None
+
+                if images:
+                    # If multiple images found, combine them using concat_img
+                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                    section_images.append(combined_image)
+                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
+                    boosted_figures = markdown_vision_parser(callback=callback)
+                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1][0] for fig in boosted_figures]), sections[idx][1])
+                else:
+                    section_images.append(None)
+
+        else:
+            logging.warning("No visual model detected. Skipping figure parsing enhancement.")
+
+        if parser_config.get("hyperlink_urls", False) and is_root:
+            for idx, (section_text, _) in enumerate(sections):
+                soup = markdown_parser.md_to_html(section_text)
+                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
+                urls.update(hyperlink_urls)
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
+        sections = HtmlParser()(filename, binary, chunk_token_num)
+        sections = [(_, "") for _ in sections if _]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
+        sections = JsonParser(chunk_token_num)(filename)
+        sections = [(_, "") for _ in sections if _]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+
+        try:
+            import tika
+            os.environ['TIKA_SERVER_JAR'] = "/tmp/tika-server.jar"
+            os.environ['TIKA_SERVER_PORT'] = '9998'
+            # java11 Initialize Tika 3.1.0.jar  service url：http://localhost:9998  view process：lsof -i :9998
+            tika.initVM()
+            from tika import parser as tika_parser
+        except Exception as e:
+            callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.")
+            logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.")
+            return []
+
+        doc_parsed = tika_parser.from_file(filename)
+        if doc_parsed.get('content', None) is not None:
+            sections = doc_parsed['content'].split('\n')
+            sections = [(_, "") for _ in sections if _]
+            callback(0.8, "Finish parsing.")
+        else:
+            callback(0.8, f"tika.parser got empty content from {filename}.")
+            logging.warning(f"tika.parser got empty content from {filename}.")
+            return []
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
+
+    st = timer()
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None
+
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                        int(parser_config.get(
+                                            "chunk_token_num", 128)), parser_config.get(
+                                            "delimiter", "\n!?。；！？"))
+        if kwargs.get("section_only", False):
+            chunks.extend(embed_res)
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。；！？"))
+        if kwargs.get("section_only", False):
+            chunks.extend(embed_res)
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+
+    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
+        for index, url in enumerate(urls):
+            html_bytes, metadata = extract_html(url)
+            if not html_bytes:
+                continue
+            try:
+                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+            except Exception as e:
+                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
+                sub_url_res = chunk(f"{index}.html", html_bytes, lang=lang, callback=callback, vision_model=vision_model, is_root=False, **kwargs)
+            url_res.extend(sub_url_res)
+
+    logging.info("naive_merge({}): {}".format(filename, timer() - st))
+
+    if embed_res:
+        res.extend(embed_res)
+    if url_res:
+        res.extend(url_res)
+    return res
+
+
+if __name__ == "__main__":
+    # Manual smoke test: chunk one local file and print every resulting chunk.
+    # Usage: DASHSCOPE_API_KEY=... python naive.py [path/to/file]
+    import os
+    import sys
+
+    # Credentials must never live in source control, so the key is read from
+    # the environment. The key that used to be hard-coded here is exposed in
+    # the commit history and should be revoked.
+    api_key = os.environ.get("DASHSCOPE_API_KEY")
+    if not api_key:
+        raise SystemExit("DASHSCOPE_API_KEY is not set; export it before running this script.")
+
+    # Prepare to configure vision_model information
+    vision_model = QWenCV(
+        key=api_key,
+        model_name="qwen-vl-max",
+        lang="chinese",  # use Chinese by default
+        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
+    )
+
+    def progress_callback(prog=None, msg=None):
+        print(f"prog: {prog} msg: {msg}\n")
+
+    # The file to chunk comes from the first command-line argument; the
+    # author's original sample path is kept as the fallback so the previous
+    # behaviour is unchanged when no argument is given.
+    default_path = "/Users/sbtjfdn/Downloads/记忆科学/files/义务教育教科书·中国历史七年级上册 (2)_Compressed.md"
+    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
+    res = chunk(filename=file_path,
+                from_page=0,
+                to_page=10,
+                callback=progress_callback,
+                vision_model=vision_model,
+                parser_config={
+                    "layout_recognize": "DeepDOC",
+                    "chunk_token_num": 128,
+                    "delimiter": "\n",
+                    "analyze_hyperlink": True,
+                    "auto_keywords": 0,
+                    "auto_questions": 0,
+                    "html4excel": "false"
+                },
+                is_root=False)
+    for index, item in enumerate(res):
+        print(f"Index: {index}\n----")
+        print(item)
+        print("----")
--- a/api/app/core/rag/app/one.py
+++ b/api/app/core/rag/app/one.py
@@ -0,0 +1,149 @@
+import logging
+from io import BytesIO
+import re
+
+from app.core.rag.deepdoc.parser.utils import get_text
+from . import naive
+from app.core.rag.nlp import rag_tokenizer, tokenize
+from app.core.rag.deepdoc.parser import PdfParser, ExcelParser, HtmlParser
+from app.core.rag.deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
+from app.core.rag.app.naive import by_plaintext, PARSERS
+
+class Pdf(PdfParser):
+    """PdfParser variant that returns every text box in position order, plus the extracted tables."""
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        """
+        Run OCR, layout analysis, table analysis and text merging on a PDF.
+
+        ``binary`` is parsed when it is truthy, otherwise ``filename`` is used.
+        ``callback`` is invoked unconditionally to report progress, so a
+        callable has to be supplied despite the ``None`` default.
+
+        Returns a ``(sections, tables)`` pair: ``sections`` is a list of
+        ``(text, "")`` tuples ordered by each box's position, and ``tables``
+        is the result of ``_extract_table_figure``.
+        """
+        from timeit import default_timer as timer
+
+        def _position_key(section):
+            # Order on fields 0, 3 and 1 of the box's first position entry.
+            first = section[-1][0]
+            return first[0], first[3], first[1]
+
+        source = binary if binary else filename
+
+        tick = timer()
+        callback(msg="OCR started")
+        self.__images__(source, zoomin, from_page, to_page, callback)
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - tick))
+
+        tick = timer()
+        self._layouts_rec(zoomin, drop=False)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - tick))
+        logging.debug("layouts cost: {}s".format(timer() - tick))
+
+        tick = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - tick))
+
+        tick = timer()
+        self._text_merge()
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - tick))
+        tables = self._extract_table_figure(True, zoomin, True, True)
+        self._concat_downward()
+
+        # Pair every box's text with its position, then sort in place.
+        positioned = [(box["text"], self.get_position(box, zoomin))
+                      for box in self.boxes]
+        positioned.sort(key=_position_key)
+        return [(text, "") for text, _ in positioned], tables
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, excel, txt/markdown, html and doc.
+        One file forms a chunk which maintains original text order.
+
+        Args:
+            filename: File name; its extension selects the parser and the name
+                is stored in the result as "docnm_kwd".
+            binary: Raw file content handed to the selected parser.
+            from_page, to_page: Page range forwarded to the PDF parser.
+            lang: Compared case-insensitively with "english"; the result is
+                passed to tokenize() as the English flag.
+            callback: Progress reporter called as callback(prog, msg). It is
+                invoked unconditionally, so callers must supply one.
+            **kwargs: May carry "parser_config"; also forwarded to the docx
+                figure wrapper and to the PDF parser.
+
+        Returns:
+            A one-element list holding the tokenized document, or [] when the
+            PDF parser yields neither sections nor tables, or when a .doc file
+            cannot be parsed.
+
+        Raises:
+            NotImplementedError: If the file extension is not supported.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    eng = lang.lower() == "english"  # is_english(cks)
+
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections, tbls = naive.Docx()(filename, binary)
+        tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
+        sections = [s for s, _ in sections if s]
+        # Table HTML is appended after the body text.
+        for (_, html), _ in tbls:
+            sections.append(html)
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+        # A boolean setting is mapped onto the two named recognizers.
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        pdf_parse = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        sections, tbls, _ = pdf_parse(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections and not tbls:
+            return []
+
+        # Kept for backward compatibility: this writes into the caller's
+        # parser_config even though this function never reads the value back.
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        callback(0.8, "Finish parsing.")
+
+        for (_, rows), poss in tbls:
+            if not rows:
+                continue
+            sections.append((rows if isinstance(rows, str) else rows[0],
+                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+        sections = [s for s, _ in sections if s]
+
+    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = ExcelParser()
+        sections = excel_parser.html(binary, 1000000000)
+        # Report completion like every other branch does.
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = [s for s in txt.split("\n") if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        # tika is imported lazily under its own name. The previous code called
+        # `parser.from_buffer(...)`, but `parser` was a local of this function
+        # bound only in the PDF branch, so every .doc file raised
+        # UnboundLocalError.
+        try:
+            from tika import parser as tika_parser
+        except Exception as e:
+            callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.")
+            logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.")
+            return []
+
+        if binary:
+            doc_parsed = tika_parser.from_buffer(BytesIO(binary))
+        else:
+            # No in-memory content was given; let tika read the file itself.
+            doc_parsed = tika_parser.from_file(filename)
+
+        content = doc_parsed.get('content')
+        if content is None:
+            callback(0.8, f"tika.parser got empty content from {filename}.")
+            logging.warning(f"tika.parser got empty content from {filename}.")
+            return []
+        sections = [s for s in content.split('\n') if s]
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    # The whole file becomes a single chunk.
+    tokenize(doc, "\n".join(sections), eng)
+    return [doc]
+
+
+if __name__ == "__main__":
+    # Command-line smoke test: chunk the file named by the first argument with
+    # the page range 0-10, discarding all progress reports.
+    import sys
+
+    def _ignore_progress(prog=None, msg=""):
+        """Progress callback that deliberately does nothing."""
+
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=_ignore_progress)
--- a/api/app/core/rag/app/paper.py
+++ b/api/app/core/rag/app/paper.py
@@ -0,0 +1,284 @@
+import logging
+import copy
+import re
+
+from app.core.rag.deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
+from app.core.rag.common.constants import ParserType
+from app.core.rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from app.core.rag.deepdoc.parser import PdfParser, PlainParser
+import numpy as np
+
+class Pdf(PdfParser):
+    """PDF parser for papers: picks out a title, authors, an abstract, body sections and tables."""
+
+    def __init__(self):
+        # Assigned before the base-class initializer runs.
+        # NOTE(review): presumably PdfParser looks this attribute up under this
+        # exact (misspelled) name - confirm before renaming it.
+        self.model_speciess = ParserType.PAPER.value
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        """
+        Parse a paper and split it into title, authors, abstract, sections and tables.
+
+        ``binary`` is parsed when it is truthy, otherwise ``filename`` is used.
+        ``callback`` is invoked unconditionally to report progress, so a
+        callable has to be supplied despite the ``None`` default.
+
+        Returns a dict with the keys "title", "authors", "abstract",
+        "sections" (a list of ``(text + line tag, layoutno)`` tuples) and
+        "tables". When ``from_page`` is greater than 0 the title, authors and
+        abstract are returned as empty strings.
+        """
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug(f"layouts cost: {timer() - start}s")
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        # Median box width, measured before the boxes are concatenated downward.
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward()
+        self._filter_forpages()
+        callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
+
+        # clean mess
+        # A median box narrower than half of the first page's width (page image
+        # width divided by zoomin) is treated as a two-column layout, and the
+        # boxes are re-sorted per page.
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            logging.debug("two_column................... {} {}".format(column_width,
+                  self.page_images[0].size[0] / zoomin / 2))
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        # Collapse runs of two or more tabs, spaces or full-width spaces into one space.
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())
+
+        def _begin(txt):
+            # Matches text that starts (after optional numbering characters)
+            # with a section keyword such as "introduction" or "abstract".
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        # For page ranges that do not start at page 0, no title, authors or
+        # abstract are looked for; every text/title box becomes a section.
+        if from_page > 0:
+            return {
+                "title": "",
+                "authors": "",
+                "abstract": "",
+                "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
+                             re.match(r"(text|title)", b.get("layoutno", "text"))],
+                "tables": tbls
+            }
+        # get title and authors
+        title = ""
+        authors = []
+        i = 0
+        # Look at no more than the first 32 boxes and never at the last box,
+        # so that self.boxes[i] still exists after the increment below.
+        while i < min(32, len(self.boxes)-1):
+            b = self.boxes[i]
+            i += 1
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                # A "title" that is really a section heading is discarded.
+                if _begin(title):
+                    title = ""
+                    break
+                # NOTE(review): both paths of this loop break on the first
+                # iteration, so only j == 0 is ever used and at most one box
+                # (the one right after the title) is taken as authors.
+                for j in range(3):
+                    if _begin(self.boxes[i + j]["text"]):
+                        break
+                    authors.append(self.boxes[i + j]["text"])
+                    break
+                break
+        # get abstract
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            # The abstract is taken from this lower-cased, stripped text.
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                # Long enough to be the abstract itself (heading and body in one box).
+                if len(txt.split()) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(b, zoomin)
+                    break
+                # Otherwise this box is only the heading; try the next box.
+                txt = self.boxes[i]["text"].lower().strip()
+                if len(txt.split()) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i], zoomin)
+                i += 1
+                break
+        # Without an abstract, sections start from the first box again;
+        # otherwise `i` is left just past the boxes consumed above.
+        if not abstr:
+            i = 0
+
+        callback(
+            0.8, "Page {}~{}: Text merging finished".format(
+                from_page, min(
+                    to_page, self.total_page)))
+        for b in self.boxes:
+            logging.debug("{} {}".format(b["text"], b.get("layoutno")))
+        logging.debug("{}".format(tbls))
+
+        return {
+            "title": title,
+            "authors": " ".join(authors),
+            "abstract": abstr,
+            "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                         re.match(r"(text|title)", b.get("layoutno", "text"))],
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Only pdf is supported.
+        The abstract, when one was extracted, is kept whole as a chunk of its own.
+        The remaining sections are grouped by a section id derived from their
+        title levels, and each group becomes one chunk.
+    """
+    default_config = {
+        "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"}
+    parser_config = kwargs.get("parser_config", default_config)
+
+    if not re.search(r"\.pdf$", filename, re.IGNORECASE):
+        raise NotImplementedError("file type not supported yet(pdf supported)")
+
+    source = binary if binary else filename
+    if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+        pdf_parser = PlainParser()
+        plain_sections = pdf_parser(source, from_page=from_page, to_page=to_page)[0]
+        paper = {
+            "title": filename,
+            "authors": " ",
+            "abstract": "",
+            "sections": plain_sections,
+            "tables": []
+        }
+    else:
+        pdf_parser = Pdf()
+        paper = pdf_parser(source, from_page=from_page, to_page=to_page, callback=callback)
+    paper["tables"] = vision_figure_parser_pdf_wrapper(
+        tbls=paper["tables"], callback=callback, **kwargs)
+
+    # Base document shared (via deep copy) by the abstract chunk and by every section chunk.
+    authors_tks = rag_tokenizer.tokenize(paper["authors"])
+    title_tks = rag_tokenizer.tokenize(paper["title"] or filename)
+    doc = {"docnm_kwd": filename, "authors_tks": authors_tks, "title_tks": title_tks}
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
+    # is it English
+    eng = lang.lower() == "english"  # pdf_parser.is_english
+    logging.debug("It's English.....{}".format(eng))
+
+    res = tokenize_table(paper["tables"], doc, eng)
+
+    abstract = paper["abstract"]
+    if abstract:
+        abstract_doc = copy.deepcopy(doc)
+        plain_abstract = pdf_parser.remove_tag(abstract)
+        abstract_doc["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        abstract_doc["important_tks"] = " ".join(abstract_doc["important_kwd"])
+        abstract_doc["image"], poss = pdf_parser.crop(abstract, need_position=True)
+        add_positions(abstract_doc, poss)
+        tokenize(abstract_doc, plain_abstract, eng)
+        res.append(abstract_doc)
+
+    sorted_sections = paper["sections"]
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    bull = bullets_category([text for text, _ in sorted_sections])
+    most_level, levels = title_frequency(bull, sorted_sections)
+    assert len(sorted_sections) == len(levels)
+
+    # A new section id starts wherever the level is at or above the pivot
+    # level and differs from the level of the previous section.
+    sec_ids = []
+    current_id = 0
+    for idx, level in enumerate(levels):
+        starts_new_section = idx > 0 and level <= most_level and level != levels[idx - 1]
+        if starts_new_section:
+            current_id += 1
+        sec_ids.append(current_id)
+        logging.debug("{} {} {} {}".format(level, sorted_sections[idx][0], most_level, current_id))
+
+    # Consecutive sections sharing a section id are joined with newlines.
+    chunks = []
+    last_sid = -2
+    for (text, _), sec_id in zip(sorted_sections, sec_ids):
+        if chunks and sec_id == last_sid:
+            chunks[-1] = chunks[-1] + "\n" + text
+        else:
+            chunks.append(text)
+            last_sid = sec_id
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    return res
+
+
+"""
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in ":：":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。？；！]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"], poss = pdf_parser.crop(p, need_position=True)
+            add_positions(d, poss)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"], poss = pdf_parser.crop(ck, need_position=True)
+        add_positions(d, poss)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+"""
+
+if __name__ == "__main__":
+    # Command-line smoke test: chunk the PDF named by the first argument,
+    # discarding all progress reports.
+    import sys
+
+    def _ignore_progress(prog=None, msg=""):
+        """Progress callback that deliberately does nothing."""
+
+    chunk(sys.argv[1], callback=_ignore_progress)
--- a/api/app/core/rag/app/picture.py
+++ b/api/app/core/rag/app/picture.py
@@ -0,0 +1,96 @@
+import io
+import re
+
+import numpy as np
+from PIL import Image
+
+from app.core.rag.deepdoc.vision import OCR
+from app.core.rag.nlp import rag_tokenizer, tokenize
+from app.core.rag.common.string_utils import clean_markdown_block
+
+# OCR engine created once at import time; chunk() below runs it on every image.
+ocr = OCR()
+
+# File-name extensions that chunk() treats as video input.
+# (Originally labelled "Gemini supported MIME types"; the entries are file
+# extensions, not MIME types.)
+VIDEO_EXTS = [".mp4", ".mov", ".avi", ".flv", ".mpeg", ".mpg", ".webm", ".wmv", ".3gp", ".3gpp", ".mkv"]
+
+
+def chunk(filename, binary, lang, callback=None, vision_model=None, **kwargs):
+    """
+    Turn one image or video file into a single tokenized chunk.
+
+    Videos (by file extension, see VIDEO_EXTS) are described by
+    ``vision_model.chat``. Images are OCR'ed first; only when the OCR text is
+    short is ``vision_model.describe`` asked for a description, which is
+    appended to the OCR text.
+
+    Args:
+        filename: File name; stored as "docnm_kwd" and used to detect videos.
+        binary: Raw bytes of the image or video.
+        lang: Compared case-insensitively with "english"; the result is passed
+            to tokenize() as the English flag.
+        callback: Progress reporter called as callback(prog, msg). It is
+            invoked unconditionally, so callers must supply one.
+        vision_model: Model object providing ``chat`` (videos) and
+            ``describe`` (images).
+
+    Returns:
+        A one-element list with the document, or [] when the vision model
+        call failed (the error is reported through ``callback`` with prog=-1).
+    """
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    eng = lang.lower() == "english"
+
+    if filename.lower().endswith(tuple(VIDEO_EXTS)):
+        try:
+            doc.update({"doc_type_kwd": "video"})
+            ans = vision_model.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
+            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
+            # Index the description exactly once. The previous
+            # `ans += "\n" + ans` appended the answer to itself, so every
+            # video chunk contained its text twice.
+            tokenize(doc, ans, eng)
+            return [doc]
+        except Exception as e:
+            callback(prog=-1, msg=str(e))
+    else:
+        img = Image.open(io.BytesIO(binary)).convert("RGB")
+        doc.update(
+            {
+                "image": img,
+                "doc_type_kwd": "image",
+            }
+        )
+        bxs = ocr(np.array(img))
+        txt = "\n".join([t[0] for _, t in bxs if t[0]])
+        callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
+        # Enough OCR text: skip the vision model.
+        # NOTE(review): `len(txt) > 32` applies to English too, which makes the
+        # word-count test redundant in practice - confirm whether the
+        # character limit was meant for non-English text only.
+        if (eng and len(txt.split()) > 32) or len(txt) > 32:
+            tokenize(doc, txt, eng)
+            callback(0.8, "OCR results is too long to use CV LLM.")
+            return [doc]
+
+        try:
+            callback(0.4, "Use CV LLM to describe the picture.")
+            img_binary = io.BytesIO()
+            img.save(img_binary, format="JPEG")
+            img_binary.seek(0)
+            ans = vision_model.describe(img_binary.read())
+            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
+            txt += "\n" + ans
+            tokenize(doc, txt, eng)
+            return [doc]
+        except Exception as e:
+            callback(prog=-1, msg=str(e))
+
+    return []
+
+
+def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
+    """
+    Ask a vision model to describe an image and return the cleaned answer.
+
+    ``binary`` must offer a ``save(buffer, format=...)`` method; it is encoded
+    as JPEG, falling back to PNG when the JPEG save raises.
+
+    Returns:
+        A newline followed by the cleaned model output, or "" when anything
+        failed (the error is then reported as ``callback(-1, message)``).
+    """
+    def _discard(prog, msg):
+        return None
+
+    report = callback if callback else _discard
+
+    try:
+        with io.BytesIO() as buffer:
+            try:
+                binary.save(buffer, format="JPEG")
+            except Exception:
+                # Drop whatever the failed JPEG attempt wrote before retrying.
+                buffer.seek(0)
+                buffer.truncate()
+                binary.save(buffer, format="PNG")
+
+            payload = buffer.getvalue()
+            description, _ = vision_model.describe_with_prompt(payload, prompt)
+            return "\n" + clean_markdown_block(description)
+
+    except Exception as e:
+        report(-1, str(e))
+
+    return ""
--- a/api/app/core/rag/app/presentation.py
+++ b/api/app/core/rag/app/presentation.py
@@ -0,0 +1,164 @@
+import copy
+import re
+from io import BytesIO
+from PIL import Image
+
+from app.core.rag.nlp import tokenize, is_english
+from app.core.rag.nlp import rag_tokenizer
+from app.core.rag.deepdoc.parser import PdfParser, PptParser, PlainParser
+from PyPDF2 import PdfReader as pdf2_read
+from app.core.rag.app.naive import by_plaintext, PARSERS
+
+class Ppt(PptParser):
+    """PptParser variant that pairs every slide's text with a thumbnail image."""
+
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        """
+        Extract text and a JPEG thumbnail for the slides in [from_page, to_page).
+
+        Args:
+            fnm: Raw bytes of the presentation (it is wrapped in BytesIO for
+                the thumbnail pass).
+            from_page, to_page: Slice bounds applied to the slide list.
+            callback: Progress reporter called as callback(prog, msg). It is
+                invoked unconditionally, so callers must supply one.
+
+        Returns:
+            A list of ``(text, PIL image)`` tuples, one per slide.
+
+        Raises:
+            RuntimeError: If rendering a slide thumbnail fails.
+            AssertionError: If the number of texts and thumbnails differ.
+        """
+        txts = super().__call__(fnm, from_page, to_page)
+
+        callback(0.5, "Text extraction finished.")
+        import aspose.slides as slides
+        import aspose.pydrawing as drawing
+        imgs = []
+        with slides.Presentation(BytesIO(fnm)) as presentation:
+            for i, slide in enumerate(presentation.slides[from_page: to_page]):
+                try:
+                    with BytesIO() as buffered:
+                        slide.get_thumbnail(
+                            0.1, 0.1).save(
+                            buffered, drawing.imaging.ImageFormat.jpeg)
+                        buffered.seek(0)
+                        # copy() detaches the image from the buffer that is about to close.
+                        imgs.append(Image.open(buffered).copy())
+                except RuntimeError as e:
+                    # `i` is relative to the slice, so add from_page to report
+                    # the slide's real (1-based) number.
+                    raise RuntimeError(
+                        f'ppt parse error at page {from_page + i + 1}, original error: {str(e)}') from e
+        assert len(imgs) == len(
+            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        callback(0.9, "Image extraction finished")
+        self.is_english = is_english(txts)
+        return list(zip(txts, imgs))
+
+class Pdf(PdfParser):
+    """PdfParser variant that yields one (text, page image) pair per page."""
+
+    def __init__(self):
+        super().__init__()
+
+    def __garbage(self, txt):
+        """Return True for text made only of digits and . , % / - characters, or shorter than 3 characters."""
+        cleaned = txt.lower().strip()
+        numeric_noise = re.match(r"[0-9\.,%/-]+$", cleaned) is not None
+        return numeric_noise or len(cleaned) < 3
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        """
+        OCR the requested pages and join each page's useful lines.
+
+        ``binary`` is parsed when it is truthy, otherwise ``filename`` is used.
+        ``callback`` is invoked unconditionally to report progress.
+
+        Returns ``(pages, [])`` where ``pages`` is a list of
+        ``(text, page image)`` tuples.
+        """
+        from timeit import default_timer as timer
+        began = timer()
+        callback(msg="OCR started")
+        source = binary if binary else filename
+        self.__images__(source, zoomin, from_page, to_page, callback)
+        last_page = min(to_page, self.total_page)
+        elapsed = timer() - began
+        callback(msg=f"Page {from_page}~{last_page}: OCR finished ({elapsed:.2f}s)")
+        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
+            len(self.boxes), len(self.page_images))
+
+        pages = []
+        for page_no, page_boxes in enumerate(self.boxes):
+            kept = [box["text"] for box in page_boxes
+                    if not self.__garbage(box["text"])]
+            pages.append(("\n".join(kept), self.page_images[page_no]))
+        callback(0.9, f"Page {from_page}~{min(to_page, self.total_page)}: Parsing finished")
+        return pages, []
+
+
+class PlainPdf(PlainParser):
+    """Plain-text PDF reader: one (text, None) pair per page, no images and no tables."""
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, callback=None, **kwargs):
+        """Extract the text of pages [from_page, to_page) and report progress once at the end."""
+        source = BytesIO(binary) if binary else filename
+        self.pdf = pdf2_read(source)
+        extracted = [page.extract_text()
+                     for page in self.pdf.pages[from_page: to_page]]
+        callback(0.9, "Parsing finished")
+        return [(text, None) for text in extracted], []
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, vision_model=None, parser_config=None, **kwargs):
+    """
+    The supported file formats are pdf, ppt and pptx.
+    Every page will be treated as a chunk. And the thumbnail of every page will be stored.
+    PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
+
+    Args:
+        filename: File name; its extension selects the parser and the name is
+            stored as "docnm_kwd".
+        binary: Raw file content; a presentation is read from ``filename``
+            when this is empty.
+        from_page, to_page: Page (slide) range to process.
+        lang: Compared case-insensitively with "english"; the result is passed
+            to tokenize() as the English flag.
+        callback: Progress reporter called as callback(prog, msg); the parsers
+            invoke it unconditionally, so callers must supply one.
+        vision_model: Forwarded to the selected PDF parser.
+        parser_config: Optional settings dict; "layout_recognize" selects the
+            PDF parser.
+
+    Returns:
+        A list with one document dict per page/slide ([] when a PDF yields no
+        sections).
+
+    Raises:
+        NotImplementedError: If the file extension is not supported.
+    """
+    if parser_config is None:
+        parser_config = {}
+    eng = lang.lower() == "english"
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    res = []
+    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
+        if not binary:
+            with open(filename, "rb") as f:
+                binary = f.read()
+        ppt_parser = Ppt()
+        # Honor the caller's to_page. It used to be replaced by a hard-coded
+        # 1000000, so a bounded page range still returned every slide from
+        # from_page to the end of the deck.
+        slides = ppt_parser(binary, from_page, to_page, callback)
+        for pn, (txt, img) in enumerate(slides, start=from_page):
+            d = copy.deepcopy(doc)
+            d["image"] = img
+            d["doc_type_kwd"] = "image"
+            d["page_num_int"] = [pn + 1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, eng)
+            res.append(d)
+        return res
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+        # A boolean setting is mapped onto the two named recognizers.
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        sections, _, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            vision_model=vision_model,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections:
+            return []
+
+        # Kept for backward compatibility: this writes into the caller's
+        # parser_config even though this function never reads the value back.
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        callback(0.8, "Finish parsing.")
+
+        for pn, (txt, img) in enumerate(sections, start=from_page):
+            d = copy.deepcopy(doc)
+            if img:
+                d["image"] = img
+            d["page_num_int"] = [pn + 1]
+            d["top_int"] = [0]
+            # Without a page image the chunk's extent is recorded as 0 x 0.
+            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            tokenize(d, txt, eng)
+            res.append(d)
+        return res
+
+    raise NotImplementedError(
+        "file type not supported yet(pptx, pdf supported)")
+
+
+if __name__ == "__main__":
+    # Command-line smoke test: chunk the file named by the first argument,
+    # discarding all progress reports.
+    import sys
+
+    # The parsers report progress with keyword arguments (for example
+    # callback(msg="OCR started")), so the no-op callback must accept `prog`
+    # and `msg` by name; the old dummy(a, b) raised TypeError on such calls.
+    def dummy(prog=None, msg=""):
+        pass
+    chunk(sys.argv[1], callback=dummy)
--- a/api/app/core/rag/app/qa.py
+++ b/api/app/core/rag/app/qa.py
@@ -0,0 +1,455 @@
+import logging
+import re
+import csv
+from copy import deepcopy
+from io import BytesIO
+from timeit import default_timer as timer
+from openpyxl import load_workbook
+
+from app.core.rag.deepdoc.parser.utils import get_text
+from app.core.rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
+from app.core.rag.nlp import rag_tokenizer, tokenize_table, concat_img
+from app.core.rag.deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from docx import Document
+from PIL import Image
+from markdown import markdown
+
+from app.core.rag.common.float_utils import get_float
+
+
+class Excel(ExcelParser):
+    """Extract question/answer pairs from every sheet of an Excel workbook."""
+
+    def __call__(self, fnm, binary=None, callback=None):
+        """Read the workbook and return the (question, answer) pairs it holds.
+
+        In each row the first non-empty cell is the question and the second
+        non-empty cell is the answer; any further cells are ignored. A cell is
+        empty only when its value is ``None`` or ``""``, so numeric ``0`` and
+        ``False`` count as content. Rows lacking either part are recorded as
+        failures (by 1-based row number within their sheet) and skipped.
+
+        Args:
+            fnm: Workbook path; used only when ``binary`` is falsy.
+            binary: Raw workbook bytes; takes precedence over ``fnm``.
+            callback: Progress reporter invoked as ``callback(progress, msg)``.
+                It is called unconditionally, so callers must provide one.
+
+        Returns:
+            List of ``(question, answer)`` string tuples in sheet/row order.
+            ``self.is_english`` is set from a sample of up to 30 questions.
+        """
+        if not binary:
+            wb = load_workbook(fnm)
+        else:
+            wb = load_workbook(BytesIO(binary))
+        total = 0
+        for sheetname in wb.sheetnames:
+            total += len(list(wb[sheetname].rows))
+
+        res, fails = [], []
+
+        def progress_msg():
+            # Same wording for intermediate and final reports; the ". "
+            # separator keeps the pair count from running into the failure count.
+            msg = "Extract pairs: {}. ".format(len(res))
+            if fails:
+                msg += "{} failure, line: {}...".format(len(fails), ",".join(fails[:3]))
+            return msg
+
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            rows = list(ws.rows)
+            for i, r in enumerate(rows):
+                q, a = "", ""
+                for cell in r:
+                    # Only truly empty cells are skipped; a plain truthiness
+                    # test would also discard legitimate values such as 0.
+                    if cell.value is None or cell.value == "":
+                        continue
+                    if not q:
+                        q = str(cell.value)
+                    elif not a:
+                        a = str(cell.value)
+                    else:
+                        break
+                if q and a:
+                    res.append((q, a))
+                else:
+                    fails.append(str(i + 1))
+                if len(res) % 999 == 0:
+                    callback(len(res) * 0.6 / total, progress_msg())
+
+        callback(0.6, progress_msg())
+        self.is_english = is_english(
+            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
+        return res
+
+
+class Pdf(PdfParser):
+    """PDF parser that groups text boxes into bullet-delimited Q&A pairs."""
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        """Parse the PDF and return its question/answer pairs and tables.
+
+        Args:
+            filename: Source passed to the image/OCR step when ``binary`` is falsy.
+            binary: Alternative source; used instead of ``filename`` when truthy.
+            from_page, to_page: Page range forwarded to the image/OCR step.
+            zoomin: Zoom factor forwarded to the OCR, layout and table steps.
+            callback: Progress reporter; called both positionally
+                (``callback(0.63, "...")``) and by keyword (``callback(msg=...)``).
+
+        Returns:
+            ``(qai_list, tbls)`` where ``qai_list`` holds
+            ``(question, answer, image, positions)`` tuples and ``tbls`` is the
+            sorted result of ``_extract_table_figure``.
+
+        Raises:
+            ValueError: if ``qbullets_category`` reports -1, i.e. no question
+                bullet pattern was recognised.
+        """
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
+        start = timer()
+        self._layouts_rec(zoomin, drop=False)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        #self._naive_vertical_merge()
+        # self._concat_downward()
+        #self._filter_forpages()
+        logging.debug("layouts: {}".format(timer() - start))
+        sections = [b["text"] for b in self.boxes]
+        bull_x0_list = []
+        # q_bull is only checked against -1 here; reg is handed to has_qbullet below.
+        q_bull, reg = qbullets_category(sections)
+        if q_bull == -1:
+            raise ValueError("Unable to recognize Q&A structure.")
+        qai_list = []
+        # Accumulators for the pair currently being assembled: question text,
+        # answer text, and the concatenated position tags later given to crop().
+        last_q, last_a, last_tag = '', '', ''
+        last_index = -1
+        last_box = {'text':''}
+        last_bull = None
+        def sort_key(element):
+            # Order by the same two fields get_tbls_info reads as page number and top.
+            tbls_pn = element[1][0][0]
+            tbls_top = element[1][0][3]
+            return tbls_pn, tbls_top
+        tbls.sort(key=sort_key)
+        # Cursor into the sorted tbls list; only ever moves forward.
+        tbl_index = 0
+        # Page number and bottom coordinate of the previously processed box.
+        last_pn, last_bottom = 0, 0
+        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
+        for box in self.boxes:
+            section, line_tag = box['text'], self._line_tag(box, zoomin)
+            # has_bull is used as a regex match object below (.group()/.span()); falsy when absent.
+            has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
+            last_box, last_index, last_bull = box, index, has_bull
+            # The tag is read as tab-separated fields wrapped in '@@'...'##':
+            # field 0 is used as the page number, field 3 as top, field 4 as bottom.
+            line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
+            line_top = get_float(line_tag.rstrip('##').split('\t')[3])
+            tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+            if not has_bull:  # No question bullet
+                if not last_q:
+                    if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top):    # image passed
+                        tbl_index += 1
+                    # Text before the first question is dropped; this `continue`
+                    # also skips the last_bottom/last_pn update at the loop end.
+                    continue
+                else:
+                    sum_tag = line_tag
+                    sum_section = section
+                    # Take every table located after the previous box and not
+                    # below the current one, prepending it to this line.
+                    while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
+                        and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):    # add image at the middle of current answer
+                        sum_tag = f'{tbl_tag}{sum_tag}'
+                        sum_section = f'{tbl_text}{sum_section}'
+                        tbl_index += 1
+                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+                    last_a = f'{last_a}{sum_section}'
+                    last_tag = f'{last_tag}{sum_tag}'
+            else:
+                if last_q:
+                    # A new question starts: first attach tables that sit between
+                    # the previous box and this one to the answer being closed.
+                    while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
+                        and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):    # add image at the end of last answer
+                        last_tag = f'{last_tag}{tbl_tag}'
+                        last_a = f'{last_a}{tbl_text}'
+                        tbl_index += 1
+                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+                    image, poss = self.crop(last_tag, need_position=True)
+                    qai_list.append((last_q, last_a, image, poss))
+                    last_q, last_a, last_tag = '', '', ''
+                # The matched bullet text is the question; the rest of the line opens the answer.
+                last_q = has_bull.group()
+                _, end = has_bull.span()
+                last_a = section[end:]
+                last_tag = line_tag
+            last_bottom = float(line_tag.rstrip('##').split('\t')[4])
+            last_pn = line_pn
+        # Flush the pair that was still open when the boxes ran out.
+        if last_q:
+            qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
+        return qai_list, tbls
+
+    def get_tbls_info(self, tbls, tbl_index):
+        """Return position data, a position tag and the text of ``tbls[tbl_index]``.
+
+        Returns a 7-tuple ``(page, left, right, top, bottom, tag, text)``. When
+        ``tbl_index`` is past the end of ``tbls`` a fixed placeholder is returned
+        (page 1, zero coordinates, an all-zero tag, empty text).
+        """
+        if tbl_index >= len(tbls):
+            return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
+        # Stored page number is incremented by one — presumably 0-based to 1-based; TODO confirm.
+        tbl_pn = tbls[tbl_index][1][0][0]+1
+        tbl_left = tbls[tbl_index][1][0][1]
+        tbl_right = tbls[tbl_index][1][0][2]
+        tbl_top = tbls[tbl_index][1][0][3]
+        tbl_bottom = tbls[tbl_index][1][0][4]
+        tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
+        _tbl_text = ''.join(tbls[tbl_index][0][1])
+        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
+
+
+class Docx(DocxParser):
+    """Turn a .docx file into (question, answer, image) triples plus HTML tables."""
+
+    def __init__(self):
+        # No state is set up here and the parent initializer is not called.
+        pass
+
+    def get_picture(self, document, paragraph):
+        """Return the first picture embedded in ``paragraph`` as an RGB image.
+
+        Returns ``None`` when the paragraph has no ``pic:pic`` element.
+        """
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
+        """Extract Q&A triples and tables from the document.
+
+        Paragraphs for which ``docx_question_level`` reports a level from 1 to
+        6 are questions; everything else is appended to the current answer
+        along with any picture found in the paragraph. Nested questions are
+        kept on a stack, and the question emitted for an answer is the stack
+        joined with newlines.
+
+        Args:
+            filename: Document path; used only when ``binary`` is falsy.
+            binary: Raw document bytes; takes precedence over ``filename``.
+            from_page, to_page: Page window. Pages are counted from the page
+                break markers found in the paragraph runs.
+            callback: Accepted for interface parity; not used by this method.
+
+        Returns:
+            ``(qai_list, tbls)`` where ``qai_list`` holds
+            ``(question, answer, image)`` tuples and ``tbls`` holds
+            ``((None, html), "")`` entries, one per table.
+        """
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []
+        qai_list = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = concat_img(last_image, current_image)
+            else:   # is a question
+                if last_answer or last_image:
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        qai_list.append((sum_question, last_answer, last_image))
+                    last_answer, last_image = '', None
+
+                # Drop questions at the same or a deeper level so the stack
+                # always describes the path down to the new question.
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                qai_list.append((sum_question, last_answer, last_image))
+
+        tbls = []
+        for tb in self.doc.tables:
+            html= "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    # Fold a run of ADJACENT cells with identical text into one
+                    # cell with a colspan. The scan must stop at the first
+                    # different cell; otherwise a later, non-adjacent duplicate
+                    # would swallow the distinct cells in between.
+                    for j in range(i+1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                        else:
+                            break
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return qai_list, tbls
+
+
+def rmPrefix(txt):
+    """Strip surrounding whitespace and one leading Q/A role label from ``txt``.
+
+    A label is one of the listed words (matched case-insensitively) followed
+    by at least one tab, colon, full-width colon or space.
+    """
+    label_pattern = r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:： ]+"
+    trimmed = txt.strip()
+    return re.sub(label_pattern, "", trimmed, flags=re.IGNORECASE)
+
+
+def beAdocPdf(d, q, a, eng, image, poss):
+    """Fill chunk dict ``d`` with a Q&A pair taken from a PDF and return it.
+
+    The stored content is the labelled question and answer joined by a tab
+    (English labels when ``eng`` is truthy, Chinese otherwise). Only the
+    question is tokenized. A truthy ``image`` is attached and the chunk is
+    marked as an image chunk; ``poss`` is handed to ``add_positions``.
+    """
+    if eng:
+        q_label, a_label = "Question: ", "Answer: "
+    else:
+        q_label, a_label = "问题：", "回答："
+    question_part = q_label + rmPrefix(q)
+    answer_part = a_label + rmPrefix(a)
+    d["content_with_weight"] = question_part + "\t" + answer_part
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
+    add_positions(d, poss)
+    return d
+
+
+def beAdocDocx(d, q, a, eng, image, row_num=-1):
+    """Fill chunk dict ``d`` with a Q&A pair taken from a docx file and return it.
+
+    The stored content is the labelled question and answer joined by a tab
+    (English labels when ``eng`` is truthy, Chinese otherwise). Only the
+    question is tokenized. A truthy ``image`` is attached and the chunk is
+    marked as an image chunk. A non-negative ``row_num`` is stored as
+    ``top_int``.
+    """
+    if eng:
+        q_label, a_label = "Question: ", "Answer: "
+    else:
+        q_label, a_label = "问题：", "回答："
+    question_part = q_label + rmPrefix(q)
+    answer_part = a_label + rmPrefix(a)
+    d["content_with_weight"] = question_part + "\t" + answer_part
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
+    if row_num >= 0:
+        d["top_int"] = [row_num]
+    return d
+
+
+def beAdoc(d, q, a, eng, row_num=-1):
+    """Fill chunk dict ``d`` with a plain-text Q&A pair and return it.
+
+    The stored content is the labelled question and answer joined by a tab
+    (English labels when ``eng`` is truthy, Chinese otherwise). Only the
+    question is tokenized. A non-negative ``row_num`` is stored as ``top_int``.
+    """
+    if eng:
+        q_label, a_label = "Question: ", "Answer: "
+    else:
+        q_label, a_label = "问题：", "回答："
+    question_part = q_label + rmPrefix(q)
+    answer_part = a_label + rmPrefix(a)
+    d["content_with_weight"] = question_part + "\t" + answer_part
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+    if row_num >= 0:
+        d["top_int"] = [row_num]
+    return d
+
+
+def mdQuestionLevel(s):
+    """Return ``(level, text)`` for a markdown line.
+
+    ``level`` is the number of leading ``#`` characters (0 when there are
+    none) and ``text`` is the line with those characters and any whitespace
+    that follows them removed.
+    """
+    without_hashes = s.lstrip('#')
+    level = len(s) - len(without_hashes)
+    return level, without_hashes.lstrip()
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+    """
+        Split a Q&A style document into one chunk per question/answer pair.
+
+        Excel, csv, txt, pdf, markdown and docx files are supported.
+        If the file is in excel format, there should be 2 columns, question and answer, without header.
+        And question column is ahead of answer column.
+        And it's O.K if it has multiple sheets as long as the columns are rightly composed.
+
+        If it's in csv/txt format, it should be UTF-8 encoded. Use TAB (or comma) as delimiter to
+        separate question and answer.
+
+        All the deformed lines will be ignored.
+        Every pair of Q&A will be treated as a chunk.
+
+        Args:
+            filename: File name; its extension selects the parser.
+            binary: Optional raw file content, handed to the parsers.
+            from_page, to_page: Page range; forwarded to the pdf parser only.
+            lang: "English" (case-insensitive) selects English Q/A labels, anything else Chinese.
+            callback: Progress reporter, called as ``callback(progress, msg)``; it is
+                invoked unconditionally, so callers must supply one.
+
+        Returns:
+            A list of chunk dicts.
+
+        Raises:
+            NotImplementedError: if the file extension is not supported.
+    """
+    eng = lang.lower() == "english"
+    res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+
+    def extract_msg(count, failed):
+        # The ". " separator keeps the pair count from running into the
+        # failure count (5 pairs and 5 failures used to read "55 failure").
+        msg = "Extract Q&A: {}. ".format(count)
+        if failed:
+            msg += "{} failure, line: {}...".format(len(failed), ",".join(failed[:3]))
+        return msg
+
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
+            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
+        return res
+
+    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        # Pick whichever delimiter splits more lines into exactly two fields.
+        comma, tab = 0, 0
+        for line in lines:
+            if len(line.split(",")) == 2:
+                comma += 1
+            if len(line.split("\t")) == 2:
+                tab += 1
+        delimiter = "\t" if tab >= comma else ","
+
+        fails = []
+        question, answer = "", ""
+        for i, line in enumerate(lines):
+            arr = line.split(delimiter)
+            if len(arr) == 2:
+                # A well-formed line starts a new pair; flush the previous one.
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
+                question, answer = arr
+            elif question:
+                # Malformed line after a question: continuation of the answer.
+                answer += "\n" + line
+            else:
+                fails.append(str(i + 1))
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), extract_msg(len(res), fails))
+
+        if question:
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
+
+        callback(0.6, extract_msg(len(res), fails))
+
+        return res
+
+    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        delimiter = "\t" if any("\t" in line for line in lines) else ","
+
+        fails = []
+        question, answer = "", ""
+        reader = csv.reader(lines, delimiter=delimiter)
+
+        n_rows = 0      # rows consumed so far; used as the row number of the last pair
+        consumed = 0    # physical lines consumed so far, from reader.line_num
+        for i, row in enumerate(reader):
+            n_rows = i + 1
+            # A record may span several physical lines (quoted newlines), so
+            # the raw text of this row is lines[first_line:consumed], which is
+            # not necessarily lines[i].
+            first_line, consumed = consumed, reader.line_num
+            if len(row) == 2:
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
+                question, answer = row
+            elif question:
+                answer += "\n" + "\n".join(lines[first_line:consumed])
+            else:
+                fails.append(str(first_line + 1))
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), extract_msg(len(res), fails))
+
+        if question:
+            # The reader is exhausted at this point, so counting its remaining
+            # rows would always give 0; use the number of rows actually read.
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, n_rows))
+
+        callback(0.6, extract_msg(len(res), fails))
+        return res
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        pdf_parser = Pdf()
+        qai_list, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        for q, a, image, poss in qai_list:
+            res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
+        return res
+
+    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        last_answer = ""
+        question_stack, level_stack = [], []
+        code_block = False
+        for index, line in enumerate(lines):
+            # Lines inside a fenced code block are never treated as headings.
+            if line.strip().startswith('```'):
+                code_block = not code_block
+            question_level, question = 0, ''
+            if not code_block:
+                question_level, question = mdQuestionLevel(line)
+
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{line}'
+            else:   # is a question
+                if last_answer.strip():
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
+                    last_answer = ''
+
+                # Keep only ancestors of the new heading on the stack.
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(question)
+                level_stack.append(question_level)
+        if last_answer.strip():
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
+        return res
+
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        # NOTE(review): the page range is fixed here; the caller's
+        # from_page/to_page are not forwarded — confirm this is intended.
+        qai_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        res = tokenize_table(tbls, doc, eng)
+        for i, (q, a, image) in enumerate(qai_list):
+            res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
+        return res
+
+    raise NotImplementedError(
+        "Excel, csv(txt), pdf, markdown and docx format files are supported.")
+
+
+if __name__ == "__main__":
+    # Manual smoke run: chunk the first ten pages of the file named by the
+    # first command-line argument, discarding all progress notifications.
+    import sys
+
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=lambda prog=None, msg="": None)