From a7b8ba0c660d3b7456f09010375f3cd809f4616e Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Thu, 9 Apr 2026 17:48:16 +0800 Subject: [PATCH] fix(rag): fix pdfplumber concurrency issue and add debug logging MinerUParser now holds a global lock while opening a PDF with pdfplumber and rendering its page images, to prevent concurrent access issues. Additionally, a warning-level log was added to trace knowledge retrieval results for debugging purposes. The knowledge node's match case for HYBRID/Graph retrieval is changed from an OR-pattern to a tuple pattern, which is intended to ensure correct pattern matching behavior. BREAKING CHANGE: MinerUParser now requires a lock object to be registered in sys.modules under the key LOCK_KEY_pdfplumber for thread safety. Closes #841 --- api/app/core/rag/deepdoc/parser/mineru_parser.py | 7 ++++--- api/app/core/workflow/nodes/knowledge/node.py | 2 +- api/app/services/draft_run_service.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/api/app/core/rag/deepdoc/parser/mineru_parser.py b/api/app/core/rag/deepdoc/parser/mineru_parser.py index fe6178ec..c2f7af16 100644 --- a/api/app/core/rag/deepdoc/parser/mineru_parser.py +++ b/api/app/core/rag/deepdoc/parser/mineru_parser.py @@ -292,9 +292,10 @@ class MinerUParser(RAGPdfParser): self.page_from = page_from self.page_to = page_to try: - with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf: - self.pdf = pdf - self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])] + with sys.modules[LOCK_KEY_pdfplumber]: # ← 加这一行,获取全局锁 + with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf: + self.pdf = pdf + self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])] except Exception as e: self.page_images = None self.total_page = 0 diff --git a/api/app/core/workflow/nodes/knowledge/node.py b/api/app/core/workflow/nodes/knowledge/node.py index 
29e46902..0601883d 100644 --- a/api/app/core/workflow/nodes/knowledge/node.py +++ b/api/app/core/workflow/nodes/knowledge/node.py @@ -233,7 +233,7 @@ class KnowledgeRetrievalNode(BaseNode): } ) ) - case RetrieveType.HYBRID | RetrieveType.Graph: + case (RetrieveType.HYBRID, RetrieveType.Graph): rs1_task = asyncio.to_thread( vector_service.search_by_vector, **{ "query": query, diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py index 978dfdab..461ee0c4 100644 --- a/api/app/services/draft_run_service.py +++ b/api/app/services/draft_run_service.py @@ -224,6 +224,7 @@ def create_knowledge_retrieval_tool(kb_config, kb_ids, user_id, citations_collec retrieve_chunks_result = knowledge_retrieval(query, kb_config) if retrieve_chunks_result: retrieval_knowledge = [i.page_content for i in retrieve_chunks_result] + logger.warning(f"检索知识结果:{retrieval_knowledge}") context = '\n\n'.join(retrieval_knowledge) logger.info( "知识库检索成功",