fix(rag): fix pdfplumber concurrency issue and add debug logging
The pdfplumber parser now acquires a global lock to prevent concurrent-access issues during PDF image rendering. Additionally, a warning-level log was added to trace knowledge retrieval results for debugging purposes. The syntax fix in the knowledge node's match case ensures correct pattern-matching behavior. BREAKING CHANGE: The pdfplumber parser now requires a lock object to be registered in sys.modules under the key LOCK_KEY_pdfplumber for thread safety. Closes #841
This commit is contained in:
@@ -292,9 +292,10 @@ class MinerUParser(RAGPdfParser):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
try:
|
||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||
with sys.modules[LOCK_KEY_pdfplumber]: # ← 加这一行,获取全局锁
|
||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||
except Exception as e:
|
||||
self.page_images = None
|
||||
self.total_page = 0
|
||||
|
||||
@@ -233,7 +233,7 @@ class KnowledgeRetrievalNode(BaseNode):
|
||||
}
|
||||
)
|
||||
)
|
||||
case RetrieveType.HYBRID | RetrieveType.Graph:
|
||||
case (RetrieveType.HYBRID, RetrieveType.Graph):
|
||||
rs1_task = asyncio.to_thread(
|
||||
vector_service.search_by_vector, **{
|
||||
"query": query,
|
||||
|
||||
@@ -224,6 +224,7 @@ def create_knowledge_retrieval_tool(kb_config, kb_ids, user_id, citations_collec
|
||||
retrieve_chunks_result = knowledge_retrieval(query, kb_config)
|
||||
if retrieve_chunks_result:
|
||||
retrieval_knowledge = [i.page_content for i in retrieve_chunks_result]
|
||||
logger.warning(f"检索知识结果:{retrieval_knowledge}")
|
||||
context = '\n\n'.join(retrieval_knowledge)
|
||||
logger.info(
|
||||
"知识库检索成功",
|
||||
|
||||
Reference in New Issue
Block a user