fix(rag): fix pdfplumber concurrency issue and add debug logging
The pdfplumber-based parser now acquires a global lock before opening a PDF and rendering its page images, preventing concurrent-access issues. Additionally, a warning-level log statement was added to trace knowledge retrieval results for debugging purposes. The syntax fix in the knowledge node's `match` statement is intended to ensure correct pattern-matching behavior. BREAKING CHANGE: The pdfplumber parser now requires a lock object to be registered in `sys.modules` under the key `LOCK_KEY_pdfplumber` for thread safety. Closes #841
This commit is contained in:
@@ -292,9 +292,10 @@ class MinerUParser(RAGPdfParser):
|
|||||||
self.page_from = page_from
|
self.page_from = page_from
|
||||||
self.page_to = page_to
|
self.page_to = page_to
|
||||||
try:
|
try:
|
||||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
with sys.modules[LOCK_KEY_pdfplumber]: # ← 加这一行,获取全局锁
|
||||||
self.pdf = pdf
|
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
self.pdf = pdf
|
||||||
|
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.page_images = None
|
self.page_images = None
|
||||||
self.total_page = 0
|
self.total_page = 0
|
||||||
|
|||||||
@@ -233,7 +233,7 @@ class KnowledgeRetrievalNode(BaseNode):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
case RetrieveType.HYBRID | RetrieveType.Graph:
|
case (RetrieveType.HYBRID, RetrieveType.Graph):
|
||||||
rs1_task = asyncio.to_thread(
|
rs1_task = asyncio.to_thread(
|
||||||
vector_service.search_by_vector, **{
|
vector_service.search_by_vector, **{
|
||||||
"query": query,
|
"query": query,
|
||||||
|
|||||||
@@ -224,6 +224,7 @@ def create_knowledge_retrieval_tool(kb_config, kb_ids, user_id, citations_collec
|
|||||||
retrieve_chunks_result = knowledge_retrieval(query, kb_config)
|
retrieve_chunks_result = knowledge_retrieval(query, kb_config)
|
||||||
if retrieve_chunks_result:
|
if retrieve_chunks_result:
|
||||||
retrieval_knowledge = [i.page_content for i in retrieve_chunks_result]
|
retrieval_knowledge = [i.page_content for i in retrieve_chunks_result]
|
||||||
|
logger.warning(f"检索知识结果:{retrieval_knowledge}")
|
||||||
context = '\n\n'.join(retrieval_knowledge)
|
context = '\n\n'.join(retrieval_knowledge)
|
||||||
logger.info(
|
logger.info(
|
||||||
"知识库检索成功",
|
"知识库检索成功",
|
||||||
|
|||||||
Reference in New Issue
Block a user