From 00780289920c0cc84a22c1e7e64a451de5295b8a Mon Sep 17 00:00:00 2001 From: lixiangcheng1 Date: Tue, 30 Dec 2025 11:53:16 +0800 Subject: [PATCH] [ADD]Support graph search --- api/app/controllers/chunk_controller.py | 41 +++++++++++++++---- api/app/core/rag/graphrag/search.py | 29 ++++++------- api/app/core/rag/utils/es_conn.py | 2 +- api/app/repositories/knowledge_repository.py | 10 ++--- .../repositories/knowledgeshare_repository.py | 8 ++-- api/app/schemas/chunk_schema.py | 1 + 6 files changed, 59 insertions(+), 32 deletions(-) diff --git a/api/app/controllers/chunk_controller.py b/api/app/controllers/chunk_controller.py index 162c8e57..509ad442 100644 --- a/api/app/controllers/chunk_controller.py +++ b/api/app/controllers/chunk_controller.py @@ -18,6 +18,9 @@ from app.schemas.response_schema import ApiResponse from app.core.response_utils import success from app.services import knowledge_service, document_service, file_service, knowledgeshare_service from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory +from app.core.rag.common.settings import kg_retriever +from app.core.rag.llm.chat_model import Base +from app.core.rag.llm.embedding_model import OpenAIEmbed from app.core.logging_config import get_api_logger # Obtain a dedicated API logger @@ -389,36 +392,41 @@ async def retrieve_chunks( knowledge_model.Knowledge.chunk_num > 0, knowledge_model.Knowledge.status == 1 ] - existing_ids = knowledge_service.get_chunded_knowledgeids( + private_items = knowledge_service.get_chunded_knowledgeids( db=db, filters=filters, current_user=current_user ) + private_kb_ids = [item[0] for item in private_items] + private_workspace_ids = [item[1] for item in private_items] filters = [ knowledge_model.Knowledge.id.in_(retrieve_data.kb_ids), knowledge_model.Knowledge.permission_id == knowledge_model.PermissionType.Share, knowledge_model.Knowledge.chunk_num > 0, knowledge_model.Knowledge.status == 1 ] - share_ids = knowledge_service.get_chunded_knowledgeids( + items = knowledge_service.get_chunded_knowledgeids( db=db, filters=filters, current_user=current_user ) - if share_ids: + if items: filters = [ knowledgeshare_model.KnowledgeShare.target_kb_id.in_(retrieve_data.kb_ids) ] - items = knowledgeshare_service.get_source_kb_ids_by_target_kb_id( + share_items = knowledgeshare_service.get_source_kb_ids_by_target_kb_id( db=db, filters=filters, current_user=current_user ) - existing_ids.extend(items) - if not existing_ids: + share_kb_ids = [item[0] for item in share_items] + share_workspace_ids = [item[1] for item in share_items] + private_kb_ids.extend(share_kb_ids) + private_workspace_ids.extend(share_workspace_ids) + if not private_kb_ids: return success(data=[], msg="retrieval successful") - kb_id = existing_ids[0] - uuid_strs = [f"Vector_index_{kb_id}_Node".lower() for kb_id in existing_ids] + kb_id = private_kb_ids[0] + uuid_strs = [f"Vector_index_{kb_id}_Node".lower() for kb_id in private_kb_ids] indices = ",".join(uuid_strs) db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=kb_id, current_user=current_user) if not db_knowledge: @@ -448,4 +456,21 @@ async def retrieve_chunks( seen_ids.add(doc.metadata["doc_id"]) unique_rs.append(doc) rs = vector_service.rerank(query=retrieve_data.query, docs=unique_rs, top_k=retrieve_data.top_k) + if retrieve_data.retrieve_type == chunk_schema.RetrieveType.Graph: + kb_ids = [str(kb_id) for kb_id in private_kb_ids] + workspace_ids = [str(workspace_id) for workspace_id in private_workspace_ids] + # Prepare to configure chat_mdl、embedding_model、vision_model information + chat_model = Base( + key=db_knowledge.llm.api_keys[0].api_key, + model_name=db_knowledge.llm.api_keys[0].model_name, + base_url=db_knowledge.llm.api_keys[0].api_base + ) + embedding_model = OpenAIEmbed( + key=db_knowledge.embedding.api_keys[0].api_key, + model_name=db_knowledge.embedding.api_keys[0].model_name, + base_url=db_knowledge.embedding.api_keys[0].api_base + ) + doc = kg_retriever.retrieval(question=retrieve_data.query, workspace_ids=workspace_ids, kb_ids= kb_ids, emb_mdl=embedding_model, llm=chat_model) + if doc: + rs.insert(0, doc) return success(data=rs, msg="retrieval successful") \ No newline at end of file diff --git a/api/app/core/rag/graphrag/search.py b/api/app/core/rag/graphrag/search.py index 27eb674e..8823145f 100644 --- a/api/app/core/rag/graphrag/search.py +++ b/api/app/core/rag/graphrag/search.py @@ -4,6 +4,7 @@ from collections import defaultdict from copy import deepcopy import json_repair import pandas as pd +import time import trio from app.core.rag.common.misc_utils import get_uuid @@ -262,21 +263,21 @@ class KGSearch(Dealer): relas = "" return { - "chunk_id": get_uuid(), - "content_ltks": "", - "page_content": ents + relas + self._community_retrieval_([n for n, _ in ents_from_query], filters, kb_ids, idxnms, - comm_topn, max_token), + "page_content": ents + relas + self._community_retrieval_([n for n, _ in ents_from_query], filters, kb_ids, idxnms, comm_topn, max_token), + "vector": None, + "metadata": { + "doc_id": get_uuid(), + "file_id": "", + "file_name": "Related content in Knowledge Graph", + "file_created_at": int(time.time() * 1000), "document_id": "", - "docnm_kwd": "Related content in Knowledge Graph", - "kb_id": kb_ids, - "important_kwd": [], - "image_id": "", - "similarity": 1., - "vector_similarity": 1., - "term_similarity": 0, - "vector": [], - "positions": [], - } + "knowledge_id": kb_ids, + "sort_id": 0, + "status": 1, + "score": 1 + }, + "children": None + } def _community_retrieval_(self, entities, condition, kb_ids, idxnms, topn, max_token): ## Community retrieval diff --git a/api/app/core/rag/utils/es_conn.py b/api/app/core/rag/utils/es_conn.py index 43b1dfbe..7fbf0e38 100644 --- a/api/app/core/rag/utils/es_conn.py +++ b/api/app/core/rag/utils/es_conn.py @@ -213,7 +213,7 @@ class ESConnection(DocStoreConnection): m.topn * 2, query_vector=list(m.embedding_data), filter=bqry.to_dict(), - similarity=similarity, + # similarity=similarity ) if bqry and rank_feature: diff --git a/api/app/repositories/knowledge_repository.py b/api/app/repositories/knowledge_repository.py index b7908cb0..0766c447 100644 --- a/api/app/repositories/knowledge_repository.py +++ b/api/app/repositories/knowledge_repository.py @@ -52,19 +52,19 @@ def get_knowledges_paginated( raise -def get_chunked_knowledgeids( +def get_chunded_knowledgeids( db: Session, filters: list ) -> list: """ Query the list of vectorized knowledge base IDs - Return: list[UUID] - List of knowledge base IDs + Return: list[(id,workspace_id)] - List of knowledge base id and workspace_id """ db_logger.debug(f"Query the list of vectorized knowledge base IDs: filters_count={len(filters)}") try: # Only query the id field - query = db.query(Knowledge.id) + query = db.query(Knowledge.id, Knowledge.workspace_id) # Apply filter conditions for filter_cond in filters: @@ -74,8 +74,8 @@ def get_chunked_knowledgeids( items = query.all() db_logger.info(f"Querying the vectorized knowledge base id list succeeded: count={len(items)}") - # Return the list of IDs directly. Since only the ID field is queried, the returned data is a single column - return [item[0] for item in items] + # Return the list of ID and workspace_id directly. Since only the ID and workspace_id field is queried + return items except Exception as e: db_logger.error(f"Querying the vectorized knowledge base id list failed: {str(e)}") raise diff --git a/api/app/repositories/knowledgeshare_repository.py b/api/app/repositories/knowledgeshare_repository.py index e4976b8d..28bb07e9 100644 --- a/api/app/repositories/knowledgeshare_repository.py +++ b/api/app/repositories/knowledgeshare_repository.py @@ -61,14 +61,14 @@ def get_source_kb_ids_by_target_kb_id( ) -> list: """ Query the original knowledge base ID list by sharing the knowledge base - Return: list[UUID] - List of knowledge base IDs + Return: list[(source_kb_id,source_workspace_id)] - List of knowledge base source_kb_id and source_workspace_id """ db_logger.debug( f"Query the original knowledge base id list by sharing the knowledge base: filters_count={len(filters)}") try: # Only query the id field - query = db.query(KnowledgeShare.source_kb_id) + query = db.query(KnowledgeShare.source_kb_id, KnowledgeShare.source_workspace_id) # Apply filter conditions for filter_cond in filters: @@ -78,8 +78,8 @@ def get_source_kb_ids_by_target_kb_id( items = query.all() db_logger.info(f"Successfully queried the original knowledge base ID list by sharing the knowledge base: count={len(items)}") - # Return the list of IDs directly. Since only the ID field is queried, the returned data is a single column - return [item[0] for item in items] + # Return the list of source_kb_id and source_workspace_id directly. Since only the source_kb_id and source_workspace_id field is queried + return items except Exception as e: db_logger.error(f"Failed to query the original knowledge base ID list through knowledge base sharing: {str(e)}") raise diff --git a/api/app/schemas/chunk_schema.py b/api/app/schemas/chunk_schema.py index c38fe765..cef9b9cb 100644 --- a/api/app/schemas/chunk_schema.py +++ b/api/app/schemas/chunk_schema.py @@ -10,6 +10,7 @@ class RetrieveType(StrEnum): PARTICIPLE = "participle" SEMANTIC = "semantic" HYBRID = "hybrid" + Graph = "graph" class ChunkCreate(BaseModel):