From cd12844a7cfaf95c02b92c728243d95e8299b775 Mon Sep 17 00:00:00 2001 From: lixiangcheng1 Date: Sat, 27 Dec 2025 17:12:04 +0800 Subject: [PATCH] [fix]build knowledge graph --- api/app/core/rag/res/mapping.json | 212 ++++++++++++++++++++++++++++++ api/app/tasks.py | 4 +- 2 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 api/app/core/rag/res/mapping.json diff --git a/api/app/core/rag/res/mapping.json b/api/app/core/rag/res/mapping.json new file mode 100644 index 00000000..f32acb02 --- /dev/null +++ b/api/app/core/rag/res/mapping.json @@ -0,0 +1,212 @@ +{ + "settings": { + "index": { + "number_of_shards": 2, + "number_of_replicas": 0, + "refresh_interval": "1000ms" + }, + "similarity": { + "scripted_sim": { + "type": "scripted", + "script": { + "source": "double idf = Math.log(1+(field.docCount-term.docFreq+0.5)/(term.docFreq + 0.5))/Math.log(1+((field.docCount-0.5)/1.5)); return query.boost * idf * Math.min(doc.freq, 1);" + } + } + } + }, + "mappings": { + "properties": { + "lat_lon": { + "type": "geo_point", + "store": "true" + } + }, + "date_detection": "true", + "dynamic_templates": [ + { + "int": { + "match": "*_int", + "mapping": { + "type": "integer", + "store": "true" + } + } + }, + { + "ulong": { + "match": "*_ulong", + "mapping": { + "type": "unsigned_long", + "store": "true" + } + } + }, + { + "long": { + "match": "*_long", + "mapping": { + "type": "long", + "store": "true" + } + } + }, + { + "short": { + "match": "*_short", + "mapping": { + "type": "short", + "store": "true" + } + } + }, + { + "numeric": { + "match": "*_flt", + "mapping": { + "type": "float", + "store": true + } + } + }, + { + "tks": { + "match": "*_tks", + "mapping": { + "type": "text", + "similarity": "scripted_sim", + "analyzer": "whitespace", + "store": true + } + } + }, + { + "ltks": { + "match": "*_ltks", + "mapping": { + "type": "text", + "analyzer": "whitespace", + "store": true + } + } + }, + { + "kwd": { + "match_pattern": "regex", + "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$", + "mapping": { + "type": "keyword", + "similarity": "boolean", + "store": true + } + } + }, + { + "dt": { + "match_pattern": "regex", + "match": "^.*(_dt|_time|_at)$", + "mapping": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss", + "store": true + } + } + }, + { + "nested": { + "match": "*_nst", + "mapping": { + "type": "nested" + } + } + }, + { + "object": { + "match": "*_obj", + "mapping": { + "type": "object", + "dynamic": "true" + } + } + }, + { + "string": { + "match_pattern": "regex", + "match": "^.*_(with_weight|list)$", + "mapping": { + "type": "text", + "index": "false", + "store": true + } + } + }, + { + "rank_feature": { + "match": "*_fea", + "mapping": { + "type": "rank_feature" + } + } + }, + { + "rank_features": { + "match": "*_feas", + "mapping": { + "type": "rank_features" + } + } + }, + { + "dense_vector": { + "match": "*_512_vec", + "mapping": { + "type": "dense_vector", + "index": true, + "similarity": "cosine", + "dims": 512 + } + } + }, + { + "dense_vector": { + "match": "*_768_vec", + "mapping": { + "type": "dense_vector", + "index": true, + "similarity": "cosine", + "dims": 768 + } + } + }, + { + "dense_vector": { + "match": "*_1024_vec", + "mapping": { + "type": "dense_vector", + "index": true, + "similarity": "cosine", + "dims": 1024 + } + } + }, + { + "dense_vector": { + "match": "*_1536_vec", + "mapping": { + "type": "dense_vector", + "index": true, + "similarity": "cosine", + "dims": 1536 + } + } + }, + { + "binary": { + "match": "*_bin", + "mapping": { + "type": "binary" + } + } + } + ] + } +} \ No newline at end of file diff --git a/api/app/tasks.py b/api/app/tasks.py index 36c61c05..16173904 100644 --- a/api/app/tasks.py +++ b/api/app/tasks.py @@ -280,8 +280,10 @@ def build_graphrag_for_kb(kb_id: uuid.UUID): build knowledge graph """ db = next(get_db()) # Manually call the generator + db_document = None db_knowledge = None try: + db_document = db.query(Document).filter(Document.kb_id == kb_id).all() db_knowledge = db.query(Knowledge).filter(Knowledge.id == kb_id).first() # 1. Prepare to configure chat_mdl、embedding_model、vision_model information chat_model = Base( @@ -304,7 +306,7 @@ def build_graphrag_for_kb(kb_id: uuid.UUID): # 2. get all document_ids from knowledge base vector_service = ElasticSearchVectorFactory().init_vector(knowledge=db_knowledge) total, items = vector_service.search_by_segment(document_id=None, query=None, pagesize=9999, page=1, asc=True) - document_ids = [item.metadata["document_id"] for item in items] + document_ids = [item.id for item in db_document] # 2. using graphrag if db_knowledge.parser_config.get("graphrag", {}).get("use_graphrag", False):