From cd12844a7cfaf95c02b92c728243d95e8299b775 Mon Sep 17 00:00:00 2001
From: lixiangcheng1 <lixiangcheng1@wanda.cn>
Date: Sat, 27 Dec 2025 17:12:04 +0800
Subject: [PATCH] [fix] build knowledge graph from DB document ids; add mapping.json

---
 api/app/core/rag/res/mapping.json | 212 ++++++++++++++++++++++++++++++
 api/app/tasks.py                  |   4 +-
 2 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 api/app/core/rag/res/mapping.json

diff --git a/api/app/core/rag/res/mapping.json b/api/app/core/rag/res/mapping.json
new file mode 100644
index 00000000..f32acb02
--- /dev/null
+++ b/api/app/core/rag/res/mapping.json
@@ -0,0 +1,212 @@
+{
+  "settings": {
+    "index": {
+      "number_of_shards": 2,
+      "number_of_replicas": 0,
+      "refresh_interval": "1000ms"
+    },
+    "similarity": {
+      "scripted_sim": {
+        "type": "scripted",
+        "script": {
+          "source": "double idf = Math.log(1+(field.docCount-term.docFreq+0.5)/(term.docFreq + 0.5))/Math.log(1+((field.docCount-0.5)/1.5)); return query.boost * idf * Math.min(doc.freq, 1);"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "lat_lon": {
+        "type": "geo_point",
+        "store": "true"
+      }
+    },
+    "date_detection": "true",
+    "dynamic_templates": [
+      {
+        "int": {
+          "match": "*_int",
+          "mapping": {
+            "type": "integer",
+            "store": "true"
+          }
+        }
+      },
+      {
+        "ulong": {
+          "match": "*_ulong",
+          "mapping": {
+            "type": "unsigned_long",
+            "store": "true"
+          }
+        }
+      },
+      {
+        "long": {
+          "match": "*_long",
+          "mapping": {
+            "type": "long",
+            "store": "true"
+          }
+        }
+      },
+      {
+        "short": {
+          "match": "*_short",
+          "mapping": {
+            "type": "short",
+            "store": "true"
+          }
+        }
+      },
+      {
+        "numeric": {
+          "match": "*_flt",
+          "mapping": {
+            "type": "float",
+            "store": true
+          }
+        }
+      },
+      {
+        "tks": {
+          "match": "*_tks",
+          "mapping": {
+            "type": "text",
+            "similarity": "scripted_sim",
+            "analyzer": "whitespace",
+            "store": true
+          }
+        }
+      },
+      {
+        "ltks": {
+          "match": "*_ltks",
+          "mapping": {
+            "type": "text",
+            "analyzer": "whitespace",
+            "store": true
+          }
+        }
+      },
+      {
+        "kwd": {
+          "match_pattern": "regex",
+          "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
+          "mapping": {
+            "type": "keyword",
+            "similarity": "boolean",
+            "store": true
+          }
+        }
+      },
+      {
+        "dt": {
+          "match_pattern": "regex",
+          "match": "^.*(_dt|_time|_at)$",
+          "mapping": {
+            "type": "date",
+            "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
+            "store": true
+          }
+        }
+      },
+      {
+        "nested": {
+          "match": "*_nst",
+          "mapping": {
+            "type": "nested"
+          }
+        }
+      },
+      {
+        "object": {
+          "match": "*_obj",
+          "mapping": {
+            "type": "object",
+            "dynamic": "true"
+          }
+        }
+      },
+      {
+        "string": {
+          "match_pattern": "regex",
+          "match": "^.*_(with_weight|list)$",
+          "mapping": {
+            "type": "text",
+            "index": "false",
+            "store": true
+          }
+        }
+      },
+      {
+        "rank_feature": {
+          "match": "*_fea",
+          "mapping": {
+            "type": "rank_feature"
+          }
+        }
+      },
+      {
+        "rank_features": {
+          "match": "*_feas",
+          "mapping": {
+            "type": "rank_features"
+          }
+        }
+      },
+      {
+        "dense_vector": {
+          "match": "*_512_vec",
+          "mapping": {
+            "type": "dense_vector",
+            "index": true,
+            "similarity": "cosine",
+            "dims": 512
+          }
+        }
+      },
+      {
+        "dense_vector": {
+          "match": "*_768_vec",
+          "mapping": {
+            "type": "dense_vector",
+            "index": true,
+            "similarity": "cosine",
+            "dims": 768
+          }
+        }
+      },
+      {
+        "dense_vector": {
+          "match": "*_1024_vec",
+          "mapping": {
+            "type": "dense_vector",
+            "index": true,
+            "similarity": "cosine",
+            "dims": 1024
+          }
+        }
+      },
+      {
+        "dense_vector": {
+          "match": "*_1536_vec",
+          "mapping": {
+            "type": "dense_vector",
+            "index": true,
+            "similarity": "cosine",
+            "dims": 1536
+          }
+        }
+      },
+      {
+        "binary": {
+          "match": "*_bin",
+          "mapping": {
+            "type": "binary"
+          }
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/api/app/tasks.py b/api/app/tasks.py
index 36c61c05..16173904 100644
--- a/api/app/tasks.py
+++ b/api/app/tasks.py
@@ -280,8 +280,10 @@ def build_graphrag_for_kb(kb_id: uuid.UUID):
     build knowledge graph
     """
     db = next(get_db())  # Manually call the generator
+    db_document = None
     db_knowledge = None
     try:
+        db_document = db.query(Document).filter(Document.kb_id == kb_id).all()
         db_knowledge = db.query(Knowledge).filter(Knowledge.id == kb_id).first()
         # 1. Prepare to configure chat_mdl、embedding_model、vision_model information
         chat_model = Base(
@@ -304,7 +306,7 @@ def build_graphrag_for_kb(kb_id: uuid.UUID):
         # 2. get all document_ids from knowledge base
         vector_service = ElasticSearchVectorFactory().init_vector(knowledge=db_knowledge)
         total, items = vector_service.search_by_segment(document_id=None, query=None, pagesize=9999, page=1, asc=True)
-        document_ids = [item.metadata["document_id"] for item in items]
+        document_ids = [item.id for item in db_document]
 
         # 2. using graphrag
         if db_knowledge.parser_config.get("graphrag", {}).get("use_graphrag", False):