From c4461c4917ddfc66e2325994100ac0ef9ee60b4a Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Tue, 24 Mar 2026 12:27:13 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90add=E3=80=91User=20alias=20extraction?= =?UTF-8?q?=20and=20retrieval?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../prompt/Problem_Extension_prompt.jinja2 | 52 ++++++++++++++++++ .../prompt/problem_breakdown_prompt.jinja2 | 52 ++++++++++++++++++ .../deduplication/deduped_and_disamb.py | 43 +++++++++++++++ .../prompt/prompts/extract_triplet.jinja2 | 18 ++++++ api/app/repositories/neo4j/cypher_queries.py | 55 +++++++++++++++++++ api/app/repositories/neo4j/graph_search.py | 3 +- api/app/schemas/end_user_schema.py | 3 +- 7 files changed, 224 insertions(+), 2 deletions(-) diff --git a/api/app/core/memory/agent/utils/prompt/Problem_Extension_prompt.jinja2 b/api/app/core/memory/agent/utils/prompt/Problem_Extension_prompt.jinja2 index a0e21fbd..c78cbaac 100644 --- a/api/app/core/memory/agent/utils/prompt/Problem_Extension_prompt.jinja2 +++ b/api/app/core/memory/agent/utils/prompt/Problem_Extension_prompt.jinja2 @@ -39,6 +39,30 @@ 比如:输入历史信息内容:[{'Query': '4月27日,我和你推荐过一本书,书名是什么?', 'ANswer': '张曼玉推荐了《小王子》'}] 拆分问题:4月27日,我和你推荐过一本书,书名是什么?,可以拆分为:4月27日,张曼玉推荐过一本书,书名是什么? +## 指代消歧规则(Coreference Resolution): +在拆分问题时,必须解析并替换所有指代词和抽象称呼,使问题具体化: + +1. **"用户"的消歧**: + - "用户是谁?" → 分析历史记录,找出对话发起者的姓名 + - 如果历史中有"我叫X"、"我的名字是X"、或多次提到某个人物,则"用户"指的就是这个人 + - 示例:历史中有"老李的原名叫李建国",则"用户是谁?"应拆分为"李建国是谁?"或"老李(李建国)是谁?" + +2. **"我"的消歧**: + - "我喜欢什么?" → 从历史中找出对话发起者的姓名,替换为"X喜欢什么?" + - 示例:历史中有"张曼玉推荐了《小王子》",则"我推荐的书是什么?"应拆分为"张曼玉推荐的书是什么?" + +3. **"他/她/它"的消歧**: + - 从上下文或历史中找出最近提到的同类实体 + - 示例:历史中有"老李的同事叫他建国哥",则"他的同事怎么称呼他?"应拆分为"老李的同事怎么称呼他?" + +4. **"那个人/这个人"的消歧**: + - 从历史中找出最近提到的人物 + - 示例:历史中有"李建国",则"那个人的原名是什么?"应拆分为"李建国的原名是什么?" + +5. 
**优先级**: + - 如果历史记录中反复出现某个人物(如"老李"、"李建国"、"建国哥"),则"用户"很可能指的就是这个人 + - 如果无法从历史中确定指代对象,保留原问题,但在reason中说明"无法确定指代对象" + 输出要求: @@ -71,6 +95,34 @@ "reason": "输出原问题的关键要素" } ] + +## 指代消歧示例(重要): +示例1 - "用户"的消歧: +输入历史:[{'Query': '老李的原名叫什么?', 'Answer': '李建国'}, {'Query': '老李的同事叫他什么?', 'Answer': '建国哥'}] +输入问题:"用户是谁?" +输出: +[ + { + "original_question": "用户是谁?", + "extended_question": "李建国是谁?", + "type": "单跳", + "reason": "历史中反复提到'老李/李建国/建国哥','用户'指的就是对话发起者李建国" + } +] + +示例2 - "我"的消歧: +输入历史:[{'Query': '张曼玉推荐了什么书?', 'Answer': '《小王子》'}] +输入问题:"我推荐的书是什么?" +输出: +[ + { + "original_question": "我推荐的书是什么?", + "extended_question": "张曼玉推荐的书是什么?", + "type": "单跳", + "reason": "历史中提到张曼玉推荐了书,'我'指的就是张曼玉" + } +] + **Output format** **CRITICAL JSON FORMATTING REQUIREMENTS:** 1. Use only standard ASCII double quotes (") for JSON structure - never use Chinese quotation marks ("") or other Unicode quotes diff --git a/api/app/core/memory/agent/utils/prompt/problem_breakdown_prompt.jinja2 b/api/app/core/memory/agent/utils/prompt/problem_breakdown_prompt.jinja2 index aca716a4..ff134ddb 100644 --- a/api/app/core/memory/agent/utils/prompt/problem_breakdown_prompt.jinja2 +++ b/api/app/core/memory/agent/utils/prompt/problem_breakdown_prompt.jinja2 @@ -27,6 +27,30 @@ 比如:输入历史信息内容:[{'Query': '4月27日,我和你推荐过一本书,书名是什么?', 'ANswer': '张曼玉推荐了《小王子》'}] 拆分问题:4月27日,我和你推荐过一本书,书名是什么?,可以拆分为:4月27日,张曼玉推荐过一本书,书名是什么? +## 指代消歧规则(Coreference Resolution): +在拆分问题时,必须解析并替换所有指代词和抽象称呼,使问题具体化: + +1. **"用户"的消歧**: + - "用户是谁?" → 分析历史记录,找出对话发起者的姓名 + - 如果历史中有"我叫X"、"我的名字是X"、或多次提到某个人物(如"老李"、"李建国"),则"用户"指的就是这个人 + - 示例:历史中反复出现"老李/李建国/建国哥",则"用户是谁?"应拆分为"李建国是谁?"或"老李(李建国)是谁?" + +2. **"我"的消歧**: + - "我喜欢什么?" → 从历史中找出对话发起者的姓名,替换为"X喜欢什么?" + - 示例:历史中有"张曼玉推荐了《小王子》",则"我推荐的书是什么?"应拆分为"张曼玉推荐的书是什么?" + +3. **"他/她/它"的消歧**: + - 从上下文或历史中找出最近提到的同类实体 + - 示例:历史中有"老李的同事叫他建国哥",则"他的同事怎么称呼他?"应拆分为"老李的同事怎么称呼他?" + +4. **"那个人/这个人"的消歧**: + - 从历史中找出最近提到的人物 + - 示例:历史中有"李建国",则"那个人的原名是什么?"应拆分为"李建国的原名是什么?" + +5. 
**优先级**: + - 如果历史记录中反复出现某个人物(如"老李"、"李建国"、"建国哥"),则"用户"很可能指的就是这个人 + - 如果无法从历史中确定指代对象,保留原问题,但在reason中说明"无法确定指代对象" + ## 指令: 你是一个智能数据拆分助手,请根据数据特性判断输入属于哪种类型: 单跳(Single-hop) @@ -151,6 +175,34 @@ ] - 必须通过json.loads()的格式支持的形式输出 - 必须通过json.loads()的格式支持的形式输出,响应必须是与此确切模式匹配的有效JSON对象。不要在JSON之前或之后包含任何文本。 + +## 指代消歧示例(重要): +示例1 - "用户"的消歧: +输入历史:[{'Query': '老李的原名叫什么?', 'Answer': '李建国'}, {'Query': '老李的同事叫他什么?', 'Answer': '建国哥'}] +输入问题:"用户是谁?" +输出: +[ + { + "id": "Q1", + "question": "李建国是谁?", + "type": "单跳", + "reason": "历史中反复提到'老李/李建国/建国哥','用户'指的就是对话发起者李建国" + } +] + +示例2 - "我"的消歧: +输入历史:[{'Query': '张曼玉推荐了什么书?', 'Answer': '《小王子》'}] +输入问题:"我推荐的书是什么?" +输出: +[ + { + "id": "Q1", + "question": "张曼玉推荐的书是什么?", + "type": "单跳", + "reason": "历史中提到张曼玉推荐了书,'我'指的就是张曼玉" + } +] + - 关键的JSON格式要求 1.JSON结构仅使用标准ASCII双引号(“)-切勿使用中文引号(“”)或其他Unicode引号 2.如果提取的语句文本包含引号,请使用反斜杠(\“)正确转义它们 diff --git a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py index f2f14d9e..622f6e05 100644 --- a/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py +++ b/api/app/core/memory/storage_services/extraction_engine/deduplication/deduped_and_disamb.py @@ -203,6 +203,7 @@ def accurate_match( ) -> Tuple[List[ExtractedEntityNode], Dict[str, str], Dict[str, Dict]]: """ 精确匹配:按 (end_user_id, name, entity_type) 合并实体并建立重定向与合并记录。 + 同时检测某实体的 name 是否命中另一实体的 aliases,若命中则直接合并。 返回: (deduped_entities, id_redirect, exact_merge_map) """ exact_merge_map: Dict[str, Dict] = {} @@ -240,6 +241,48 @@ def accurate_match( pass deduped_entities = list(canonical_map.values()) + + # 2) 第二轮:检测某实体的 name 是否命中另一实体的 aliases(alias-to-name 精确合并) + # 场景:LLM 把 aliases 中的词(如"齐齐")又单独抽取为独立实体,需在此阶段合并掉 + # 优化:先构建 (end_user_id, alias_lower) -> canonical 的反向索引,查找 O(1) + alias_index: Dict[tuple, ExtractedEntityNode] = {} + for canonical in deduped_entities: + uid = getattr(canonical, "end_user_id", None) 
+ for alias in (getattr(canonical, "aliases", []) or []): + alias_lower = alias.strip().lower() + if alias_lower: + alias_index[(uid, alias_lower)] = canonical + + i = 0 + while i < len(deduped_entities): + ent = deduped_entities[i] + ent_name = (getattr(ent, "name", "") or "").strip().lower() + ent_uid = getattr(ent, "end_user_id", None) + canonical = alias_index.get((ent_uid, ent_name)) + # 确保不是自身 + if canonical is not None and canonical.id != ent.id: + _merge_attribute(canonical, ent) + id_redirect[ent.id] = canonical.id + for k, v in list(id_redirect.items()): + if v == ent.id: + id_redirect[k] = canonical.id + try: + k = f"{canonical.end_user_id}|{(canonical.name or '').strip()}|{(canonical.entity_type or '').strip()}" + if k not in exact_merge_map: + exact_merge_map[k] = { + "canonical_id": canonical.id, + "end_user_id": canonical.end_user_id, + "name": canonical.name, + "entity_type": canonical.entity_type, + "merged_ids": set(), + } + exact_merge_map[k]["merged_ids"].add(ent.id) + except Exception: + pass + deduped_entities.pop(i) + else: + i += 1 + return deduped_entities, id_redirect, exact_merge_map def fuzzy_match( diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index b2f287f4..25fffa33 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -96,6 +96,15 @@ The following shows type inheritance relationships (Child → Parent → Grandpa {% endif %} * Include common alternative names, abbreviations and full names * If no aliases exist, use empty array: [] + + **姓名别名识别规则(Name Alias Recognition):** + * 当前对话的用户实体 name 固定为"用户",不得使用用户透露的真实姓名作为 name + * 自我称呼模式:用户说"我的名字是X"、"我叫X" → X 加入 aliases(name 保持为"用户") + * 昵称/小名模式:识别"小名"、"昵称"、"英文名"、"网名"等关键词后的称呼 → 加入 aliases + * 他人称呼模式:识别"同事叫我X"、"朋友叫我X"、"大家叫我X" → X 加入 aliases + * 同一实体的多个称呼应合并到同一 Entity 的 aliases 列表中 + * aliases 中不应包含与 name 
完全相同的字符串
+  * **严禁将已加入某实体 aliases 的词再单独抽取为另一个独立实体**:若某个词已作为别名归属于"用户"实体,则不得再将该词作为独立 Entity 的 name 出现在 entities 列表中
 - Exclude lengthy quotes, calendar dates, temporal ranges, and temporal expressions
 - For numeric values: extract as separate entities (instance_of: 'Numeric', name: units, numeric_value: value)
   Example: £30 → name: 'GBP', numeric_value: 30, instance_of: 'Numeric'
@@ -207,6 +216,15 @@ Output:
     {"entity_idx": 0, "name": "三脚架", "type": "Equipment", "description": "摄影器材配件", "example": "", "aliases": ["相机三脚架"], "is_explicit_memory": false}
   ]
 }
+
+**Example 4 (姓名别名识别 - Chinese):** "我的名字是乐力齐,我的小名是齐齐,同事们都叫我小乐"
+Output:
+{
+  "triplets": [],
+  "entities": [
+    {"entity_idx": 0, "name": "用户", "type": "Person", "description": "用户本人,有多个称呼", "example": "", "aliases": ["乐力齐", "齐齐", "小乐"], "is_explicit_memory": false}
+  ]
+}
 {% endif %}
 
 ===End of Examples===
diff --git a/api/app/repositories/neo4j/cypher_queries.py b/api/app/repositories/neo4j/cypher_queries.py
index 1f699ad8..f80b7e26 100644
--- a/api/app/repositories/neo4j/cypher_queries.py
+++ b/api/app/repositories/neo4j/cypher_queries.py
@@ -336,6 +336,50 @@ ORDER BY score DESC
 LIMIT $limit
 """
 
+# Entity search over both the fulltext name index and the `aliases` list property.
+# The two match sources are combined inside a CALL subquery so that de-duplication,
+# ORDER BY and LIMIT apply to the combined result; a trailing ORDER BY / LIMIT after
+# a bare UNION would only bind to the last branch. An entity hit by both sources is
+# returned once with its highest score; alias-only hits get the fixed score 0.8.
+SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """
+CALL {
+    CALL db.index.fulltext.queryNodes("entitiesFulltext", $q) YIELD node AS e, score
+    WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
+    RETURN e, score
+    UNION ALL
+    MATCH (e:ExtractedEntity)
+    WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
+      AND e.aliases IS NOT NULL
+      AND ANY(alias IN e.aliases WHERE toLower(alias) CONTAINS toLower($q))
+    RETURN e, 0.8 AS score
+}
+WITH e, max(score) AS score
+OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
+OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
+RETURN e.id AS id,
+       e.name AS name,
+       e.end_user_id AS end_user_id,
+       e.entity_type AS entity_type,
+       e.created_at AS created_at,
+       e.expired_at AS expired_at,
+       e.entity_idx AS entity_idx,
+       e.statement_id AS statement_id,
+       e.description AS description,
+       e.aliases AS aliases,
+       e.name_embedding AS name_embedding,
+       e.connect_strength AS connect_strength,
+       collect(DISTINCT s.id) AS statement_ids,
+       collect(DISTINCT c.id) AS chunk_ids,
+       COALESCE(e.activation_value, e.importance_score, 0.5) AS activation_value,
+       COALESCE(e.importance_score, 0.5) AS importance_score,
+       e.last_access_time AS last_access_time,
+       COALESCE(e.access_count, 0) AS access_count,
+       score
+ORDER BY score DESC
+LIMIT $limit
+"""
+
+
 SEARCH_CHUNKS_BY_CONTENT = """
 CALL db.index.fulltext.queryNodes("chunksFulltext", $q) YIELD node AS c, score
 WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
diff --git a/api/app/repositories/neo4j/graph_search.py b/api/app/repositories/neo4j/graph_search.py
index d3aabd32..c5d3bcca 100644
--- a/api/app/repositories/neo4j/graph_search.py
+++ b/api/app/repositories/neo4j/graph_search.py
@@ -13,6 +13,7 @@ from app.repositories.neo4j.cypher_queries import (
     SEARCH_COMMUNITIES_BY_KEYWORD,
     SEARCH_DIALOGUE_BY_DIALOG_ID,
     SEARCH_ENTITIES_BY_NAME,
+    SEARCH_ENTITIES_BY_NAME_OR_ALIAS,
     SEARCH_MEMORY_SUMMARIES_BY_KEYWORD,
     SEARCH_STATEMENTS_BY_CREATED_AT,
     SEARCH_STATEMENTS_BY_KEYWORD,
@@ -264,7 +265,7 @@ async def search_graph(
     if "entities" in include:
         tasks.append(connector.execute_query(
-            SEARCH_ENTITIES_BY_NAME,
+
SEARCH_ENTITIES_BY_NAME_OR_ALIAS, q=q, end_user_id=end_user_id, limit=limit, diff --git a/api/app/schemas/end_user_schema.py b/api/app/schemas/end_user_schema.py index bbb6fd5c..09671b91 100644 --- a/api/app/schemas/end_user_schema.py +++ b/api/app/schemas/end_user_schema.py @@ -1,6 +1,6 @@ import uuid import datetime -from typing import Optional +from typing import Optional, List from pydantic import BaseModel, Field from pydantic import ConfigDict @@ -49,6 +49,7 @@ class EndUserProfileUpdate(BaseModel): """终端用户基本信息更新请求模型""" end_user_id: str = Field(description="终端用户ID") other_name: Optional[str] = Field(description="其他名称", default="") + aliases: Optional[List[str]] = Field(description="别名列表", default=None) position: Optional[str] = Field(description="职位", default=None) department: Optional[str] = Field(description="部门", default=None) contact: Optional[str] = Field(description="联系方式", default=None)