feat(memory): add alias invalidation support for entity alias management

Introduce the `别名失效` ("alias invalidated") predicate to handle cases
where the conversation explicitly states that an alias no longer applies
to an entity.

Changes:
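- write_pipeline.py: extend _merge_alias_in_memory to process
  `别名失效` edges — remove invalidated alias names from the target
  entity's aliases list (case-insensitive match) and redirect edges
  that point at the invalidated alias node to the target entity,
  all in memory before the Neo4j write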
- cypher_queries.py: add REMOVE_INVALID_ALIASES and DELETE_ALIAS_NODES
  queries; update REDIRECT_ALIAS_EDGES to handle both `别名属于` and
  `别名失效` predicates
- tasks.py: add step 1.5 in post_store_dedup_and_alias_merge_task to
  execute REMOVE_INVALID_ALIASES and sync removals to PostgreSQL;
  add step 3 to delete alias nodes after edge redirection; add
  snapshot step 3.5 for post-merge entity state; pass snapshot_dir
  to the task
- end_user_info_repository.py: add remove_aliases() method to remove
  specified aliases from end_user_info.aliases (case-insensitive)
- write_snapshot_recorder.py: add save_alias_merge_result() static
  method to write stage 8 snapshot after alias merge and deletion
- extract_triplet.jinja2: document `别名失效` predicate with usage
  rules — only use when conversation explicitly negates an alias
This commit is contained in:
lanceyq
2026-05-07 20:07:53 +08:00
parent e3ab19dd4f
commit aa9eb66668
6 changed files with 255 additions and 23 deletions

View File

@@ -455,16 +455,18 @@ class WritePipeline:
# ──────────────────────────────────────────────
def _merge_alias_in_memory(self, result: ExtractionResult) -> None:
"""别名归并(内存侧):处理 predicate="别名属于" 的边。
"""别名归并(内存侧):处理 predicate="别名属于" 和 predicate="别名失效" 的边。
在写入 Neo4j 之前执行,确保写入的数据已经完成别名归并:
- 将别名实体的 name 追加到目标实体的 aliases
- 将别名实体的 description 拼接到目标实体的 description
- 别名属于:将别名实体的 name 追加到目标实体的 aliases
- 别名属于:将别名实体的 description 拼接到目标实体的 description
- 别名失效:从目标实体的 aliases 中移除对应的旧别名
- 重定向指向别名节点的边到目标节点
纯内存操作,不涉及 Neo4j。
"""
ALIAS_PREDICATE = "别名属于"
ALIAS_INVALID_PREDICATE = "别名失效"
alias_edges = [
e
@@ -472,15 +474,22 @@ class WritePipeline:
if getattr(e, "relation_type", "") == ALIAS_PREDICATE
or getattr(e, "predicate", "") == ALIAS_PREDICATE
]
invalid_alias_edges = [
e
for e in result.entity_entity_edges
if getattr(e, "relation_type", "") == ALIAS_INVALID_PREDICATE
or getattr(e, "predicate", "") == ALIAS_INVALID_PREDICATE
]
if not alias_edges:
logger.debug("[AliasMerge] 无 '别名属于' 关系,跳过")
if not alias_edges and not invalid_alias_edges:
logger.debug("[AliasMerge] 无 '别名属于'/'别名失效' 关系,跳过")
return
try:
entity_map = {e.id: e for e in result.entity_nodes}
alias_to_target: dict[str, str] = {}
# ── 处理 别名属于:追加 aliases ──
for edge in alias_edges:
source_node = entity_map.get(edge.source)
target_node = entity_map.get(edge.target)
@@ -507,29 +516,52 @@ class WritePipeline:
f"{tgt_desc}{src_desc}" if tgt_desc else src_desc
)
# ── 处理 别名失效:从 aliases 中移除旧别名 ──
invalid_alias_to_target: dict[str, str] = {}
for edge in invalid_alias_edges:
source_node = entity_map.get(edge.source)
target_node = entity_map.get(edge.target)
if not source_node or not target_node:
continue
invalid_alias_to_target[edge.source] = edge.target
# 从 target.aliases 中移除 source.name忽略大小写
invalid_name = (source_node.name or "").strip()
if invalid_name and target_node.aliases:
target_node.aliases = [
a for a in target_node.aliases
if a.lower() != invalid_name.lower()
]
logger.debug(
f"[AliasMerge] 从 '{target_node.name}' 的 aliases 中移除失效别名 '{invalid_name}'"
)
# 重定向指向别名节点的边到目标节点
alias_ids = set(alias_to_target.keys())
alias_ids = set(alias_to_target.keys()) | set(invalid_alias_to_target.keys())
all_alias_map = {**alias_to_target, **invalid_alias_to_target}
redirected_ee_count = 0
redirected_se_count = 0
for edge in result.entity_entity_edges:
rel_type = getattr(edge, "relation_type", "")
if rel_type == ALIAS_PREDICATE:
if rel_type in (ALIAS_PREDICATE, ALIAS_INVALID_PREDICATE):
continue
if edge.source in alias_ids:
edge.source = alias_to_target[edge.source]
edge.source = all_alias_map[edge.source]
redirected_ee_count += 1
if edge.target in alias_ids:
edge.target = alias_to_target[edge.target]
edge.target = all_alias_map[edge.target]
redirected_ee_count += 1
for edge in result.stmt_entity_edges:
if edge.target in alias_ids:
edge.target = alias_to_target[edge.target]
edge.target = all_alias_map[edge.target]
redirected_se_count += 1
logger.info(
f"[AliasMerge] 内存归并完成,处理 {len(alias_edges)}'别名属于' 边,"
f"{len(invalid_alias_edges)}'别名失效' 边,"
f"重定向 entity_entity 边 {redirected_ee_count} 次,"
f"重定向 stmt_entity 边 {redirected_se_count}"
)
@@ -574,6 +606,7 @@ class WritePipeline:
"end_user_id": self.end_user_id,
"entity_ids": [e.id for e in result.entity_nodes],
"llm_model_id": llm_model_id,
"snapshot_dir": snapshot_dir,
},
)

View File

@@ -125,6 +125,44 @@ class WriteSnapshotRecorder:
},
)
# ── Stage 8: 别名归并后(异步,由 Celery PostStore 任务写入) ──
@staticmethod
def save_alias_merge_result(snapshot_dir: str, entity_rows: List[Dict]) -> None:
    """Write the post-alias-merge entity state to ``8_after_alias_merge.json``.

    Meant to be called by the Celery ``post_store_dedup_and_alias_merge``
    task once alias merging and alias-node deletion are done. It writes
    straight into an already existing snapshot directory, so no
    ``WriteSnapshotRecorder`` instance has to be rebuilt.

    The write is best-effort: any failure is logged as a warning and never
    propagated to the caller.

    Args:
        snapshot_dir: Path of the snapshot directory created by the main
            pipeline. When empty or ``None`` the snapshot is skipped.
        entity_rows: Entity property rows queried from Neo4j; each row is
            expected to provide ``id`` / ``name`` / ``entity_type`` /
            ``description`` / ``aliases``. Missing keys are written as
            ``null``, except ``aliases`` which is always written as a list.
    """
    import json
    from pathlib import Path

    # No snapshot directory was handed over: there is nowhere to write, and
    # letting Path(None) raise would only produce a misleading "save failed".
    if not snapshot_dir:
        logger.debug("[Snapshot] snapshot_dir 为空,跳过 8_after_alias_merge")
        return

    try:
        path = Path(snapshot_dir) / "8_after_alias_merge.json"
        data = {
            "entity_nodes": [
                {
                    "id": row.get("id"),
                    "name": row.get("name"),
                    "entity_type": row.get("entity_type"),
                    "description": row.get("description"),
                    # `or []` also covers rows where the key exists but the
                    # stored property is null, keeping the field a list.
                    "aliases": row.get("aliases") or [],
                }
                for row in entity_rows
            ],
            "entity_count": len(entity_rows),
        }
        with open(path, "w", encoding="utf-8") as f:
            # default=str stringifies values json cannot encode natively
            json.dump(data, f, ensure_ascii=False, indent=2, default=str)
        logger.debug(f"[Snapshot] 8_after_alias_merge → {path}")
    except Exception as e:
        # Snapshots are diagnostic only; never let them break the task.
        logger.warning(f"[Snapshot] 保存 8_after_alias_merge 失败: {e}")
# ── Stage 0: 汇总 ──
def record_summary(self, stats: Dict[str, int]) -> None:

View File

@@ -234,11 +234,12 @@ Each relation class now keeps only one canonical `covered_predicates` value. Onc
{% endif %}
- `命名关系`
- definition: 表达实体名称、别名、称呼之间的对应关系。
- covered_predicates: `别名属于`
- positive_examples: `山哥 -> 别名属于 -> 用户`、`多多 -> 别名属于 -> 用户的小狗`
- definition: 表达实体名称、别名、称呼之间的对应关系,包括新增别名和别名失效两种子语义
- covered_predicates: `别名属于`、`别名失效`
- positive_examples (`别名属于`): `山哥 -> 别名属于 -> 用户`、`多多 -> 别名属于 -> 用户的小狗`
- positive_examples (`别名失效`): `老陈 -> 别名失效 -> 用户`(当对话明确表达"不再叫X了"时使用)
- negative_examples: `导师 -> 别名属于 -> 用户`、`好人 -> 别名属于 -> 用户`
- notes: 只处理名字性表达,不处理角色、职业、评价词。
- notes: 只处理名字性表达,不处理角色、职业、评价词。`别名失效` 仅在对话**明确**表达某个别名已不再适用时使用,不要因为出现新别名就自动推断旧别名失效。
- status: `enabled`
- `归属身份关系`
@@ -341,6 +342,7 @@ Each relation class now keeps only one canonical `covered_predicates` value. Onc
只能使用以下中文关系类型。如果没有完全匹配的关系,请选择最接近的一项,不要发明新关系。
- `别名属于`: 别名指向其对应的规范实体
- `别名失效`: 某个别名已不再适用于该实体(仅在对话明确表达"不再叫X"时使用)
- `属于类型`: 实体属于某种类别、身份、职业、角色或归属对象
- `位于`: 实体位于某地点、场所或空间位置
- `前往`: 主体前往某个地点、场所、组织或活动对象
@@ -477,7 +479,7 @@ Each relation class now keeps only one canonical `covered_predicates` value. Onc
- Do not use ordinary time expressions as triplet objects.
{% endif %}
**Alias Relation (`别名属于`):**
**Alias Relation (`别名属于` / `别名失效`):**
{% if language == "zh" %}
- 当多个名字明确指向同一实体时,使用 `别名属于`。
@@ -488,6 +490,8 @@ Each relation class now keeps only one canonical `covered_predicates` value. Onc
- 如果所有格同时明确表达持有关系,也应抽取 `X -> 拥有 -> X 的 Y`。
- 在用户自指场景中,规范实体应为已经规范化后的 `用户`。
- 不要把角色、职业、身份、类别、夸赞、评价或其他非名字性描述抽成 `别名属于`。
- 当对话**明确**表达某个别名已不再适用时(如"不叫X了""X这个称呼已经不用了"),使用 `别名失效`,方向为 `旧别名 -> 别名失效 -> canonical entity`。
- 不要因为出现新别名就自动推断旧别名失效;只有对话中有明确的否定/废弃表达时才使用 `别名失效`。
{% else %}
- Use `别名属于` when multiple names clearly refer to the same entity.
- Direction is always `alias -> 别名属于 -> canonical entity`.
@@ -497,6 +501,8 @@ Each relation class now keeps only one canonical `covered_predicates` value. Onc
- If the possessive phrase also explicitly expresses possession, also extract `X -> 拥有 -> X's Y`.
- In user self-reference cases, the canonical entity should be the normalized user entity `用户`.
- Do not use `别名属于` for roles, occupations, identities, categories, compliments, evaluations, or other non-name descriptions.
- Use `别名失效` when the conversation **explicitly** states that an alias is no longer applicable (e.g., "no longer called X", "X is not used anymore"). Direction is `old_alias -> 别名失效 -> canonical entity`.
- Do not infer alias invalidation just because a new alias appears; only use `别名失效` when there is an explicit negation or abandonment expression in the conversation.
{% endif %}
**subject_name / object_name Consistency:**