refactor(memory): add PilotWritePipeline and enrich extraction schema

- Add dedicated PilotWritePipeline (statement → triplet → graph_build → layer-1 dedup, no Neo4j write)
- Add type_description/predicate_description fields across entity and triplet models, Cypher queries, and graph builders
- Refactor data_pruning with LRU cache and snapshot support; skip assistant chunks in extraction
- Remove strict Predicate enum whitelist; support statement_text alias in legacy extractor
- Wire PipelineSnapshot through preprocessing and emotion extraction for debug tracing
- Add PILOT_RUN_USE_REFACTORED_PIPELINE env toggle for pipeline selection
This commit is contained in:
lanceyq
2026-04-27 18:15:46 +08:00
parent b0ddd12cc6
commit 2355536b44
23 changed files with 806 additions and 1070 deletions

View File

@@ -1382,6 +1382,7 @@ def extract_emotion_batch_task(
llm_model_id: str,
language: str = "zh",
emotion_config: Optional[Dict[str, Any]] = None,
snapshot_dir: Optional[str] = None,
) -> Dict[str, Any]:
"""Celery task: batch emotion extraction + Neo4j backfill.
@@ -1395,6 +1396,10 @@ def extract_emotion_batch_task(
language: Language code ("zh" / "en").
emotion_config: Optional dict with emotion step config overrides
(emotion_extract_keywords, emotion_enable_subject).
snapshot_dir: Optional absolute path of the current run's snapshot directory.
When provided (only in debug mode), emotion outputs will be
dumped to <snapshot_dir>/4_emotion_outputs.json for offline
comparison between the legacy / new pipelines.
"""
task_id = self.request.id
total = len(statements)
@@ -1445,6 +1450,8 @@ def extract_emotion_batch_task(
extracted = 0
failed = 0
update_items = []
# 快照用:收集每条 statement 的 EmotionStepOutput仅当 snapshot_dir 非空时使用)
snapshot_outputs: Dict[str, Any] = {} if snapshot_dir else None # type: ignore[assignment]
async def _extract_one(stmt_dict: Dict[str, str]):
nonlocal extracted, failed
@@ -1461,6 +1468,8 @@ def extract_emotion_batch_task(
"emotion_intensity": result.emotion_intensity,
"emotion_keywords": result.emotion_keywords,
})
if snapshot_outputs is not None:
snapshot_outputs[stmt_dict["statement_id"]] = result.model_dump()
extracted += 1
logger.debug(
f"[Emotion] 单条提取完成: stmt={stmt_dict['statement_id']}, "
@@ -1468,12 +1477,33 @@ def extract_emotion_batch_task(
)
except Exception as e:
failed += 1
if snapshot_outputs is not None:
snapshot_outputs[stmt_dict["statement_id"]] = {"error": str(e)}
logger.warning(
f"[Emotion] 单条提取失败 stmt={stmt_dict['statement_id']}: {e}"
)
await asyncio.gather(*[_extract_one(s) for s in statements])
# 快照落盘worker 端):不影响 Neo4j 写入流程,失败只打日志
if snapshot_outputs is not None:
try:
from pathlib import Path as _Path
import json as _json
_dir = _Path(snapshot_dir)
_dir.mkdir(parents=True, exist_ok=True)
_path = _dir / "4_emotion_outputs.json"
with open(_path, "w", encoding="utf-8") as _f:
_json.dump(snapshot_outputs, _f, ensure_ascii=False, indent=2, default=str)
logger.info(
f"[Emotion][Snapshot] 已落盘 {len(snapshot_outputs)} 条情绪结果 → {_path}"
)
except Exception as _e:
logger.warning(
f"[Emotion][Snapshot] 快照落盘失败(不影响主流程): {_e}"
)
# Batch update Neo4j via write transaction
if update_items:
connector = Neo4jConnector()