refactor(memory): add PilotWritePipeline and enrich extraction schema

- Add dedicated PilotWritePipeline (statement → triplet → graph_build → layer-1 dedup, no Neo4j write) - Add type_description/predicate_description fields across entity and triplet models, Cypher queries, and graph builders - Refactor data_pruning with LRU cache and snapshot support; skip assistant chunks in extraction - Remove strict Predicate enum whitelist; support statement_text alias in legacy extractor - Wire PipelineSnapshot through preprocessing and emotion extraction for debug tracing - Add PILOT_RUN_USE_REFACTORED_PIPELINE env toggle for pipeline selection
2026-04-27 18:15:46 +08:00
parent b0ddd12cc6
commit 2355536b44
23 changed files with 806 additions and 1070 deletions
--- a/api/app/services/memory_storage_service.py
+++ b/api/app/services/memory_storage_service.py
@@ -441,21 +441,12 @@ class DataConfigService:  # 数据配置服务类（PostgreSQL）
            with open(result_path, "r", encoding="utf-8") as rf:
                extracted_result = json.load(rf)

-            # 步骤 6: 计算本体覆盖率并合并到结果中
+            # 步骤 6: 组装结果（试运行不做额外覆盖率后处理）
            result_data = {
                "config_id": cid,
                "time_log": os.path.join(project_root, "logs", "time.log"),
                "extracted_result": extracted_result,
            }
-            try:
-                ontology_coverage = await self._compute_ontology_coverage(
-                    extracted_result=extracted_result,
-                    memory_config=memory_config,
-                )
-                if ontology_coverage:
-                    result_data["ontology_coverage"] = ontology_coverage
-            except Exception as cov_err:
-                logger.warning(f"[PILOT_RUN_STREAM] Ontology coverage computation failed: {cov_err}", exc_info=True)

            yield format_sse_message("result", result_data)

@@ -479,100 +470,6 @@ class DataConfigService:  # 数据配置服务类（PostgreSQL）
                "time": int(time.time() * 1000)
            })

-    async def _compute_ontology_coverage(
-            self,
-            extracted_result: Dict[str, Any],
-            memory_config,
-    ) -> Optional[Dict[str, Any]]:
-        """根据提取结果中的实体类型，与场景/通用本体类型做互斥分类统计。
-
-        分类规则（互斥）：场景类型优先 > 通用类型 > 未匹配
-        确保: 场景实体数 + 通用实体数 + 未匹配数 = 总实体数
-
-        Returns:
-            包含三部分统计的字典，或 None（无实体数据时）
-        """
-        core_entities = extracted_result.get("core_entities", [])
-        if not core_entities:
-            return None
-
-        # 1. 加载场景本体类型集合
-        scene_ontology_types: set = set()
-        try:
-            from app.repositories.ontology_class_repository import OntologyClassRepository
-
-            if memory_config.scene_id:
-                class_repo = OntologyClassRepository(self.db)
-                ontology_classes = class_repo.get_classes_by_scene(memory_config.scene_id)
-                scene_ontology_types = {oc.class_name for oc in ontology_classes}
-        except Exception as e:
-            logger.warning(f"Failed to load scene ontology types: {e}")
-
-        # 2. 加载通用本体类型集合
-        general_ontology_types: set = set()
-        try:
-            from app.core.memory.ontology_services.ontology_type_loader import (
-                get_general_ontology_registry,
-                is_general_ontology_enabled,
-            )
-
-            if is_general_ontology_enabled():
-                registry = get_general_ontology_registry()
-                if registry:
-                    general_ontology_types = set(registry.types.keys())
-        except Exception as e:
-            logger.warning(f"Failed to load general ontology types: {e}")
-
-        # 3. 互斥分类：场景优先 > 通用 > 未匹配
-        scene_distribution: list = []
-        general_distribution: list = []
-        unmatched_distribution: list = []
-        scene_total = 0
-        general_total = 0
-        unmatched_total = 0
-
-        for item in core_entities:
-            entity_type = item.get("type", "")
-            count = item.get("count", 0)
-
-            if entity_type in scene_ontology_types:
-                scene_distribution.append({"type": entity_type, "count": count})
-                scene_total += count
-            elif entity_type in general_ontology_types:
-                general_distribution.append({"type": entity_type, "count": count})
-                general_total += count
-            else:
-                unmatched_distribution.append({"type": entity_type, "count": count})
-                unmatched_total += count
-
-        # 按数量降序排列
-        scene_distribution.sort(key=lambda x: x["count"], reverse=True)
-        general_distribution.sort(key=lambda x: x["count"], reverse=True)
-        unmatched_distribution.sort(key=lambda x: x["count"], reverse=True)
-
-        total_entities = scene_total + general_total + unmatched_total
-
-        return {
-            "scene_type_distribution": {
-                "type_count": len(scene_distribution),
-                "entity_total": scene_total,
-                "types": scene_distribution,
-            },
-            "general_type_distribution": {
-                "type_count": len(general_distribution),
-                "entity_total": general_total,
-                "types": general_distribution,
-            },
-            "unmatched": {
-                "type_count": len(unmatched_distribution),
-                "entity_total": unmatched_total,
-                "types": unmatched_distribution,
-            },
-            "total_entities": total_entities,
-            "time": int(time.time() * 1000),
-        }
-
-
 # -------------------- Neo4j Search & Analytics (fused from data_search_service.py) --------------------
 # Ensure env for connector (e.g., NEO4J_PASSWORD)