From 9b07775395fa7765c0f4c9a0063bed0b73e6f0a2 Mon Sep 17 00:00:00 2001 From: lanceyq <1982376970@qq.com> Date: Mon, 9 Feb 2026 20:12:24 +0800 Subject: [PATCH] [fix]Memory extraction output the core engineering effect --- api/app/query_ontology_matched_entities.py | 52 +++++----- api/app/services/memory_storage_service.py | 105 +++++++++++++++++++++ 2 files changed, 128 insertions(+), 29 deletions(-) diff --git a/api/app/query_ontology_matched_entities.py b/api/app/query_ontology_matched_entities.py index 73490134..c878d258 100644 --- a/api/app/query_ontology_matched_entities.py +++ b/api/app/query_ontology_matched_entities.py @@ -169,10 +169,10 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ print(f" 找到 {len(entities)} 个实体") - # 4. 分类实体(场景类型、通用类型、未匹配) - scene_matched_entities = [] - general_matched_entities = [] - both_matched_entities = [] # 同时匹配场景和通用类型 + # 4. 互斥分类实体:场景类型优先 > 通用类型 > 未匹配 + # 确保: 场景实体数 + 通用实体数 + 未匹配数 = 总实体数 + scene_matched_entities = [] # 匹配场景类型(含同时匹配两者的) + general_matched_entities = [] # 仅匹配通用类型(不含已归入场景的) unmatched_entities = [] scene_type_distribution = defaultdict(list) @@ -183,11 +183,8 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ in_scene = entity_type in scene_ontology_types in_general = entity_type in general_ontology_types - if in_scene and in_general: - both_matched_entities.append(entity) - scene_type_distribution[entity_type].append(entity) - general_type_distribution[entity_type].append(entity) - elif in_scene: + if in_scene: + # 场景类型优先,同时匹配两者的也归入场景 scene_matched_entities.append(entity) scene_type_distribution[entity_type].append(entity) elif in_general: @@ -197,9 +194,8 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ unmatched_entities.append(entity) # 5. 输出匹配场景类型的实体 - total_scene_matched = len(scene_matched_entities) + len(both_matched_entities) print(f"\n{'='*70}") - print(f"✅ 匹配场景本体类型的实体 (共 {total_scene_matched} 个)") + print(f"✅ 匹配场景本体类型的实体 (共 {len(scene_matched_entities)} 个)") print(f"{'='*70}") if scene_type_distribution: @@ -219,9 +215,8 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ print(f"\n (无匹配场景类型的实体)") # 6. 输出匹配通用类型的实体 - total_general_matched = len(general_matched_entities) + len(both_matched_entities) print(f"\n{'='*70}") - print(f"✅ 匹配通用本体类型的实体 (共 {total_general_matched} 个)") + print(f"✅ 匹配通用本体类型的实体 (共 {len(general_matched_entities)} 个)") print(f"{'='*70}") if general_type_distribution: @@ -265,7 +260,6 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ # 8. 统计摘要 total_entities = len(entities) - any_matched = total_entities - len(unmatched_entities) print(f"\n{'='*70}") print(f"📊 统计摘要") @@ -276,35 +270,35 @@ async def query_ontology_matched_entities(end_user_id: str, config_id: Optional[ print(f" 场景本体类型数: {len(scene_ontology_types)}") print(f" 通用本体类型数: {len(general_ontology_types)}") - print(f"\n 匹配率统计:") + print(f"\n 互斥分类统计 (三者之和 = 总实体数):") print(f" {'-'*50}") - scene_rate = total_scene_matched / total_entities * 100 if total_entities > 0 else 0 - general_rate = total_general_matched / total_entities * 100 if total_entities > 0 else 0 - any_rate = any_matched / total_entities * 100 if total_entities > 0 else 0 + scene_rate = len(scene_matched_entities) / total_entities * 100 if total_entities > 0 else 0 + general_rate = len(general_matched_entities) / total_entities * 100 if total_entities > 0 else 0 unmatched_rate = len(unmatched_entities) / total_entities * 100 if total_entities > 0 else 0 - print(f" 匹配场景类型: {total_scene_matched} 个 ({scene_rate:.1f}%)") - print(f" 匹配通用类型: {total_general_matched} 个 ({general_rate:.1f}%)") - print(f" 同时匹配两者: {len(both_matched_entities)} 个 ({len(both_matched_entities)/total_entities*100:.1f}%)") - print(f" 仅匹配场景类型: {len(scene_matched_entities)} 个 ({len(scene_matched_entities)/total_entities*100:.1f}%)") - print(f" 仅匹配通用类型: {len(general_matched_entities)} 个 ({len(general_matched_entities)/total_entities*100:.1f}%)") - print(f" 匹配任一类型: {any_matched} 个 ({any_rate:.1f}%)") + print(f" 匹配场景类型: {len(scene_matched_entities)} 个 ({scene_rate:.1f}%)") + print(f" 匹配通用类型: {len(general_matched_entities)} 个 ({general_rate:.1f}%)") print(f" 未匹配任何类型: {len(unmatched_entities)} 个 ({unmatched_rate:.1f}%)") + print(f" ─────────────────────────────") + print(f" 合计: {len(scene_matched_entities)} + {len(general_matched_entities)} + {len(unmatched_entities)} = {len(scene_matched_entities) + len(general_matched_entities) + len(unmatched_entities)}") - # 9. 类型分布详情 + # 9. 场景类型分布详情(全部) if scene_type_distribution: - print(f"\n 场景类型分布 (Top 10):") + print(f"\n 场景类型分布 (全部 {len(scene_type_distribution)} 种):") print(f" {'-'*50}") sorted_scene_types = sorted(scene_type_distribution.items(), key=lambda x: len(x[1]), reverse=True) - for type_name, entities_list in sorted_scene_types[:10]: + for type_name, entities_list in sorted_scene_types: print(f" - {type_name}: {len(entities_list)} 个") + print(f" 场景类型实体总数: {len(scene_matched_entities)} 个") + # 10. 通用类型分布详情(全部) if general_type_distribution: - print(f"\n 通用类型分布 (Top 10):") + print(f"\n 通用类型分布 (全部 {len(general_type_distribution)} 种):") print(f" {'-'*50}") sorted_general_types = sorted(general_type_distribution.items(), key=lambda x: len(x[1]), reverse=True) - for type_name, entities_list in sorted_general_types[:10]: + for type_name, entities_list in sorted_general_types: print(f" - {type_name}: {len(entities_list)} 个") + print(f" 通用类型实体总数: {len(general_matched_entities)} 个") except Exception as e: print(f"\n❌ 查询出错: {str(e)}") diff --git a/api/app/services/memory_storage_service.py b/api/app/services/memory_storage_service.py index 71a644cf..16dc88c9 100644 --- a/api/app/services/memory_storage_service.py +++ b/api/app/services/memory_storage_service.py @@ -407,6 +407,17 @@ class DataConfigService: # 数据配置服务类(PostgreSQL) } yield format_sse_message("result", result_data) + # 步骤 6.5: 计算本体覆盖率统计并发出 + try: + ontology_coverage = await self._compute_ontology_coverage( + extracted_result=extracted_result, + memory_config=memory_config, + ) + if ontology_coverage: + yield format_sse_message("ontology_coverage", ontology_coverage) + except Exception as cov_err: + logger.warning(f"[PILOT_RUN_STREAM] Ontology coverage computation failed: {cov_err}", exc_info=True) + # 步骤 7: 发出完成事件 yield format_sse_message("done", { "message": "试运行完成", @@ -428,6 +439,100 @@ class DataConfigService: # 数据配置服务类(PostgreSQL) }) + async def _compute_ontology_coverage( + self, + extracted_result: Dict[str, Any], + memory_config, + ) -> Optional[Dict[str, Any]]: + """根据提取结果中的实体类型,与场景/通用本体类型做互斥分类统计。 + + 分类规则(互斥):场景类型优先 > 通用类型 > 未匹配 + 确保: 场景实体数 + 通用实体数 + 未匹配数 = 总实体数 + + Returns: + 包含三部分统计的字典,或 None(无实体数据时) + """ + core_entities = extracted_result.get("core_entities", []) + if not core_entities: + return None + + # 1. 加载场景本体类型集合 + scene_ontology_types: set = set() + try: + from app.repositories.ontology_class_repository import OntologyClassRepository + + if memory_config.scene_id: + class_repo = OntologyClassRepository(self.db) + ontology_classes = class_repo.get_classes_by_scene(memory_config.scene_id) + scene_ontology_types = {oc.class_name for oc in ontology_classes} + except Exception as e: + logger.warning(f"Failed to load scene ontology types: {e}") + + # 2. 加载通用本体类型集合 + general_ontology_types: set = set() + try: + from app.core.memory.ontology_services.ontology_type_loader import ( + get_general_ontology_registry, + is_general_ontology_enabled, + ) + + if is_general_ontology_enabled(): + registry = get_general_ontology_registry() + if registry: + general_ontology_types = set(registry.types.keys()) + except Exception as e: + logger.warning(f"Failed to load general ontology types: {e}") + + # 3. 互斥分类:场景优先 > 通用 > 未匹配 + scene_distribution: list = [] + general_distribution: list = [] + unmatched_distribution: list = [] + scene_total = 0 + general_total = 0 + unmatched_total = 0 + + for item in core_entities: + entity_type = item.get("type", "") + count = item.get("count", 0) + + if entity_type in scene_ontology_types: + scene_distribution.append({"type": entity_type, "count": count}) + scene_total += count + elif entity_type in general_ontology_types: + general_distribution.append({"type": entity_type, "count": count}) + general_total += count + else: + unmatched_distribution.append({"type": entity_type, "count": count}) + unmatched_total += count + + # 按数量降序排列 + scene_distribution.sort(key=lambda x: x["count"], reverse=True) + general_distribution.sort(key=lambda x: x["count"], reverse=True) + unmatched_distribution.sort(key=lambda x: x["count"], reverse=True) + + total_entities = scene_total + general_total + unmatched_total + + return { + "scene_type_distribution": { + "type_count": len(scene_distribution), + "entity_total": scene_total, + "types": scene_distribution, + }, + "general_type_distribution": { + "type_count": len(general_distribution), + "entity_total": general_total, + "types": general_distribution, + }, + "unmatched": { + "type_count": len(unmatched_distribution), + "entity_total": unmatched_total, + "types": unmatched_distribution, + }, + "total_entities": total_entities, + "time": int(time.time() * 1000), + } + + # -------------------- Neo4j Search & Analytics (fused from data_search_service.py) -------------------- # Ensure env for connector (e.g., NEO4J_PASSWORD) load_dotenv()