diff --git a/api/app/core/memory/agent/utils/get_dialogs.py b/api/app/core/memory/agent/utils/get_dialogs.py index 22555fff..ea44d0a5 100644 --- a/api/app/core/memory/agent/utils/get_dialogs.py +++ b/api/app/core/memory/agent/utils/get_dialogs.py @@ -82,7 +82,9 @@ async def get_chunked_dialogs( pruning_config = PruningConfig( pruning_switch=memory_config.pruning_enabled, pruning_scene=memory_config.pruning_scene or "education", - pruning_threshold=memory_config.pruning_threshold + pruning_threshold=memory_config.pruning_threshold, + scene_id=str(memory_config.scene_id) if memory_config.scene_id else None, + ontology_classes=memory_config.ontology_classes, ) logger.info(f"[剪枝] 加载配置: switch={pruning_config.pruning_switch}, scene={pruning_config.pruning_scene}, threshold={pruning_config.pruning_threshold}") diff --git a/api/app/core/memory/models/config_models.py b/api/app/core/memory/models/config_models.py index ca1780aa..c2d62ac1 100644 --- a/api/app/core/memory/models/config_models.py +++ b/api/app/core/memory/models/config_models.py @@ -10,7 +10,7 @@ Classes: TemporalSearchParams: Parameters for temporal search queries """ -from typing import Optional +from typing import Optional, List from pydantic import BaseModel, Field @@ -55,17 +55,26 @@ class PruningConfig(BaseModel): Attributes: pruning_switch: Enable or disable semantic pruning - pruning_scene: Scene type for pruning ('education', 'online_service', 'outbound') + pruning_scene: Scene name for pruning, either a built-in key + ('education', 'online_service', 'outbound') or a custom scene_name + from ontology_scene table pruning_threshold: Pruning ratio (0-0.9, max 0.9 to avoid complete removal) + scene_id: Optional ontology scene UUID, used to load custom ontology classes + ontology_classes: List of class_name strings from ontology_class table, + injected into the prompt when pruning_scene is not a built-in scene """ pruning_switch: bool = Field(False, description="Enable semantic pruning when True.") pruning_scene: str = Field( "education", - description="Scene for pruning: one of 'education', 'online_service', 'outbound'.", + description="Scene for pruning: built-in key or custom scene_name from ontology_scene.", ) pruning_threshold: float = Field( 0.5, ge=0.0, le=0.9, description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).") + scene_id: Optional[str] = Field(None, description="Ontology scene UUID (optional).") + ontology_classes: Optional[List[str]] = Field( + None, description="Class names from ontology_class table for custom scenes." + ) class TemporalSearchParams(BaseModel): diff --git a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py index 0a913633..904b238f 100644 --- a/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py +++ b/api/app/core/memory/storage_services/extraction_engine/data_preprocessing/data_pruning.py @@ -86,19 +86,26 @@ class SemanticPruner: self._detailed_prune_logging = True # 是否启用详细日志 self._max_debug_msgs_per_dialog = 20 # 每个对话最多记录前N条消息的详细日志 - # 加载场景特定配置 + # 加载场景特定配置(内置场景走专门规则,自定义场景 fallback 到通用规则) self.scene_config: ScenePatterns = SceneConfigRegistry.get_config( self.config.pruning_scene, fallback_to_generic=True ) - # 检查场景是否有专门支持 - is_supported = SceneConfigRegistry.is_scene_supported(self.config.pruning_scene) - if is_supported: - self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 使用专门配置") + # 判断是否为内置专门场景 + self._is_builtin_scene = SceneConfigRegistry.is_scene_supported(self.config.pruning_scene) + + # 自定义场景的本体类型列表(用于注入提示词) + self._ontology_classes = getattr(self.config, "ontology_classes", None) or [] + + if self._is_builtin_scene: + self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 使用内置专门配置") else: - self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 未预定义,使用通用配置(保守策略)") - self._log(f"[剪枝-初始化] 支持的场景: {SceneConfigRegistry.get_all_scenes()}") + self._log(f"[剪枝-初始化] 场景={self.config.pruning_scene} 为自定义场景,使用通用规则 + 本体类型提示词注入") + if self._ontology_classes: + self._log(f"[剪枝-初始化] 注入本体类型: {self._ontology_classes}") + else: + self._log(f"[剪枝-初始化] 未找到本体类型,将使用通用提示词") # Load Jinja2 template self.template = prompt_env.get_template("extracat_Pruning.jinja2") @@ -424,12 +431,16 @@ class SemanticPruner: self._log(f"[剪枝-缓存] LRU缓存已满,删除最旧条目") rendered = self.template.render( - pruning_scene=self.config.pruning_scene, + pruning_scene=self.config.pruning_scene, + is_builtin_scene=self._is_builtin_scene, + ontology_classes=self._ontology_classes, dialog_text=dialog_text, language=self.language ) log_template_rendering("extracat_Pruning.jinja2", { "pruning_scene": self.config.pruning_scene, + "is_builtin_scene": self._is_builtin_scene, + "ontology_classes_count": len(self._ontology_classes), "language": self.language }) log_prompt_rendering("pruning-extract", rendered) diff --git a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 index 8253924b..6b620df9 100644 --- a/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extracat_Pruning.jinja2 @@ -1,6 +1,6 @@ {# 对话级抽取与相关性判定模板(用于剪枝加速) - 输入:pruning_scene, dialog_text + 输入:pruning_scene, is_builtin_scene, ontology_classes, dialog_text, language 输出:严格 JSON(不要包含任何多余文本),字段: - is_related: bool,是否与所选场景相关 - times: [string],从对话中抽取的时间相关文本(日期、时间、时间段、有效期等) @@ -16,7 +16,8 @@ - 仅输出上述键;避免多余解释或字段。 #} -{% set scene_instructions = { +{# ── 内置场景的固定说明 ── #} +{% set builtin_scene_instructions = { 'education': { 'zh': '教育场景:教学、课程、考试、作业、老师/学生互动、学习资源、学校管理等。', 'en': 'Education Scenario: Teaching, courses, exams, homework, teacher/student interaction, learning resources, school management, etc.' @@ -31,16 +32,40 @@ } } %} -{% set scene_key = pruning_scene %} -{% if scene_key not in scene_instructions %} -{% set scene_key = 'education' %} +{# ── 确定最终使用的场景说明 ── #} +{% if is_builtin_scene %} + {# 内置专门场景:使用固定说明 #} + {% set scene_key = pruning_scene %} + {% if scene_key not in builtin_scene_instructions %}{% set scene_key = 'education' %}{% endif %} + {% set instruction = builtin_scene_instructions[scene_key][language] if language in ['zh', 'en'] else builtin_scene_instructions[scene_key]['zh'] %} + {% set custom_types_str = '' %} +{% else %} + {# 自定义场景:使用场景名称 + 本体类型列表构建说明 #} + {% if ontology_classes and ontology_classes | length > 0 %} + {% if language == 'en' %} + {% set custom_types_str = ontology_classes | join(', ') %} + {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": The dialogue is related to this scene if it involves any of the following entity types: ' ~ custom_types_str ~ '.' %} + {% else %} + {% set custom_types_str = ontology_classes | join('、') %} + {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:对话涉及以下任意实体类型时视为相关:' ~ custom_types_str ~ '。' %} + {% endif %} + {% else %} + {# 无本体类型时退化为通用说明 #} + {% if language == 'en' %} + {% set instruction = 'Custom scene "' ~ pruning_scene ~ '": Determine whether the dialogue content is relevant to this scene based on overall context.' %} + {% else %} + {% set instruction = '自定义场景「' ~ pruning_scene ~ '」:根据对话整体内容判断是否与该场景相关。' %} + {% endif %} + {% set custom_types_str = '' %} + {% endif %} {% endif %} -{% set instruction = scene_instructions[scene_key][language] if language in ['zh', 'en'] else scene_instructions[scene_key]['zh'] %} - {% if language == "zh" %} 请在下方对话全文基础上,按该场景进行一次性抽取并判定相关性: 场景说明:{{ instruction }} +{% if not is_builtin_scene and custom_types_str %} +重要提示:只要对话中出现与上述实体类型({{ custom_types_str }})相关的内容,即判定为相关(is_related=true)。 +{% endif %} 对话全文: """ @@ -60,6 +85,9 @@ {% else %} Based on the full dialogue below, perform one-time extraction and relevance determination according to this scenario: Scenario Description: {{ instruction }} +{% if not is_builtin_scene and custom_types_str %} +Important: If the dialogue contains content related to any of the entity types above ({{ custom_types_str }}), mark it as relevant (is_related=true). +{% endif %} Full Dialogue: """ diff --git a/api/app/repositories/memory_config_repository.py b/api/app/repositories/memory_config_repository.py index 2dae51ef..22f13449 100644 --- a/api/app/repositories/memory_config_repository.py +++ b/api/app/repositories/memory_config_repository.py @@ -233,6 +233,7 @@ class MemoryConfigRepository: config_desc=params.config_desc, workspace_id=params.workspace_id, scene_id=params.scene_id, + pruning_scene=params.pruning_scene, llm_id=params.llm_id, embedding_id=params.embedding_id, rerank_id=params.rerank_id, diff --git a/api/app/schemas/memory_config_schema.py b/api/app/schemas/memory_config_schema.py index 0b63844b..0c359d70 100644 --- a/api/app/schemas/memory_config_schema.py +++ b/api/app/schemas/memory_config_schema.py @@ -417,6 +417,7 @@ class MemoryConfig: # Ontology scene association scene_id: Optional[UUID] = None + ontology_classes: Optional[list] = field(default=None) def __post_init__(self): """Validate configuration after initialization.""" diff --git a/api/app/schemas/memory_storage_schema.py b/api/app/schemas/memory_storage_schema.py index 776d2783..e396bbf6 100644 --- a/api/app/schemas/memory_storage_schema.py +++ b/api/app/schemas/memory_storage_schema.py @@ -232,14 +232,15 @@ class ConfigParamsCreate(BaseModel): # 创建配置参数模型(仅 body, # 本体场景关联(可选) scene_id: Optional[uuid.UUID] = Field(None, description="本体场景ID(UUID),关联ontology_scene表") + # 语义剪枝场景(由 service 层根据 scene_id 自动推导,值为关联场景的 scene_name,前端无需传入) + pruning_scene: Optional[str] = Field(None, description="语义剪枝场景,由 scene_id 对应的 scene_name 自动填充") + # 模型配置字段(可选,用于手动指定或自动填充) llm_id: Optional[str] = Field(None, description="LLM模型配置ID") embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID") rerank_id: Optional[str] = Field(None, description="重排序模型配置ID") reflection_model_id: Optional[str] = Field(None, description="反思模型ID,默认与llm_id一致") emotion_model_id: Optional[str] = Field(None, description="情绪分析模型ID,默认与llm_id一致") - - class ConfigParamsDelete(BaseModel): # 删除配置参数模型(请求体) model_config = ConfigDict(populate_by_name=True, extra="forbid") # config_name: str = Field("配置名称", description="配置名称(字符串)") diff --git a/api/app/services/memory_config_service.py b/api/app/services/memory_config_service.py index ccfd5482..00757f8c 100644 --- a/api/app/services/memory_config_service.py +++ b/api/app/services/memory_config_service.py @@ -107,6 +107,40 @@ def _validate_config_id(config_id, db: Session = None): ) +# 专门场景的内置 key 集合,直接从 SceneConfigRegistry 派生,避免重复维护 +# 使用懒加载函数避免模块级循环导入 +def _get_builtin_pruning_scenes() -> set: + from app.core.memory.storage_services.extraction_engine.data_preprocessing.scene_config import SceneConfigRegistry + return set(SceneConfigRegistry.get_all_scenes()) + + +def _load_ontology_classes(db: Session, scene_id, pruning_scene: Optional[str]) -> Optional[list]: + """当 pruning_scene 不是内置场景时,从 ontology_class 表加载类型名称列表。 + + Args: + db: 数据库会话 + scene_id: 本体场景 UUID + pruning_scene: 语义剪枝场景名称 + + Returns: + class_name 字符串列表,或 None(内置场景 / 无数据时) + """ + if not scene_id: + return None + # 内置场景走 SceneConfigRegistry,不需要注入类型列表 + if pruning_scene in _get_builtin_pruning_scenes(): + return None + try: + from app.repositories.ontology_class_repository import OntologyClassRepository + repo = OntologyClassRepository(db) + classes = repo.get_classes_by_scene(scene_id) + names = [c.class_name for c in classes if c.class_name] + return names if names else None + except Exception as e: + logger.warning(f"Failed to load ontology classes for scene_id={scene_id}: {e}") + return None + + class MemoryConfigService: """ Centralized service for memory configuration loading and validation. @@ -359,6 +393,7 @@ class MemoryConfigService: pruning_threshold=float(memory_config.pruning_threshold) if memory_config.pruning_threshold is not None else 0.5, # Ontology scene association scene_id=memory_config.scene_id, + ontology_classes=_load_ontology_classes(self.db, memory_config.scene_id, memory_config.pruning_scene), ) elapsed_ms = (time.time() - start_time) * 1000 diff --git a/api/app/services/memory_storage_service.py b/api/app/services/memory_storage_service.py index 02fd1051..a83d6830 100644 --- a/api/app/services/memory_storage_service.py +++ b/api/app/services/memory_storage_service.py @@ -146,6 +146,10 @@ class DataConfigService: # 数据配置服务类(PostgreSQL) if not params.emotion_model_id: params.emotion_model_id = params.llm_id + # 根据关联的本体场景推导 pruning_scene(语义剪枝场景与本体工程场景保持一致) + if params.scene_id and not getattr(params, 'pruning_scene', None): + params.pruning_scene = self._resolve_pruning_scene_from_scene_id(params.scene_id) + config = MemoryConfigRepository.create(self.db, params) self.db.commit() return {"affected": 1, "config_id": config.config_id} @@ -161,6 +165,23 @@ class DataConfigService: # 数据配置服务类(PostgreSQL) finally: db_session.close() + def _resolve_pruning_scene_from_scene_id(self, scene_id) -> Optional[str]: + """根据本体场景ID获取对应的 scene_name,作为语义剪枝场景值 + + Args: + scene_id: 本体场景UUID + + Returns: + scene_name 字符串,查询失败时返回 None + """ + try: + from app.models.ontology_scene import OntologyScene + scene = self.db.query(OntologyScene).filter_by(scene_id=scene_id).first() + return scene.scene_name if scene else None + except Exception as e: + logger.warning(f"_resolve_pruning_scene_from_scene_id failed for scene_id={scene_id}: {e}", exc_info=True) + return None + # --- Delete --- def delete(self, key: ConfigParamsDelete) -> Dict[str, Any]: # 删除配置参数(按配置ID) success = MemoryConfigRepository.delete(self.db, key.config_id) @@ -196,6 +217,19 @@ class DataConfigService: # 数据配置服务类(PostgreSQL) def get_all(self, workspace_id = None) -> List[Dict[str, Any]]: # 获取所有配置参数 results = MemoryConfigRepository.get_all(self.db, workspace_id) + # 检查并修正 pruning_scene 与 scene_name 不一致的记录 + needs_commit = False + for config, scene_name in results: + if scene_name and config.pruning_scene != scene_name: + logger.info( + f"修正 pruning_scene: config_id={config.config_id} " + f"'{config.pruning_scene}' -> '{scene_name}'" + ) + config.pruning_scene = scene_name + needs_commit = True + if needs_commit: + self.db.commit() + # 将 ORM 对象转换为字典列表 data_list = [] for config, scene_name in results: diff --git a/api/app/services/workspace_service.py b/api/app/services/workspace_service.py index e93c0c5c..74880410 100644 --- a/api/app/services/workspace_service.py +++ b/api/app/services/workspace_service.py @@ -152,6 +152,7 @@ def create_workspace( # Initialize default ontology scenes for the workspace (先创建本体场景) default_scene_id = None + default_scene_name = None try: initializer = DefaultOntologyInitializer(db) success, error_msg = initializer.initialize_default_scenes( @@ -163,7 +164,7 @@ def create_workspace( f"为工作空间 {db_workspace.id} 创建默认本体场景成功 (language={language})" ) - # 获取默认场景ID,优先使用"在线教育"场景,如果不存在则使用"情感陪伴"场景 + # 获取默认场景ID,优先使用"在线教育"场景,如果不存在则使用"情感陪伴"场景 from app.repositories.ontology_scene_repository import OntologySceneRepository from app.config.default_ontology_config import ( ONLINE_EDUCATION_SCENE, @@ -179,6 +180,7 @@ def create_workspace( if education_scene: default_scene_id = education_scene.scene_id + default_scene_name = education_scene.scene_name business_logger.info( f"获取到教育场景ID用于默认记忆配置: {default_scene_id} (scene_name={education_scene_name})" ) @@ -189,6 +191,7 @@ def create_workspace( if companion_scene: default_scene_id = companion_scene.scene_id + default_scene_name = companion_scene.scene_name business_logger.info( f"教育场景不存在,使用情感陪伴场景ID用于默认记忆配置: {default_scene_id} (scene_name={companion_scene_name})" ) @@ -219,6 +222,7 @@ def create_workspace( embedding_id=embedding, rerank_id=rerank, scene_id=default_scene_id, # 传入默认场景ID(优先教育场景,其次情感陪伴场景) + pruning_scene_name=default_scene_name, # 传入场景名称作为语义剪枝场景值 ) business_logger.info( f"为工作空间 {db_workspace.id} 创建默认记忆配置成功 (scene_id={default_scene_id})" @@ -1159,6 +1163,7 @@ def _create_default_memory_config( embedding_id: Optional[uuid.UUID] = None, rerank_id: Optional[uuid.UUID] = None, scene_id: Optional[uuid.UUID] = None, + pruning_scene_name: Optional[str] = None, ) -> None: """Create a default memory config for a newly created workspace. @@ -1170,6 +1175,7 @@ def _create_default_memory_config( embedding_id: Optional embedding model ID rerank_id: Optional rerank model ID scene_id: Optional ontology scene ID (默认关联教育场景) + pruning_scene_name: Optional pruning scene name,取自 ontology_scene.scene_name """ from app.models.memory_config_model import MemoryConfig @@ -1183,7 +1189,8 @@ def _create_default_memory_config( llm_id=str(llm_id) if llm_id else None, embedding_id=str(embedding_id) if embedding_id else None, rerank_id=str(rerank_id) if rerank_id else None, - scene_id=scene_id, # 关联本体场景ID + scene_id=scene_id, # 关联本体场景ID(默认为"在线教育"场景) + pruning_scene=pruning_scene_name, # 语义剪枝场景直接使用 scene_name state=True, # Active by default is_default=True, # Mark as workspace default )