From 2687c3b80e6fd0b27e5d112c0bd03b3e9fb0d3f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B9=90=E5=8A=9B=E9=BD=90?= <162269739+lanceyq@users.noreply.github.com> Date: Fri, 30 Jan 2026 18:02:45 +0800 Subject: [PATCH] Fix/v022 bug (#263) * [fix]Fix the issue of inconsistent language in explicit and episodic memory. * [fix]Fix the issue of inconsistent language in explicit and episodic memory. * [add]Add scene_id * [fix]Based on the AI review to fix the code --- api/app/core/config.py | 5 ++ .../knowledge_extraction/memory_summary.py | 74 +++++++++++++++---- .../triplet_extraction.py | 12 ++- .../core/memory/utils/prompt/prompt_utils.py | 23 ++++-- .../episodic_type_classification.jinja2 | 11 +++ .../prompt/prompts/extract_triplet.jinja2 | 21 +++++- .../prompt/prompts/memory_summary.jinja2 | 18 ++++- api/app/models/memory_config_model.py | 3 + .../repositories/memory_config_repository.py | 1 + api/app/schemas/memory_storage_schema.py | 3 + api/env.example | 5 ++ 11 files changed, 151 insertions(+), 25 deletions(-) diff --git a/api/app/core/config.py b/api/app/core/config.py index a8981054..0de957c7 100644 --- a/api/app/core/config.py +++ b/api/app/core/config.py @@ -157,6 +157,11 @@ class Settings: if origin.strip() ] + # Language Configuration + # Supported values: "zh" (Chinese), "en" (English) + # This controls the language used for memory summary titles and other generated content + DEFAULT_LANGUAGE: str = os.getenv("DEFAULT_LANGUAGE", "zh") + # Logging settings LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO") LOG_FORMAT: str = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s") diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py index f39313a8..58633363 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py @@ -14,6 +14,34 @@ from pydantic import Field logger = get_memory_logger(__name__) +# 支持的语言列表和默认回退值 +SUPPORTED_LANGUAGES = {"zh", "en"} +FALLBACK_LANGUAGE = "en" + + +def validate_language(language: Optional[str]) -> str: + """ + 校验语言参数,确保其为有效值。 + + Args: + language: 待校验的语言代码 + + Returns: + 有效的语言代码("zh" 或 "en") + """ + if language is None: + return FALLBACK_LANGUAGE + + lang = str(language).lower().strip() + if lang in SUPPORTED_LANGUAGES: + return lang + + logger.warning( + f"无效的语言参数 '{language}',已回退到默认值 '{FALLBACK_LANGUAGE}'。" + f"支持的语言: {SUPPORTED_LANGUAGES}" + ) + return FALLBACK_LANGUAGE + class MemorySummaryResponse(RobustLLMResponse): """Structured response for summary generation per chunk. @@ -31,7 +59,8 @@ class MemorySummaryResponse(RobustLLMResponse): async def generate_title_and_type_for_summary( content: str, - llm_client + llm_client, + language: str = None ) -> Tuple[str, str]: """ 为MemorySummary生成标题和类型 @@ -41,11 +70,18 @@ async def generate_title_and_type_for_summary( Args: content: Summary的内容文本 llm_client: LLM客户端实例 + language: 生成标题使用的语言 ("zh" 中文, "en" 英文),如果为None则从配置读取 Returns: (标题, 类型)元组 """ from app.core.memory.utils.prompt.prompt_utils import render_episodic_title_and_type_prompt + from app.core.config import settings + + # 如果没有指定语言,从配置中读取,并校验有效性 + if language is None: + language = settings.DEFAULT_LANGUAGE + language = validate_language(language) # 定义有效的类型集合 VALID_TYPES = { @@ -57,13 +93,19 @@ async def generate_title_and_type_for_summary( } DEFAULT_TYPE = "conversation" # 默认类型 + # 根据语言设置默认标题 + DEFAULT_TITLE = "空内容" if language == "zh" else "Empty Content" + PARSE_ERROR_TITLE = "解析失败" if language == "zh" else "Parse Failed" + ERROR_TITLE = "错误" if language == "zh" else "Error" + UNKNOWN_TITLE = "未知标题" if language == "zh" else "Unknown Title" + try: if not content: - logger.warning("content为空,无法生成标题和类型") - return ("空内容", DEFAULT_TYPE) + logger.warning(f"content为空,无法生成标题和类型 (language={language})") + return (DEFAULT_TITLE, DEFAULT_TYPE) - # 1. 渲染Jinja2提示词模板 - prompt = await render_episodic_title_and_type_prompt(content) + # 1. 渲染Jinja2提示词模板,传递语言参数 + prompt = await render_episodic_title_and_type_prompt(content, language=language) # 2. 调用LLM生成标题和类型 messages = [ @@ -102,7 +144,7 @@ async def generate_title_and_type_for_summary( json_str = json_str.strip() result_data = json.loads(json_str) - title = result_data.get("title", "未知标题") + title = result_data.get("title", UNKNOWN_TITLE) episodic_type_raw = result_data.get("type", DEFAULT_TYPE) # 5. 校验和归一化类型 @@ -130,16 +172,16 @@ async def generate_title_and_type_for_summary( f"已归一化为 '{episodic_type}'" ) - logger.info(f"成功生成标题和类型: title={title}, type={episodic_type}") + logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}") return (title, episodic_type) except json.JSONDecodeError: - logger.error(f"无法解析LLM响应为JSON: {full_response}") - return ("解析失败", DEFAULT_TYPE) + logger.error(f"无法解析LLM响应为JSON (language={language}): {full_response}") + return (PARSE_ERROR_TITLE, DEFAULT_TYPE) except Exception as e: - logger.error(f"生成标题和类型时出错: {str(e)}", exc_info=True) - return ("错误", DEFAULT_TYPE) + logger.error(f"生成标题和类型时出错 (language={language}): {str(e)}", exc_info=True) + return (ERROR_TITLE, DEFAULT_TYPE) async def _process_chunk_summary( dialog: DialogData, @@ -153,11 +195,16 @@ async def _process_chunk_summary( return None try: + # 从配置中获取语言设置(只获取一次,复用),并校验有效性 + from app.core.config import settings + language = validate_language(settings.DEFAULT_LANGUAGE) + # Render prompt via Jinja2 for a single chunk prompt_content = await render_memory_summary_prompt( chunk_texts=chunk.content, json_schema=MemorySummaryResponse.model_json_schema(), max_words=200, + language=language, ) messages = [ @@ -178,9 +225,10 @@ async def _process_chunk_summary( try: title, episodic_type = await generate_title_and_type_for_summary( content=summary_text, - llm_client=llm_client + llm_client=llm_client, + language=language ) - logger.info(f"Generated title and type for MemorySummary: title={title}, type={episodic_type}") + logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}") except Exception as e: logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}") # Continue without title and type diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py index bfc0bc88..8c3e31b4 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py @@ -25,6 +25,15 @@ class TripletExtractor: """ self.llm_client = llm_client + def _get_language(self) -> str: + """Get the configured language for entity descriptions + + Returns: + Language code ("zh" or "en") + """ + from app.core.config import settings + return settings.DEFAULT_LANGUAGE + async def _extract_triplets(self, statement: Statement, chunk_content: str) -> TripletExtractionResponse: """Process a single statement and return extracted triplets and entities""" # Render the prompt using helper function @@ -40,7 +49,8 @@ class TripletExtractor: statement=statement.statement, chunk_content=chunk_content, json_schema=TripletExtractionResponse.model_json_schema(), - predicate_instructions=PREDICATE_DEFINITIONS + predicate_instructions=PREDICATE_DEFINITIONS, + language=self._get_language() ) # Create messages for LLM diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py index d8bf02c7..a4d2af95 100644 --- a/api/app/core/memory/utils/prompt/prompt_utils.py +++ b/api/app/core/memory/utils/prompt/prompt_utils.py @@ -177,7 +177,7 @@ def render_entity_dedup_prompt( # Args: # entity_a: Dict of entity A attributes -async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None) -> str: +async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None, language: str = "zh") -> str: """ Renders the triplet extraction prompt using the extract_triplet.jinja2 template. @@ -186,6 +186,7 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j chunk_content: The content of the chunk to process json_schema: JSON schema for the expected output format predicate_instructions: Optional predicate instructions + language: The language to use for entity descriptions ("zh" for Chinese, "en" for English) Returns: Rendered prompt content as string @@ -195,7 +196,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j statement=statement, chunk_content=chunk_content, json_schema=json_schema, - predicate_instructions=predicate_instructions + predicate_instructions=predicate_instructions, + language=language ) # 记录渲染结果到提示日志(与示例日志结构一致) log_prompt_rendering('triplet extraction', rendered_prompt) @@ -204,7 +206,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j 'statement': 'str', 'chunk_content': 'str', 'json_schema': 'TripletExtractionResponse.schema', - 'predicate_instructions': 'PREDICATE_DEFINITIONS' + 'predicate_instructions': 'PREDICATE_DEFINITIONS', + 'language': language }) return rendered_prompt @@ -213,6 +216,7 @@ async def render_memory_summary_prompt( chunk_texts: str, json_schema: dict, max_words: int = 200, + language: str = "zh", ) -> str: """ Renders the memory summary prompt using the memory_summary.jinja2 template. @@ -221,6 +225,7 @@ async def render_memory_summary_prompt( chunk_texts: Concatenated text of conversation chunks json_schema: JSON schema for the expected output format max_words: Maximum words for the summary + language: The language to use for summary generation ("zh" for Chinese, "en" for English) Returns: Rendered prompt content as string. @@ -230,12 +235,14 @@ async def render_memory_summary_prompt( chunk_texts=chunk_texts, json_schema=json_schema, max_words=max_words, + language=language, ) log_prompt_rendering('memory summary', rendered_prompt) log_template_rendering('memory_summary.jinja2', { 'chunk_texts_len': len(chunk_texts or ""), 'max_words': max_words, - 'json_schema': 'MemorySummaryResponse.schema' + 'json_schema': 'MemorySummaryResponse.schema', + 'language': language }) return rendered_prompt @@ -388,24 +395,26 @@ async def render_memory_insight_prompt( return rendered_prompt -async def render_episodic_title_and_type_prompt(content: str) -> str: +async def render_episodic_title_and_type_prompt(content: str, language: str = "zh") -> str: """ Renders the episodic title and type classification prompt using the episodic_type_classification.jinja2 template. Args: content: The content of the episodic memory summary to analyze + language: The language to use for title generation ("zh" for Chinese, "en" for English) Returns: Rendered prompt content as string """ template = prompt_env.get_template("episodic_type_classification.jinja2") - rendered_prompt = template.render(content=content) + rendered_prompt = template.render(content=content, language=language) # 记录渲染结果到提示日志 log_prompt_rendering('episodic title and type classification', rendered_prompt) # 可选:记录模板渲染信息 log_template_rendering('episodic_type_classification.jinja2', { - 'content_len': len(content) if content else 0 + 'content_len': len(content) if content else 0, + 'language': language }) return rendered_prompt diff --git a/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2 b/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2 index fa382ec7..d778890b 100644 --- a/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2 @@ -1,8 +1,19 @@ === Task === Generate a concise title and classify the episodic memory into the most appropriate category. +{% if language == "zh" %} +**重要:请使用中文生成标题和分类。** +{% else %} +**Important: Please generate the title and classification in English.** +{% endif %} + === Requirements === - Extract a clear, concise title (10-20 characters) that captures the core content +{% if language == "zh" %} +- 标题必须使用中文 +{% else %} +- Title must be in English +{% endif %} - Classify into exactly one category based on the primary theme - Be specific and avoid ambiguity - Output must be valid JSON conforming to the schema below diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 index 03691a04..67df162a 100644 --- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 @@ -5,6 +5,12 @@ ===Task=== Extract entities and knowledge triplets from the given statement. +{% if language == "zh" %} +**重要:请使用中文生成实体描述(description)和示例(example)。** +{% else %} +**Important: Please generate entity descriptions and examples in English.** +{% endif %} + ===Inputs=== **Chunk Content:** "{{ chunk_content }}" **Statement:** "{{ statement }}" @@ -13,6 +19,13 @@ Extract entities and knowledge triplets from the given statement. **Entity Extraction:** - Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification +{% if language == "zh" %} +- **实体描述(description)必须使用中文** +- **示例(example)必须使用中文** +{% else %} +- **Entity descriptions must be in English** +- **Examples must be in English** +{% endif %} - **Semantic Memory Classification (is_explicit_memory):** * Set to `true` if the entity represents **explicit/semantic memory**: - **Concepts:** "Machine Learning", "Photosynthesis", "Democracy", "人工智能", "光合作用", "民主" @@ -334,9 +347,11 @@ Output: - Escape quotation marks in text with backslashes (\") - Ensure proper string closure and comma separation - No line breaks within JSON string values -- The output language should ALWAYS match the input language -- If input is in English, extract statements in English -- If input is in Chinese, extract statements in Chinese +{% if language == "zh" %} +- **语言要求:实体描述(description)和示例(example)必须使用中文** +{% else %} +- **Language Requirement: Entity descriptions and examples must be in English** +{% endif %} - Preserve the original language and do not translate {{ json_schema }} \ No newline at end of file diff --git a/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2 b/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2 index 1dd86ca3..82f91cc4 100644 --- a/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2 @@ -5,10 +5,21 @@ === Task === Summarize the provided conversation chunks into a concise Memory summary. +{% if language == "zh" %} +**重要:请使用中文生成摘要内容。** +{% else %} +**Important: Please generate the summary content in English.** +{% endif %} + === Requirements === - Focus on factual statements, user preferences, relationships, and salient temporal context. - Avoid repetition and filler; be specific. - Keep it under {{ max_words or 200 }} words. +{% if language == "zh" %} +- 摘要内容必须使用中文 +{% else %} +- Summary content must be in English +{% endif %} - Output must be valid JSON conforming to the schema below. === Input === @@ -24,6 +35,11 @@ Summarize the provided conversation chunks into a concise Memory summary. 4. Do not include line breaks within JSON string values 5. Example of proper escaping: "statement": "张曼婷说:\"我很喜欢这本书。\"" -The output language should always be the same as the input language. +{% if language == "zh" %} +**语言要求:输出内容必须使用中文。** +{% else %} +**Language Requirement: The output content must be in English.** +{% endif %} + Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below: {{ json_schema }} \ No newline at end of file diff --git a/api/app/models/memory_config_model.py b/api/app/models/memory_config_model.py index 454b1b48..8a451f2d 100644 --- a/api/app/models/memory_config_model.py +++ b/api/app/models/memory_config_model.py @@ -20,6 +20,9 @@ class MemoryConfig(Base): end_user_id = Column(String, nullable=True, comment="组ID") user_id = Column(String, nullable=True, comment="用户ID") apply_id = Column(String, nullable=True, comment="应用ID") + + # 本体场景关联 + scene_id = Column(UUID(as_uuid=True), nullable=True, comment="本体场景ID,关联ontology_scene表") # 模型选择(从workspace继承) llm_id = Column(String, nullable=True, comment="LLM模型配置ID") diff --git a/api/app/repositories/memory_config_repository.py b/api/app/repositories/memory_config_repository.py index fbc04f2e..c00943f7 100644 --- a/api/app/repositories/memory_config_repository.py +++ b/api/app/repositories/memory_config_repository.py @@ -229,6 +229,7 @@ class MemoryConfigRepository: config_name=params.config_name, config_desc=params.config_desc, workspace_id=params.workspace_id, + scene_id=params.scene_id, llm_id=params.llm_id, embedding_id=params.embedding_id, rerank_id=params.rerank_id, diff --git a/api/app/schemas/memory_storage_schema.py b/api/app/schemas/memory_storage_schema.py index 5fda0a1d..5e22d70f 100644 --- a/api/app/schemas/memory_storage_schema.py +++ b/api/app/schemas/memory_storage_schema.py @@ -229,6 +229,9 @@ class ConfigParamsCreate(BaseModel): # 创建配置参数模型(仅 body, config_desc: str = Field("配置描述", description="配置描述(字符串)") workspace_id: Optional[uuid.UUID] = Field(None, description="工作空间ID(UUID)") + # 本体场景关联(可选) + scene_id: Optional[uuid.UUID] = Field(None, description="本体场景ID(UUID),关联ontology_scene表") + # 模型配置字段(可选,用于手动指定或自动填充) llm_id: Optional[str] = Field(None, description="LLM模型配置ID") embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID") diff --git a/api/env.example b/api/env.example index 274049b9..98c96edc 100644 --- a/api/env.example +++ b/api/env.example @@ -1,4 +1,9 @@ +# Language Configuration +# Supported values: "zh" (Chinese), "en" (English) +# This controls the language used for memory summary titles and other generated content +DEFAULT_LANGUAGE=zh + # Neo4j Configuration (记忆系统数据库) NEO4J_URI= NEO4J_USERNAME=