From 2687c3b80e6fd0b27e5d112c0bd03b3e9fb0d3f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B9=90=E5=8A=9B=E9=BD=90?=
 <162269739+lanceyq@users.noreply.github.com>
Date: Fri, 30 Jan 2026 18:02:45 +0800
Subject: [PATCH] Fix/v022 bug (#263)

* [fix]Fix the issue of inconsistent language in explicit and episodic memory.

* [fix]Fix the issue of inconsistent language in explicit and episodic memory.

* [add]Add scene_id

* [fix] Fix the code based on the AI review
---
 api/app/core/config.py                        |  5 ++
 .../knowledge_extraction/memory_summary.py    | 74 +++++++++++++++----
 .../triplet_extraction.py                     | 12 ++-
 .../core/memory/utils/prompt/prompt_utils.py  | 23 ++++--
 .../episodic_type_classification.jinja2       | 11 +++
 .../prompt/prompts/extract_triplet.jinja2     | 21 +++++-
 .../prompt/prompts/memory_summary.jinja2      | 18 ++++-
 api/app/models/memory_config_model.py         |  3 +
 .../repositories/memory_config_repository.py  |  1 +
 api/app/schemas/memory_storage_schema.py      |  3 +
 api/env.example                               |  5 ++
 11 files changed, 151 insertions(+), 25 deletions(-)

diff --git a/api/app/core/config.py b/api/app/core/config.py
index a8981054..0de957c7 100644
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -157,6 +157,11 @@ class Settings:
         if origin.strip()
     ]
 
+    # Language Configuration
+    # Supported values: "zh" (Chinese), "en" (English)
+    # This controls the language used for memory summary titles and other generated content
+    DEFAULT_LANGUAGE: str = os.getenv("DEFAULT_LANGUAGE", "zh")
+
     # Logging settings
     LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
     LOG_FORMAT: str = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py
index f39313a8..58633363 100644
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py
@@ -14,6 +14,34 @@ from pydantic import Field
 
 logger = get_memory_logger(__name__)
 
+# Supported language codes, and the fallback returned when validation fails
+SUPPORTED_LANGUAGES = {"zh", "en"}
+FALLBACK_LANGUAGE = "en"
+
+
+def validate_language(language: Optional[str]) -> str:
+    """
+    Validate a language code and coerce it to a supported value.
+    
+    Args:
+        language: Language code to validate; may be None.
+        
+    Returns:
+        A member of SUPPORTED_LANGUAGES; FALLBACK_LANGUAGE for None or unsupported input.
+    """
+    if language is None:
+        return FALLBACK_LANGUAGE
+    
+    lang = str(language).lower().strip()  # match case- and whitespace-insensitively
+    if lang in SUPPORTED_LANGUAGES:
+        return lang
+    
+    logger.warning(
+        f"无效的语言参数 '{language}'，已回退到默认值 '{FALLBACK_LANGUAGE}'。"
+        f"支持的语言: {SUPPORTED_LANGUAGES}"
+    )
+    return FALLBACK_LANGUAGE
+
 
 class MemorySummaryResponse(RobustLLMResponse):
     """Structured response for summary generation per chunk.
@@ -31,7 +59,8 @@ class MemorySummaryResponse(RobustLLMResponse):
 
 async def generate_title_and_type_for_summary(
     content: str,
-    llm_client
+    llm_client,
+    language: str = None
 ) -> Tuple[str, str]:
     """
     为MemorySummary生成标题和类型
@@ -41,11 +70,18 @@ async def generate_title_and_type_for_summary(
     Args:
         content: Summary的内容文本
         llm_client: LLM客户端实例
+        language: 生成标题使用的语言 ("zh" 中文, "en" 英文)，如果为None则从配置读取
         
     Returns:
         (标题, 类型)元组
     """
     from app.core.memory.utils.prompt.prompt_utils import render_episodic_title_and_type_prompt
+    from app.core.config import settings
+    
+    # 如果没有指定语言，从配置中读取，并校验有效性
+    if language is None:
+        language = settings.DEFAULT_LANGUAGE
+    language = validate_language(language)
     
     # 定义有效的类型集合
     VALID_TYPES = {
@@ -57,13 +93,19 @@ async def generate_title_and_type_for_summary(
     }
     DEFAULT_TYPE = "conversation"  # 默认类型
     
+    # 根据语言设置默认标题
+    DEFAULT_TITLE = "空内容" if language == "zh" else "Empty Content"
+    PARSE_ERROR_TITLE = "解析失败" if language == "zh" else "Parse Failed"
+    ERROR_TITLE = "错误" if language == "zh" else "Error"
+    UNKNOWN_TITLE = "未知标题" if language == "zh" else "Unknown Title"
+    
     try:
         if not content:
-            logger.warning("content为空，无法生成标题和类型")
-            return ("空内容", DEFAULT_TYPE)
+            logger.warning(f"content为空，无法生成标题和类型 (language={language})")
+            return (DEFAULT_TITLE, DEFAULT_TYPE)
         
-        # 1. 渲染Jinja2提示词模板
-        prompt = await render_episodic_title_and_type_prompt(content)
+        # 1. 渲染Jinja2提示词模板，传递语言参数
+        prompt = await render_episodic_title_and_type_prompt(content, language=language)
         
         # 2. 调用LLM生成标题和类型
         messages = [
@@ -102,7 +144,7 @@ async def generate_title_and_type_for_summary(
             json_str = json_str.strip()
             
             result_data = json.loads(json_str)
-            title = result_data.get("title", "未知标题")
+            title = result_data.get("title", UNKNOWN_TITLE)
             episodic_type_raw = result_data.get("type", DEFAULT_TYPE)
             
             # 5. 校验和归一化类型
@@ -130,16 +172,16 @@ async def generate_title_and_type_for_summary(
                     f"已归一化为 '{episodic_type}'"
                 )
             
-            logger.info(f"成功生成标题和类型: title={title}, type={episodic_type}")
+            logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
             return (title, episodic_type)
             
         except json.JSONDecodeError:
-            logger.error(f"无法解析LLM响应为JSON: {full_response}")
-            return ("解析失败", DEFAULT_TYPE)
+            logger.error(f"无法解析LLM响应为JSON (language={language}): {full_response}")
+            return (PARSE_ERROR_TITLE, DEFAULT_TYPE)
         
     except Exception as e:
-        logger.error(f"生成标题和类型时出错: {str(e)}", exc_info=True)
-        return ("错误", DEFAULT_TYPE)
+        logger.error(f"生成标题和类型时出错 (language={language}): {str(e)}", exc_info=True)
+        return (ERROR_TITLE, DEFAULT_TYPE)
 
 async def _process_chunk_summary(
     dialog: DialogData,
@@ -153,11 +195,16 @@ async def _process_chunk_summary(
         return None
 
     try:
+        # 从配置中获取语言设置（只获取一次，复用），并校验有效性
+        from app.core.config import settings
+        language = validate_language(settings.DEFAULT_LANGUAGE)
+        
         # Render prompt via Jinja2 for a single chunk
         prompt_content = await render_memory_summary_prompt(
             chunk_texts=chunk.content,
             json_schema=MemorySummaryResponse.model_json_schema(),
             max_words=200,
+            language=language,
         )
 
         messages = [
@@ -178,9 +225,10 @@ async def _process_chunk_summary(
         try:
             title, episodic_type = await generate_title_and_type_for_summary(
                 content=summary_text,
-                llm_client=llm_client
+                llm_client=llm_client,
+                language=language
             )
-            logger.info(f"Generated title and type for MemorySummary: title={title}, type={episodic_type}")
+            logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
         except Exception as e:
             logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
             # Continue without title and type
diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py
index bfc0bc88..8c3e31b4 100644
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py
@@ -25,6 +25,15 @@ class TripletExtractor:
         """
         self.llm_client = llm_client
 
+    def _get_language(self) -> str:
+        """Return the configured language code, normalized to "zh" or "en".
+
+        The setting is trimmed and lowercased; any unsupported value falls back to "en".
+        """
+        from app.core.config import settings
+        lang = str(settings.DEFAULT_LANGUAGE or "").strip().lower()
+        return lang if lang in ("zh", "en") else "en"
+
     async def _extract_triplets(self, statement: Statement, chunk_content: str) -> TripletExtractionResponse:
         """Process a single statement and return extracted triplets and entities"""
         # Render the prompt using helper function
@@ -40,7 +49,8 @@ class TripletExtractor:
             statement=statement.statement,
             chunk_content=chunk_content,
             json_schema=TripletExtractionResponse.model_json_schema(),
-            predicate_instructions=PREDICATE_DEFINITIONS
+            predicate_instructions=PREDICATE_DEFINITIONS,
+            language=self._get_language()
         )
 
         # Create messages for LLM
diff --git a/api/app/core/memory/utils/prompt/prompt_utils.py b/api/app/core/memory/utils/prompt/prompt_utils.py
index d8bf02c7..a4d2af95 100644
--- a/api/app/core/memory/utils/prompt/prompt_utils.py
+++ b/api/app/core/memory/utils/prompt/prompt_utils.py
@@ -177,7 +177,7 @@ def render_entity_dedup_prompt(
 
 #     Args:
 #         entity_a: Dict of entity A attributes
-async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None) -> str:
+async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None, language: str = "zh") -> str:
     """
     Renders the triplet extraction prompt using the extract_triplet.jinja2 template.
 
@@ -186,6 +186,7 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
         chunk_content: The content of the chunk to process
         json_schema: JSON schema for the expected output format
         predicate_instructions: Optional predicate instructions
+        language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)
 
     Returns:
         Rendered prompt content as string
@@ -195,7 +196,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
         statement=statement,
         chunk_content=chunk_content,
         json_schema=json_schema,
-        predicate_instructions=predicate_instructions
+        predicate_instructions=predicate_instructions,
+        language=language
     )
     # 记录渲染结果到提示日志（与示例日志结构一致）
     log_prompt_rendering('triplet extraction', rendered_prompt)
@@ -204,7 +206,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
         'statement': 'str',
         'chunk_content': 'str',
         'json_schema': 'TripletExtractionResponse.schema',
-        'predicate_instructions': 'PREDICATE_DEFINITIONS'
+        'predicate_instructions': 'PREDICATE_DEFINITIONS',
+        'language': language
     })
 
     return rendered_prompt
@@ -213,6 +216,7 @@ async def render_memory_summary_prompt(
     chunk_texts: str,
     json_schema: dict,
     max_words: int = 200,
+    language: str = "zh",
 ) -> str:
     """
     Renders the memory summary prompt using the memory_summary.jinja2 template.
@@ -221,6 +225,7 @@ async def render_memory_summary_prompt(
         chunk_texts: Concatenated text of conversation chunks
         json_schema: JSON schema for the expected output format
         max_words: Maximum words for the summary
+        language: The language to use for summary generation ("zh" for Chinese, "en" for English)
 
     Returns:
         Rendered prompt content as string.
@@ -230,12 +235,14 @@ async def render_memory_summary_prompt(
         chunk_texts=chunk_texts,
         json_schema=json_schema,
         max_words=max_words,
+        language=language,
     )
     log_prompt_rendering('memory summary', rendered_prompt)
     log_template_rendering('memory_summary.jinja2', {
         'chunk_texts_len': len(chunk_texts or ""),
         'max_words': max_words,
-        'json_schema': 'MemorySummaryResponse.schema'
+        'json_schema': 'MemorySummaryResponse.schema',
+        'language': language
     })
     return rendered_prompt
 
@@ -388,24 +395,26 @@ async def render_memory_insight_prompt(
     return rendered_prompt
 
 
-async def render_episodic_title_and_type_prompt(content: str) -> str:
+async def render_episodic_title_and_type_prompt(content: str, language: str = "zh") -> str:
     """
     Renders the episodic title and type classification prompt using the episodic_type_classification.jinja2 template.
 
     Args:
         content: The content of the episodic memory summary to analyze
+        language: The language to use for title generation ("zh" for Chinese, "en" for English)
 
     Returns:
         Rendered prompt content as string
     """
     template = prompt_env.get_template("episodic_type_classification.jinja2")
-    rendered_prompt = template.render(content=content)
+    rendered_prompt = template.render(content=content, language=language)
     
     # 记录渲染结果到提示日志
     log_prompt_rendering('episodic title and type classification', rendered_prompt)
     # 可选：记录模板渲染信息
     log_template_rendering('episodic_type_classification.jinja2', {
-        'content_len': len(content) if content else 0
+        'content_len': len(content) if content else 0,
+        'language': language
     })
     
     return rendered_prompt
diff --git a/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2 b/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2
index fa382ec7..d778890b 100644
--- a/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2
@@ -1,8 +1,19 @@
 === Task ===
 Generate a concise title and classify the episodic memory into the most appropriate category.
 
+{% if language == "zh" %}
+**重要：请使用中文生成标题和分类。**
+{% else %}
+**Important: Please generate the title and classification in English.**
+{% endif %}
+
 === Requirements ===
 - Extract a clear, concise title (10-20 characters) that captures the core content
+{% if language == "zh" %}
+- 标题必须使用中文
+{% else %}
+- Title must be in English
+{% endif %}
 - Classify into exactly one category based on the primary theme
 - Be specific and avoid ambiguity
 - Output must be valid JSON conforming to the schema below
diff --git a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2 b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2
index 03691a04..67df162a 100644
--- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2
@@ -5,6 +5,12 @@
 ===Task===
 Extract entities and knowledge triplets from the given statement.
 
+{% if language == "zh" %}
+**重要：请使用中文生成实体描述（description）和示例（example）。**
+{% else %}
+**Important: Please generate entity descriptions and examples in English.**
+{% endif %}
+
 ===Inputs===
 **Chunk Content:** "{{ chunk_content }}"
 **Statement:** "{{ statement }}"
@@ -13,6 +19,13 @@ Extract entities and knowledge triplets from the given statement.
 
 **Entity Extraction:**
 - Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification
+{% if language == "zh" %}
+- **实体描述（description）必须使用中文**
+- **示例（example）必须使用中文**
+{% else %}
+- **Entity descriptions must be in English**
+- **Examples must be in English**
+{% endif %}
 - **Semantic Memory Classification (is_explicit_memory):**
   * Set to `true` if the entity represents **explicit/semantic memory**:
     - **Concepts:** "Machine Learning", "Photosynthesis", "Democracy", "人工智能", "光合作用", "民主"
@@ -334,9 +347,11 @@ Output:
 - Escape quotation marks in text with backslashes (\")
 - Ensure proper string closure and comma separation
 - No line breaks within JSON string values
-- The output language should ALWAYS match the input language
-- If input is in English, extract statements in English
-- If input is in Chinese, extract statements in Chinese
+{% if language == "zh" %}
+- **语言要求：实体描述（description）和示例（example）必须使用中文**
+{% else %}
+- **Language Requirement: Entity descriptions and examples must be in English**
+{% endif %}
 - Preserve the original language and do not translate
 
 {{ json_schema }}
\ No newline at end of file
diff --git a/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2 b/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2
index 1dd86ca3..82f91cc4 100644
--- a/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2
@@ -5,10 +5,21 @@
 === Task ===
 Summarize the provided conversation chunks into a concise Memory summary.
 
+{% if language == "zh" %}
+**重要：请使用中文生成摘要内容。**
+{% else %}
+**Important: Please generate the summary content in English.**
+{% endif %}
+
 === Requirements ===
 - Focus on factual statements, user preferences, relationships, and salient temporal context.
 - Avoid repetition and filler; be specific.
 - Keep it under {{ max_words or 200 }} words.
+{% if language == "zh" %}
+- 摘要内容必须使用中文
+{% else %}
+- Summary content must be in English
+{% endif %}
 - Output must be valid JSON conforming to the schema below.
 
 === Input ===
@@ -24,6 +35,11 @@ Summarize the provided conversation chunks into a concise Memory summary.
 4. Do not include line breaks within JSON string values
 5. Example of proper escaping: "statement": "张曼婷说：\"我很喜欢这本书。\""
 
-The output language should always be the same as the input language.
+{% if language == "zh" %}
+**语言要求：输出内容必须使用中文。**
+{% else %}
+**Language Requirement: The output content must be in English.**
+{% endif %}
+
 Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below:
 {{ json_schema }}
\ No newline at end of file
diff --git a/api/app/models/memory_config_model.py b/api/app/models/memory_config_model.py
index 454b1b48..8a451f2d 100644
--- a/api/app/models/memory_config_model.py
+++ b/api/app/models/memory_config_model.py
@@ -20,6 +20,9 @@ class MemoryConfig(Base):
     end_user_id = Column(String, nullable=True, comment="组ID")
     user_id = Column(String, nullable=True, comment="用户ID")
     apply_id = Column(String, nullable=True, comment="应用ID")
+    
+    # 本体场景关联
+    scene_id = Column(UUID(as_uuid=True), nullable=True, comment="本体场景ID，关联ontology_scene表")
 
     # 模型选择（从workspace继承）
     llm_id = Column(String, nullable=True, comment="LLM模型配置ID")
diff --git a/api/app/repositories/memory_config_repository.py b/api/app/repositories/memory_config_repository.py
index fbc04f2e..c00943f7 100644
--- a/api/app/repositories/memory_config_repository.py
+++ b/api/app/repositories/memory_config_repository.py
@@ -229,6 +229,7 @@ class MemoryConfigRepository:
                 config_name=params.config_name,
                 config_desc=params.config_desc,
                 workspace_id=params.workspace_id,
+                scene_id=params.scene_id,
                 llm_id=params.llm_id,
                 embedding_id=params.embedding_id,
                 rerank_id=params.rerank_id,
diff --git a/api/app/schemas/memory_storage_schema.py b/api/app/schemas/memory_storage_schema.py
index 5fda0a1d..5e22d70f 100644
--- a/api/app/schemas/memory_storage_schema.py
+++ b/api/app/schemas/memory_storage_schema.py
@@ -229,6 +229,9 @@ class ConfigParamsCreate(BaseModel):  # 创建配置参数模型（仅 body，
     config_desc: str = Field("配置描述", description="配置描述（字符串）")
     workspace_id: Optional[uuid.UUID] = Field(None, description="工作空间ID（UUID）")
     
+    # 本体场景关联（可选）
+    scene_id: Optional[uuid.UUID] = Field(None, description="本体场景ID（UUID），关联ontology_scene表")
+    
     # 模型配置字段（可选，用于手动指定或自动填充）
     llm_id: Optional[str] = Field(None, description="LLM模型配置ID")
     embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID")
diff --git a/api/env.example b/api/env.example
index 274049b9..98c96edc 100644
--- a/api/env.example
+++ b/api/env.example
@@ -1,4 +1,9 @@
 
+# Language Configuration
+# Supported values: "zh" (Chinese), "en" (English)
+# This controls the language used for memory summary titles and other generated content
+DEFAULT_LANGUAGE=zh
+
 # Neo4j Configuration (记忆系统数据库)
 NEO4J_URI= 
 NEO4J_USERNAME=