Fix/v022 bug (#263)

* [fix] Fix inconsistent language between explicit and episodic memory. * [fix] Fix inconsistent language between explicit and episodic memory. * [add] Add scene_id. * [fix] Fix the code based on the AI review.
2026-01-30 18:02:45 +08:00
parent fa009327ad
commit 2687c3b80e
11 changed files with 151 additions and 25 deletions
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -157,6 +157,11 @@ class Settings:
        if origin.strip()
    ]

+    # Language Configuration
+    # Supported values: "zh" (Chinese), "en" (English)
+    # This controls the language used for memory summary titles and other generated content
+    DEFAULT_LANGUAGE: str = os.getenv("DEFAULT_LANGUAGE", "zh")
+
    # Logging settings
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FORMAT: str = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/memory_summary.py
@@ -14,6 +14,34 @@ from pydantic import Field

 logger = get_memory_logger(__name__)

+# Supported language codes and the fallback used when a value is missing or invalid
+SUPPORTED_LANGUAGES = {"zh", "en"}
+FALLBACK_LANGUAGE = "en"
+
+
+def validate_language(language: Optional[str]) -> str:
+    """
+    Validate a language argument and coerce it to a supported language code.
+    
+    Args:
+        language: Language code to validate; None yields FALLBACK_LANGUAGE
+        
+    Returns:
+        The lower-cased, stripped code if it is in SUPPORTED_LANGUAGES, otherwise FALLBACK_LANGUAGE
+    """
+    if language is None:
+        return FALLBACK_LANGUAGE
+    
+    lang = str(language).lower().strip()  # normalize case and surrounding whitespace before the membership test
+    if lang in SUPPORTED_LANGUAGES:
+        return lang
+    
+    logger.warning(  # the original (un-normalized) value is logged so the bad input is visible
+        f"无效的语言参数 '{language}'，已回退到默认值 '{FALLBACK_LANGUAGE}'。"
+        f"支持的语言: {SUPPORTED_LANGUAGES}"
+    )
+    return FALLBACK_LANGUAGE  # NOTE(review): fallback is "en" while settings.DEFAULT_LANGUAGE defaults to "zh" - confirm intended
+

 class MemorySummaryResponse(RobustLLMResponse):
    """Structured response for summary generation per chunk.
@@ -31,7 +59,8 @@ class MemorySummaryResponse(RobustLLMResponse):

 async def generate_title_and_type_for_summary(
    content: str,
-    llm_client
+    llm_client,
+    language: str = None
 ) -> Tuple[str, str]:
    """
    为MemorySummary生成标题和类型
@@ -41,11 +70,18 @@ async def generate_title_and_type_for_summary(
    Args:
        content: Summary的内容文本
        llm_client: LLM客户端实例
+        language: 生成标题使用的语言 ("zh" 中文, "en" 英文)，如果为None则从配置读取
        
    Returns:
        (标题, 类型)元组
    """
    from app.core.memory.utils.prompt.prompt_utils import render_episodic_title_and_type_prompt
+    from app.core.config import settings
+    
+    # 如果没有指定语言，从配置中读取，并校验有效性
+    if language is None:
+        language = settings.DEFAULT_LANGUAGE
+    language = validate_language(language)
    
    # 定义有效的类型集合
    VALID_TYPES = {
@@ -57,13 +93,19 @@ async def generate_title_and_type_for_summary(
    }
    DEFAULT_TYPE = "conversation"  # 默认类型
    
+    # 根据语言设置默认标题
+    DEFAULT_TITLE = "空内容" if language == "zh" else "Empty Content"
+    PARSE_ERROR_TITLE = "解析失败" if language == "zh" else "Parse Failed"
+    ERROR_TITLE = "错误" if language == "zh" else "Error"
+    UNKNOWN_TITLE = "未知标题" if language == "zh" else "Unknown Title"
+    
    try:
        if not content:
-            logger.warning("content为空，无法生成标题和类型")
-            return ("空内容", DEFAULT_TYPE)
+            logger.warning(f"content为空，无法生成标题和类型 (language={language})")
+            return (DEFAULT_TITLE, DEFAULT_TYPE)
        
-        # 1. 渲染Jinja2提示词模板
-        prompt = await render_episodic_title_and_type_prompt(content)
+        # 1. 渲染Jinja2提示词模板，传递语言参数
+        prompt = await render_episodic_title_and_type_prompt(content, language=language)
        
        # 2. 调用LLM生成标题和类型
        messages = [
@@ -102,7 +144,7 @@ async def generate_title_and_type_for_summary(
            json_str = json_str.strip()
            
            result_data = json.loads(json_str)
-            title = result_data.get("title", "未知标题")
+            title = result_data.get("title", UNKNOWN_TITLE)
            episodic_type_raw = result_data.get("type", DEFAULT_TYPE)
            
            # 5. 校验和归一化类型
@@ -130,16 +172,16 @@ async def generate_title_and_type_for_summary(
                    f"已归一化为 '{episodic_type}'"
                )
            
-            logger.info(f"成功生成标题和类型: title={title}, type={episodic_type}")
+            logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
            return (title, episodic_type)
            
        except json.JSONDecodeError:
-            logger.error(f"无法解析LLM响应为JSON: {full_response}")
-            return ("解析失败", DEFAULT_TYPE)
+            logger.error(f"无法解析LLM响应为JSON (language={language}): {full_response}")
+            return (PARSE_ERROR_TITLE, DEFAULT_TYPE)
        
    except Exception as e:
-        logger.error(f"生成标题和类型时出错: {str(e)}", exc_info=True)
-        return ("错误", DEFAULT_TYPE)
+        logger.error(f"生成标题和类型时出错 (language={language}): {str(e)}", exc_info=True)
+        return (ERROR_TITLE, DEFAULT_TYPE)

 async def _process_chunk_summary(
    dialog: DialogData,
@@ -153,11 +195,16 @@ async def _process_chunk_summary(
        return None

    try:
+        # Read the language setting from config once (reused below) and validate it
+        from app.core.config import settings
+        language = validate_language(settings.DEFAULT_LANGUAGE)
+        
        # Render prompt via Jinja2 for a single chunk
        prompt_content = await render_memory_summary_prompt(
            chunk_texts=chunk.content,
            json_schema=MemorySummaryResponse.model_json_schema(),
            max_words=200,
+            language=language,
        )

        messages = [
@@ -178,9 +225,10 @@ async def _process_chunk_summary(
        try:
            title, episodic_type = await generate_title_and_type_for_summary(
                content=summary_text,
-                llm_client=llm_client
+                llm_client=llm_client,
+                language=language
            )
-            logger.info(f"Generated title and type for MemorySummary: title={title}, type={episodic_type}")
+            logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
        except Exception as e:
            logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
            # Continue without title and type
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/triplet_extraction.py
@@ -25,6 +25,15 @@ class TripletExtractor:
        """
        self.llm_client = llm_client

+    def _get_language(self) -> str:
+        """Return the configured language code, normalized to "zh" or "en".
+
+        Anything else becomes "en", the fallback validate_language() in memory_summary.py uses.
+        """
+        from app.core.config import settings
+        lang = str(settings.DEFAULT_LANGUAGE or "").strip().lower()
+        return lang if lang in ("zh", "en") else "en"
+
    async def _extract_triplets(self, statement: Statement, chunk_content: str) -> TripletExtractionResponse:
        """Process a single statement and return extracted triplets and entities"""
        # Render the prompt using helper function
@@ -40,7 +49,8 @@ class TripletExtractor:
            statement=statement.statement,
            chunk_content=chunk_content,
            json_schema=TripletExtractionResponse.model_json_schema(),
-            predicate_instructions=PREDICATE_DEFINITIONS
+            predicate_instructions=PREDICATE_DEFINITIONS,
+            language=self._get_language()
        )

        # Create messages for LLM
--- a/api/app/core/memory/utils/prompt/prompt_utils.py
+++ b/api/app/core/memory/utils/prompt/prompt_utils.py
@@ -177,7 +177,7 @@ def render_entity_dedup_prompt(

 #     Args:
 #         entity_a: Dict of entity A attributes
-async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None) -> str:
+async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None, language: str = "zh") -> str:
    """
    Renders the triplet extraction prompt using the extract_triplet.jinja2 template.

@@ -186,6 +186,7 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
        chunk_content: The content of the chunk to process
        json_schema: JSON schema for the expected output format
        predicate_instructions: Optional predicate instructions
+        language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)

    Returns:
        Rendered prompt content as string
@@ -195,7 +196,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
        statement=statement,
        chunk_content=chunk_content,
        json_schema=json_schema,
-        predicate_instructions=predicate_instructions
+        predicate_instructions=predicate_instructions,
+        language=language
    )
    # 记录渲染结果到提示日志（与示例日志结构一致）
    log_prompt_rendering('triplet extraction', rendered_prompt)
@@ -204,7 +206,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
        'statement': 'str',
        'chunk_content': 'str',
        'json_schema': 'TripletExtractionResponse.schema',
-        'predicate_instructions': 'PREDICATE_DEFINITIONS'
+        'predicate_instructions': 'PREDICATE_DEFINITIONS',
+        'language': language
    })

    return rendered_prompt
@@ -213,6 +216,7 @@ async def render_memory_summary_prompt(
    chunk_texts: str,
    json_schema: dict,
    max_words: int = 200,
+    language: str = "zh",
 ) -> str:
    """
    Renders the memory summary prompt using the memory_summary.jinja2 template.
@@ -221,6 +225,7 @@ async def render_memory_summary_prompt(
        chunk_texts: Concatenated text of conversation chunks
        json_schema: JSON schema for the expected output format
        max_words: Maximum words for the summary
+        language: The language to use for summary generation ("zh" for Chinese, "en" for English)

    Returns:
        Rendered prompt content as string.
@@ -230,12 +235,14 @@ async def render_memory_summary_prompt(
        chunk_texts=chunk_texts,
        json_schema=json_schema,
        max_words=max_words,
+        language=language,
    )
    log_prompt_rendering('memory summary', rendered_prompt)
    log_template_rendering('memory_summary.jinja2', {
        'chunk_texts_len': len(chunk_texts or ""),
        'max_words': max_words,
-        'json_schema': 'MemorySummaryResponse.schema'
+        'json_schema': 'MemorySummaryResponse.schema',
+        'language': language
    })
    return rendered_prompt

@@ -388,24 +395,26 @@ async def render_memory_insight_prompt(
    return rendered_prompt


-async def render_episodic_title_and_type_prompt(content: str) -> str:
+async def render_episodic_title_and_type_prompt(content: str, language: str = "zh") -> str:
    """
    Renders the episodic title and type classification prompt using the episodic_type_classification.jinja2 template.

    Args:
        content: The content of the episodic memory summary to analyze
+        language: The language to use for title generation ("zh" for Chinese, "en" for English)

    Returns:
        Rendered prompt content as string
    """
    template = prompt_env.get_template("episodic_type_classification.jinja2")
-    rendered_prompt = template.render(content=content)
+    rendered_prompt = template.render(content=content, language=language)
    
    # 记录渲染结果到提示日志
    log_prompt_rendering('episodic title and type classification', rendered_prompt)
    # 可选：记录模板渲染信息
    log_template_rendering('episodic_type_classification.jinja2', {
-        'content_len': len(content) if content else 0
+        'content_len': len(content) if content else 0,
+        'language': language
    })
    
    return rendered_prompt
--- a/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/episodic_type_classification.jinja2
@@ -1,8 +1,19 @@
 === Task ===
 Generate a concise title and classify the episodic memory into the most appropriate category.

+{% if language == "zh" %}
+**重要：请使用中文生成标题和分类。**
+{% else %}
+**Important: Please generate the title and classification in English.**
+{% endif %}
+
 === Requirements ===
 - Extract a clear, concise title (10-20 characters) that captures the core content
+{% if language == "zh" %}
+- 标题必须使用中文
+{% else %}
+- Title must be in English
+{% endif %}
 - Classify into exactly one category based on the primary theme
 - Be specific and avoid ambiguity
 - Output must be valid JSON conforming to the schema below
--- a/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/extract_triplet.jinja2
@@ -5,6 +5,12 @@
 ===Task===
 Extract entities and knowledge triplets from the given statement.

+{% if language == "zh" %}
+**重要：请使用中文生成实体描述（description）和示例（example）。**
+{% else %}
+**Important: Please generate entity descriptions and examples in English.**
+{% endif %}
+
 ===Inputs===
 **Chunk Content:** "{{ chunk_content }}"
 **Statement:** "{{ statement }}"
@@ -13,6 +19,13 @@ Extract entities and knowledge triplets from the given statement.

 **Entity Extraction:**
 - Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification
+{% if language == "zh" %}
+- **实体描述（description）必须使用中文**
+- **示例（example）必须使用中文**
+{% else %}
+- **Entity descriptions must be in English**
+- **Examples must be in English**
+{% endif %}
 - **Semantic Memory Classification (is_explicit_memory):**
  * Set to `true` if the entity represents **explicit/semantic memory**:
    - **Concepts:** "Machine Learning", "Photosynthesis", "Democracy", "人工智能", "光合作用", "民主"
@@ -334,9 +347,11 @@ Output:
 - Escape quotation marks in text with backslashes (\")
 - Ensure proper string closure and comma separation
 - No line breaks within JSON string values
- The output language should ALWAYS match the input language
- If input is in English, extract statements in English
- If input is in Chinese, extract statements in Chinese
+{% if language == "zh" %}
+- **语言要求：实体描述（description）和示例（example）必须使用中文**
+{% else %}
+- **Language Requirement: Entity descriptions and examples must be in English**
+{% endif %}
 - Preserve the original language and do not translate

 {{ json_schema }}
--- a/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2
+++ b/api/app/core/memory/utils/prompt/prompts/memory_summary.jinja2
@@ -5,10 +5,21 @@
 === Task ===
 Summarize the provided conversation chunks into a concise Memory summary.

+{% if language == "zh" %}
+**重要：请使用中文生成摘要内容。**
+{% else %}
+**Important: Please generate the summary content in English.**
+{% endif %}
+
 === Requirements ===
 - Focus on factual statements, user preferences, relationships, and salient temporal context.
 - Avoid repetition and filler; be specific.
 - Keep it under {{ max_words or 200 }} words.
+{% if language == "zh" %}
+- 摘要内容必须使用中文
+{% else %}
+- Summary content must be in English
+{% endif %}
 - Output must be valid JSON conforming to the schema below.

 === Input ===
@@ -24,6 +35,11 @@ Summarize the provided conversation chunks into a concise Memory summary.
 4. Do not include line breaks within JSON string values
 5. Example of proper escaping: "statement": "张曼婷说：\"我很喜欢这本书。\""

-The output language should always be the same as the input language.
+{% if language == "zh" %}
+**语言要求：输出内容必须使用中文。**
+{% else %}
+**Language Requirement: The output content must be in English.**
+{% endif %}
+
 Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below:
 {{ json_schema }}
--- a/api/app/models/memory_config_model.py
+++ b/api/app/models/memory_config_model.py
@@ -20,6 +20,9 @@ class MemoryConfig(Base):
    end_user_id = Column(String, nullable=True, comment="组ID")
    user_id = Column(String, nullable=True, comment="用户ID")
    apply_id = Column(String, nullable=True, comment="应用ID")
+    
+    # 本体场景关联
+    scene_id = Column(UUID(as_uuid=True), nullable=True, comment="本体场景ID，关联ontology_scene表")

    # 模型选择（从workspace继承）
    llm_id = Column(String, nullable=True, comment="LLM模型配置ID")
--- a/api/app/repositories/memory_config_repository.py
+++ b/api/app/repositories/memory_config_repository.py
@@ -229,6 +229,7 @@ class MemoryConfigRepository:
                config_name=params.config_name,
                config_desc=params.config_desc,
                workspace_id=params.workspace_id,
+                scene_id=params.scene_id,
                llm_id=params.llm_id,
                embedding_id=params.embedding_id,
                rerank_id=params.rerank_id,
--- a/api/app/schemas/memory_storage_schema.py
+++ b/api/app/schemas/memory_storage_schema.py
@@ -229,6 +229,9 @@ class ConfigParamsCreate(BaseModel):  # 创建配置参数模型（仅 body，
    config_desc: str = Field("配置描述", description="配置描述（字符串）")
    workspace_id: Optional[uuid.UUID] = Field(None, description="工作空间ID（UUID）")
    
+    # 本体场景关联（可选）
+    scene_id: Optional[uuid.UUID] = Field(None, description="本体场景ID（UUID），关联ontology_scene表")
+    
    # 模型配置字段（可选，用于手动指定或自动填充）
    llm_id: Optional[str] = Field(None, description="LLM模型配置ID")
    embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID")
--- a/api/env.example
+++ b/api/env.example
@@ -1,4 +1,9 @@

+# Language Configuration
+# Supported values: "zh" (Chinese), "en" (English)
+# This controls the language used for memory summary titles and other generated content
+DEFAULT_LANGUAGE=zh
+
 # Neo4j Configuration (记忆系统数据库)
 NEO4J_URI= 
 NEO4J_USERNAME=