Fix/v022 bug (#263)

* [fix]Fix the issue of inconsistent language in explicit and episodic memory.

* [fix]Fix the issue of inconsistent language in explicit and episodic memory.

* [add]Add scene_id

* [fix]Fix the code based on AI code-review feedback
This commit is contained in:
乐力齐
2026-01-30 18:02:45 +08:00
committed by GitHub
parent fa009327ad
commit 2687c3b80e
11 changed files with 151 additions and 25 deletions

View File

@@ -157,6 +157,11 @@ class Settings:
if origin.strip()
]
# Language Configuration
# Supported values: "zh" (Chinese), "en" (English)
# This controls the language used for memory summary titles and other generated content
DEFAULT_LANGUAGE: str = os.getenv("DEFAULT_LANGUAGE", "zh")
# Logging settings
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
LOG_FORMAT: str = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")

View File

@@ -14,6 +14,34 @@ from pydantic import Field
logger = get_memory_logger(__name__)
# Supported language codes and the fallback used for any other value.
SUPPORTED_LANGUAGES = {"zh", "en"}
FALLBACK_LANGUAGE = "en"


def validate_language(language: Optional[str]) -> str:
    """Validate a language code, falling back to the default when invalid.

    Args:
        language: Candidate language code; may be None or arbitrary text.

    Returns:
        A valid language code ("zh" or "en"). Returns FALLBACK_LANGUAGE
        when the input is None or not a supported language.
    """
    if language is None:
        return FALLBACK_LANGUAGE
    # Normalize case and surrounding whitespace so values like "ZH " pass.
    lang = str(language).lower().strip()
    if lang in SUPPORTED_LANGUAGES:
        return lang
    # Note: a separator is added between the two f-string parts; previously
    # they concatenated directly ("…'en'支持的语言…") in the log message.
    logger.warning(
        f"无效的语言参数 '{language}',已回退到默认值 '{FALLBACK_LANGUAGE}'。"
        f"支持的语言: {SUPPORTED_LANGUAGES}"
    )
    return FALLBACK_LANGUAGE
class MemorySummaryResponse(RobustLLMResponse):
"""Structured response for summary generation per chunk.
@@ -31,7 +59,8 @@ class MemorySummaryResponse(RobustLLMResponse):
async def generate_title_and_type_for_summary(
content: str,
llm_client
llm_client,
language: str = None
) -> Tuple[str, str]:
"""
为MemorySummary生成标题和类型
@@ -41,11 +70,18 @@ async def generate_title_and_type_for_summary(
Args:
content: Summary的内容文本
llm_client: LLM客户端实例
language: 生成标题使用的语言 ("zh" 中文, "en" 英文)如果为None则从配置读取
Returns:
(标题, 类型)元组
"""
from app.core.memory.utils.prompt.prompt_utils import render_episodic_title_and_type_prompt
from app.core.config import settings
# 如果没有指定语言,从配置中读取,并校验有效性
if language is None:
language = settings.DEFAULT_LANGUAGE
language = validate_language(language)
# 定义有效的类型集合
VALID_TYPES = {
@@ -57,13 +93,19 @@ async def generate_title_and_type_for_summary(
}
DEFAULT_TYPE = "conversation" # 默认类型
# 根据语言设置默认标题
DEFAULT_TITLE = "空内容" if language == "zh" else "Empty Content"
PARSE_ERROR_TITLE = "解析失败" if language == "zh" else "Parse Failed"
ERROR_TITLE = "错误" if language == "zh" else "Error"
UNKNOWN_TITLE = "未知标题" if language == "zh" else "Unknown Title"
try:
if not content:
logger.warning("content为空无法生成标题和类型")
return ("空内容", DEFAULT_TYPE)
logger.warning(f"content为空无法生成标题和类型 (language={language})")
return (DEFAULT_TITLE, DEFAULT_TYPE)
# 1. 渲染Jinja2提示词模板
prompt = await render_episodic_title_and_type_prompt(content)
# 1. 渲染Jinja2提示词模板,传递语言参数
prompt = await render_episodic_title_and_type_prompt(content, language=language)
# 2. 调用LLM生成标题和类型
messages = [
@@ -102,7 +144,7 @@ async def generate_title_and_type_for_summary(
json_str = json_str.strip()
result_data = json.loads(json_str)
title = result_data.get("title", "未知标题")
title = result_data.get("title", UNKNOWN_TITLE)
episodic_type_raw = result_data.get("type", DEFAULT_TYPE)
# 5. 校验和归一化类型
@@ -130,16 +172,16 @@ async def generate_title_and_type_for_summary(
f"已归一化为 '{episodic_type}'"
)
logger.info(f"成功生成标题和类型: title={title}, type={episodic_type}")
logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
return (title, episodic_type)
except json.JSONDecodeError:
logger.error(f"无法解析LLM响应为JSON: {full_response}")
return ("解析失败", DEFAULT_TYPE)
logger.error(f"无法解析LLM响应为JSON (language={language}): {full_response}")
return (PARSE_ERROR_TITLE, DEFAULT_TYPE)
except Exception as e:
logger.error(f"生成标题和类型时出错: {str(e)}", exc_info=True)
return ("错误", DEFAULT_TYPE)
logger.error(f"生成标题和类型时出错 (language={language}): {str(e)}", exc_info=True)
return (ERROR_TITLE, DEFAULT_TYPE)
async def _process_chunk_summary(
dialog: DialogData,
@@ -153,11 +195,16 @@ async def _process_chunk_summary(
return None
try:
# 从配置中获取语言设置(只获取一次,复用),并校验有效性
from app.core.config import settings
language = validate_language(settings.DEFAULT_LANGUAGE)
# Render prompt via Jinja2 for a single chunk
prompt_content = await render_memory_summary_prompt(
chunk_texts=chunk.content,
json_schema=MemorySummaryResponse.model_json_schema(),
max_words=200,
language=language,
)
messages = [
@@ -178,9 +225,10 @@ async def _process_chunk_summary(
try:
title, episodic_type = await generate_title_and_type_for_summary(
content=summary_text,
llm_client=llm_client
llm_client=llm_client,
language=language
)
logger.info(f"Generated title and type for MemorySummary: title={title}, type={episodic_type}")
logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
except Exception as e:
logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
# Continue without title and type

View File

@@ -25,6 +25,15 @@ class TripletExtractor:
"""
self.llm_client = llm_client
def _get_language(self) -> str:
    """Get the configured language for entity descriptions.

    Returns:
        Language code ("zh" or "en")

    NOTE(review): unlike the summary pipeline, this returns
    settings.DEFAULT_LANGUAGE without running it through
    validate_language() — an unsupported value is passed straight to the
    template (where any non-"zh" value renders the English branch).
    Consider validating here for consistency — TODO confirm.
    """
    from app.core.config import settings
    return settings.DEFAULT_LANGUAGE
async def _extract_triplets(self, statement: Statement, chunk_content: str) -> TripletExtractionResponse:
"""Process a single statement and return extracted triplets and entities"""
# Render the prompt using helper function
@@ -40,7 +49,8 @@ class TripletExtractor:
statement=statement.statement,
chunk_content=chunk_content,
json_schema=TripletExtractionResponse.model_json_schema(),
predicate_instructions=PREDICATE_DEFINITIONS
predicate_instructions=PREDICATE_DEFINITIONS,
language=self._get_language()
)
# Create messages for LLM

View File

@@ -177,7 +177,7 @@ def render_entity_dedup_prompt(
# Args:
# entity_a: Dict of entity A attributes
async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None) -> str:
async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None, language: str = "zh") -> str:
"""
Renders the triplet extraction prompt using the extract_triplet.jinja2 template.
@@ -186,6 +186,7 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
chunk_content: The content of the chunk to process
json_schema: JSON schema for the expected output format
predicate_instructions: Optional predicate instructions
language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)
Returns:
Rendered prompt content as string
@@ -195,7 +196,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
statement=statement,
chunk_content=chunk_content,
json_schema=json_schema,
predicate_instructions=predicate_instructions
predicate_instructions=predicate_instructions,
language=language
)
# 记录渲染结果到提示日志(与示例日志结构一致)
log_prompt_rendering('triplet extraction', rendered_prompt)
@@ -204,7 +206,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
'statement': 'str',
'chunk_content': 'str',
'json_schema': 'TripletExtractionResponse.schema',
'predicate_instructions': 'PREDICATE_DEFINITIONS'
'predicate_instructions': 'PREDICATE_DEFINITIONS',
'language': language
})
return rendered_prompt
@@ -213,6 +216,7 @@ async def render_memory_summary_prompt(
chunk_texts: str,
json_schema: dict,
max_words: int = 200,
language: str = "zh",
) -> str:
"""
Renders the memory summary prompt using the memory_summary.jinja2 template.
@@ -221,6 +225,7 @@ async def render_memory_summary_prompt(
chunk_texts: Concatenated text of conversation chunks
json_schema: JSON schema for the expected output format
max_words: Maximum words for the summary
language: The language to use for summary generation ("zh" for Chinese, "en" for English)
Returns:
Rendered prompt content as string.
@@ -230,12 +235,14 @@ async def render_memory_summary_prompt(
chunk_texts=chunk_texts,
json_schema=json_schema,
max_words=max_words,
language=language,
)
log_prompt_rendering('memory summary', rendered_prompt)
log_template_rendering('memory_summary.jinja2', {
'chunk_texts_len': len(chunk_texts or ""),
'max_words': max_words,
'json_schema': 'MemorySummaryResponse.schema'
'json_schema': 'MemorySummaryResponse.schema',
'language': language
})
return rendered_prompt
@@ -388,24 +395,26 @@ async def render_memory_insight_prompt(
return rendered_prompt
async def render_episodic_title_and_type_prompt(content: str) -> str:
async def render_episodic_title_and_type_prompt(content: str, language: str = "zh") -> str:
"""
Renders the episodic title and type classification prompt using the episodic_type_classification.jinja2 template.
Args:
content: The content of the episodic memory summary to analyze
language: The language to use for title generation ("zh" for Chinese, "en" for English)
Returns:
Rendered prompt content as string
"""
template = prompt_env.get_template("episodic_type_classification.jinja2")
rendered_prompt = template.render(content=content)
rendered_prompt = template.render(content=content, language=language)
# 记录渲染结果到提示日志
log_prompt_rendering('episodic title and type classification', rendered_prompt)
# 可选:记录模板渲染信息
log_template_rendering('episodic_type_classification.jinja2', {
'content_len': len(content) if content else 0
'content_len': len(content) if content else 0,
'language': language
})
return rendered_prompt

View File

@@ -1,8 +1,19 @@
=== Task ===
Generate a concise title and classify the episodic memory into the most appropriate category.
{% if language == "zh" %}
**重要:请使用中文生成标题和分类。**
{% else %}
**Important: Please generate the title and classification in English.**
{% endif %}
=== Requirements ===
- Extract a clear, concise title (10-20 characters) that captures the core content
{% if language == "zh" %}
- 标题必须使用中文
{% else %}
- Title must be in English
{% endif %}
- Classify into exactly one category based on the primary theme
- Be specific and avoid ambiguity
- Output must be valid JSON conforming to the schema below

View File

@@ -5,6 +5,12 @@
===Task===
Extract entities and knowledge triplets from the given statement.
{% if language == "zh" %}
**重要:请使用中文生成实体描述(description)和示例(example)。**
{% else %}
**Important: Please generate entity descriptions and examples in English.**
{% endif %}
===Inputs===
**Chunk Content:** "{{ chunk_content }}"
**Statement:** "{{ statement }}"
@@ -13,6 +19,13 @@ Extract entities and knowledge triplets from the given statement.
**Entity Extraction:**
- Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification
{% if language == "zh" %}
- **实体描述(description)必须使用中文**
- **示例(example)必须使用中文**
{% else %}
- **Entity descriptions must be in English**
- **Examples must be in English**
{% endif %}
- **Semantic Memory Classification (is_explicit_memory):**
* Set to `true` if the entity represents **explicit/semantic memory**:
- **Concepts:** "Machine Learning", "Photosynthesis", "Democracy", "人工智能", "光合作用", "民主"
@@ -334,9 +347,11 @@ Output:
- Escape quotation marks in text with backslashes (\")
- Ensure proper string closure and comma separation
- No line breaks within JSON string values
- The output language should ALWAYS match the input language
- If input is in English, extract statements in English
- If input is in Chinese, extract statements in Chinese
{% if language == "zh" %}
- **语言要求:实体描述(description)和示例(example)必须使用中文**
{% else %}
- **Language Requirement: Entity descriptions and examples must be in English**
{% endif %}
- Preserve the original language and do not translate
{{ json_schema }}

View File

@@ -5,10 +5,21 @@
=== Task ===
Summarize the provided conversation chunks into a concise Memory summary.
{% if language == "zh" %}
**重要:请使用中文生成摘要内容。**
{% else %}
**Important: Please generate the summary content in English.**
{% endif %}
=== Requirements ===
- Focus on factual statements, user preferences, relationships, and salient temporal context.
- Avoid repetition and filler; be specific.
- Keep it under {{ max_words or 200 }} words.
{% if language == "zh" %}
- 摘要内容必须使用中文
{% else %}
- Summary content must be in English
{% endif %}
- Output must be valid JSON conforming to the schema below.
=== Input ===
@@ -24,6 +35,11 @@ Summarize the provided conversation chunks into a concise Memory summary.
4. Do not include line breaks within JSON string values
5. Example of proper escaping: "statement": "张曼婷说:\"我很喜欢这本书。\""
The output language should always be the same as the input language.
{% if language == "zh" %}
**语言要求:输出内容必须使用中文。**
{% else %}
**Language Requirement: The output content must be in English.**
{% endif %}
Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below:
{{ json_schema }}

View File

@@ -20,6 +20,9 @@ class MemoryConfig(Base):
end_user_id = Column(String, nullable=True, comment="组ID")
user_id = Column(String, nullable=True, comment="用户ID")
apply_id = Column(String, nullable=True, comment="应用ID")
# 本体场景关联
scene_id = Column(UUID(as_uuid=True), nullable=True, comment="本体场景ID关联ontology_scene表")
# 模型选择从workspace继承
llm_id = Column(String, nullable=True, comment="LLM模型配置ID")

View File

@@ -229,6 +229,7 @@ class MemoryConfigRepository:
config_name=params.config_name,
config_desc=params.config_desc,
workspace_id=params.workspace_id,
scene_id=params.scene_id,
llm_id=params.llm_id,
embedding_id=params.embedding_id,
rerank_id=params.rerank_id,

View File

@@ -229,6 +229,9 @@ class ConfigParamsCreate(BaseModel): # 创建配置参数模型(仅 body
config_desc: str = Field("配置描述", description="配置描述(字符串)")
workspace_id: Optional[uuid.UUID] = Field(None, description="工作空间IDUUID")
# 本体场景关联(可选)
scene_id: Optional[uuid.UUID] = Field(None, description="本体场景IDUUID关联ontology_scene表")
# 模型配置字段(可选,用于手动指定或自动填充)
llm_id: Optional[str] = Field(None, description="LLM模型配置ID")
embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID")

View File

@@ -1,4 +1,9 @@
# Language Configuration
# Supported values: "zh" (Chinese), "en" (English)
# This controls the language used for memory summary titles and other generated content
DEFAULT_LANGUAGE=zh
# Neo4j Configuration (记忆系统数据库)
NEO4J_URI=
NEO4J_USERNAME=