Fix/v022 bug (#263)
* [fix]Fix the issue of inconsistent language in explicit and episodic memory. * [fix]Fix the issue of inconsistent language in explicit and episodic memory. * [add]Add scene_id * [fix]Based on the AI review to fix the code
This commit is contained in:
@@ -157,6 +157,11 @@ class Settings:
|
||||
if origin.strip()
|
||||
]
|
||||
|
||||
# Language Configuration
|
||||
# Supported values: "zh" (Chinese), "en" (English)
|
||||
# This controls the language used for memory summary titles and other generated content
|
||||
DEFAULT_LANGUAGE: str = os.getenv("DEFAULT_LANGUAGE", "zh")
|
||||
|
||||
# Logging settings
|
||||
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
|
||||
LOG_FORMAT: str = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
||||
|
||||
@@ -14,6 +14,34 @@ from pydantic import Field
|
||||
|
||||
logger = get_memory_logger(__name__)
|
||||
|
||||
# 支持的语言列表和默认回退值
|
||||
SUPPORTED_LANGUAGES = {"zh", "en"}
|
||||
FALLBACK_LANGUAGE = "en"
|
||||
|
||||
|
||||
def validate_language(language: Optional[str]) -> str:
|
||||
"""
|
||||
校验语言参数,确保其为有效值。
|
||||
|
||||
Args:
|
||||
language: 待校验的语言代码
|
||||
|
||||
Returns:
|
||||
有效的语言代码("zh" 或 "en")
|
||||
"""
|
||||
if language is None:
|
||||
return FALLBACK_LANGUAGE
|
||||
|
||||
lang = str(language).lower().strip()
|
||||
if lang in SUPPORTED_LANGUAGES:
|
||||
return lang
|
||||
|
||||
logger.warning(
|
||||
f"无效的语言参数 '{language}',已回退到默认值 '{FALLBACK_LANGUAGE}'。"
|
||||
f"支持的语言: {SUPPORTED_LANGUAGES}"
|
||||
)
|
||||
return FALLBACK_LANGUAGE
|
||||
|
||||
|
||||
class MemorySummaryResponse(RobustLLMResponse):
|
||||
"""Structured response for summary generation per chunk.
|
||||
@@ -31,7 +59,8 @@ class MemorySummaryResponse(RobustLLMResponse):
|
||||
|
||||
async def generate_title_and_type_for_summary(
|
||||
content: str,
|
||||
llm_client
|
||||
llm_client,
|
||||
language: str = None
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
为MemorySummary生成标题和类型
|
||||
@@ -41,11 +70,18 @@ async def generate_title_and_type_for_summary(
|
||||
Args:
|
||||
content: Summary的内容文本
|
||||
llm_client: LLM客户端实例
|
||||
language: 生成标题使用的语言 ("zh" 中文, "en" 英文),如果为None则从配置读取
|
||||
|
||||
Returns:
|
||||
(标题, 类型)元组
|
||||
"""
|
||||
from app.core.memory.utils.prompt.prompt_utils import render_episodic_title_and_type_prompt
|
||||
from app.core.config import settings
|
||||
|
||||
# 如果没有指定语言,从配置中读取,并校验有效性
|
||||
if language is None:
|
||||
language = settings.DEFAULT_LANGUAGE
|
||||
language = validate_language(language)
|
||||
|
||||
# 定义有效的类型集合
|
||||
VALID_TYPES = {
|
||||
@@ -57,13 +93,19 @@ async def generate_title_and_type_for_summary(
|
||||
}
|
||||
DEFAULT_TYPE = "conversation" # 默认类型
|
||||
|
||||
# 根据语言设置默认标题
|
||||
DEFAULT_TITLE = "空内容" if language == "zh" else "Empty Content"
|
||||
PARSE_ERROR_TITLE = "解析失败" if language == "zh" else "Parse Failed"
|
||||
ERROR_TITLE = "错误" if language == "zh" else "Error"
|
||||
UNKNOWN_TITLE = "未知标题" if language == "zh" else "Unknown Title"
|
||||
|
||||
try:
|
||||
if not content:
|
||||
logger.warning("content为空,无法生成标题和类型")
|
||||
return ("空内容", DEFAULT_TYPE)
|
||||
logger.warning(f"content为空,无法生成标题和类型 (language={language})")
|
||||
return (DEFAULT_TITLE, DEFAULT_TYPE)
|
||||
|
||||
# 1. 渲染Jinja2提示词模板
|
||||
prompt = await render_episodic_title_and_type_prompt(content)
|
||||
# 1. 渲染Jinja2提示词模板,传递语言参数
|
||||
prompt = await render_episodic_title_and_type_prompt(content, language=language)
|
||||
|
||||
# 2. 调用LLM生成标题和类型
|
||||
messages = [
|
||||
@@ -102,7 +144,7 @@ async def generate_title_and_type_for_summary(
|
||||
json_str = json_str.strip()
|
||||
|
||||
result_data = json.loads(json_str)
|
||||
title = result_data.get("title", "未知标题")
|
||||
title = result_data.get("title", UNKNOWN_TITLE)
|
||||
episodic_type_raw = result_data.get("type", DEFAULT_TYPE)
|
||||
|
||||
# 5. 校验和归一化类型
|
||||
@@ -130,16 +172,16 @@ async def generate_title_and_type_for_summary(
|
||||
f"已归一化为 '{episodic_type}'"
|
||||
)
|
||||
|
||||
logger.info(f"成功生成标题和类型: title={title}, type={episodic_type}")
|
||||
logger.info(f"成功生成标题和类型 (language={language}): title={title}, type={episodic_type}")
|
||||
return (title, episodic_type)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"无法解析LLM响应为JSON: {full_response}")
|
||||
return ("解析失败", DEFAULT_TYPE)
|
||||
logger.error(f"无法解析LLM响应为JSON (language={language}): {full_response}")
|
||||
return (PARSE_ERROR_TITLE, DEFAULT_TYPE)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"生成标题和类型时出错: {str(e)}", exc_info=True)
|
||||
return ("错误", DEFAULT_TYPE)
|
||||
logger.error(f"生成标题和类型时出错 (language={language}): {str(e)}", exc_info=True)
|
||||
return (ERROR_TITLE, DEFAULT_TYPE)
|
||||
|
||||
async def _process_chunk_summary(
|
||||
dialog: DialogData,
|
||||
@@ -153,11 +195,16 @@ async def _process_chunk_summary(
|
||||
return None
|
||||
|
||||
try:
|
||||
# 从配置中获取语言设置(只获取一次,复用),并校验有效性
|
||||
from app.core.config import settings
|
||||
language = validate_language(settings.DEFAULT_LANGUAGE)
|
||||
|
||||
# Render prompt via Jinja2 for a single chunk
|
||||
prompt_content = await render_memory_summary_prompt(
|
||||
chunk_texts=chunk.content,
|
||||
json_schema=MemorySummaryResponse.model_json_schema(),
|
||||
max_words=200,
|
||||
language=language,
|
||||
)
|
||||
|
||||
messages = [
|
||||
@@ -178,9 +225,10 @@ async def _process_chunk_summary(
|
||||
try:
|
||||
title, episodic_type = await generate_title_and_type_for_summary(
|
||||
content=summary_text,
|
||||
llm_client=llm_client
|
||||
llm_client=llm_client,
|
||||
language=language
|
||||
)
|
||||
logger.info(f"Generated title and type for MemorySummary: title={title}, type={episodic_type}")
|
||||
logger.info(f"Generated title and type for MemorySummary (language={language}): title={title}, type={episodic_type}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to generate title and type for chunk {chunk.id}: {e}")
|
||||
# Continue without title and type
|
||||
|
||||
@@ -25,6 +25,15 @@ class TripletExtractor:
|
||||
"""
|
||||
self.llm_client = llm_client
|
||||
|
||||
def _get_language(self) -> str:
|
||||
"""Get the configured language for entity descriptions
|
||||
|
||||
Returns:
|
||||
Language code ("zh" or "en")
|
||||
"""
|
||||
from app.core.config import settings
|
||||
return settings.DEFAULT_LANGUAGE
|
||||
|
||||
async def _extract_triplets(self, statement: Statement, chunk_content: str) -> TripletExtractionResponse:
|
||||
"""Process a single statement and return extracted triplets and entities"""
|
||||
# Render the prompt using helper function
|
||||
@@ -40,7 +49,8 @@ class TripletExtractor:
|
||||
statement=statement.statement,
|
||||
chunk_content=chunk_content,
|
||||
json_schema=TripletExtractionResponse.model_json_schema(),
|
||||
predicate_instructions=PREDICATE_DEFINITIONS
|
||||
predicate_instructions=PREDICATE_DEFINITIONS,
|
||||
language=self._get_language()
|
||||
)
|
||||
|
||||
# Create messages for LLM
|
||||
|
||||
@@ -177,7 +177,7 @@ def render_entity_dedup_prompt(
|
||||
|
||||
# Args:
|
||||
# entity_a: Dict of entity A attributes
|
||||
async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None) -> str:
|
||||
async def render_triplet_extraction_prompt(statement: str, chunk_content: str, json_schema: dict, predicate_instructions: dict = None, language: str = "zh") -> str:
|
||||
"""
|
||||
Renders the triplet extraction prompt using the extract_triplet.jinja2 template.
|
||||
|
||||
@@ -186,6 +186,7 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
|
||||
chunk_content: The content of the chunk to process
|
||||
json_schema: JSON schema for the expected output format
|
||||
predicate_instructions: Optional predicate instructions
|
||||
language: The language to use for entity descriptions ("zh" for Chinese, "en" for English)
|
||||
|
||||
Returns:
|
||||
Rendered prompt content as string
|
||||
@@ -195,7 +196,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
|
||||
statement=statement,
|
||||
chunk_content=chunk_content,
|
||||
json_schema=json_schema,
|
||||
predicate_instructions=predicate_instructions
|
||||
predicate_instructions=predicate_instructions,
|
||||
language=language
|
||||
)
|
||||
# 记录渲染结果到提示日志(与示例日志结构一致)
|
||||
log_prompt_rendering('triplet extraction', rendered_prompt)
|
||||
@@ -204,7 +206,8 @@ async def render_triplet_extraction_prompt(statement: str, chunk_content: str, j
|
||||
'statement': 'str',
|
||||
'chunk_content': 'str',
|
||||
'json_schema': 'TripletExtractionResponse.schema',
|
||||
'predicate_instructions': 'PREDICATE_DEFINITIONS'
|
||||
'predicate_instructions': 'PREDICATE_DEFINITIONS',
|
||||
'language': language
|
||||
})
|
||||
|
||||
return rendered_prompt
|
||||
@@ -213,6 +216,7 @@ async def render_memory_summary_prompt(
|
||||
chunk_texts: str,
|
||||
json_schema: dict,
|
||||
max_words: int = 200,
|
||||
language: str = "zh",
|
||||
) -> str:
|
||||
"""
|
||||
Renders the memory summary prompt using the memory_summary.jinja2 template.
|
||||
@@ -221,6 +225,7 @@ async def render_memory_summary_prompt(
|
||||
chunk_texts: Concatenated text of conversation chunks
|
||||
json_schema: JSON schema for the expected output format
|
||||
max_words: Maximum words for the summary
|
||||
language: The language to use for summary generation ("zh" for Chinese, "en" for English)
|
||||
|
||||
Returns:
|
||||
Rendered prompt content as string.
|
||||
@@ -230,12 +235,14 @@ async def render_memory_summary_prompt(
|
||||
chunk_texts=chunk_texts,
|
||||
json_schema=json_schema,
|
||||
max_words=max_words,
|
||||
language=language,
|
||||
)
|
||||
log_prompt_rendering('memory summary', rendered_prompt)
|
||||
log_template_rendering('memory_summary.jinja2', {
|
||||
'chunk_texts_len': len(chunk_texts or ""),
|
||||
'max_words': max_words,
|
||||
'json_schema': 'MemorySummaryResponse.schema'
|
||||
'json_schema': 'MemorySummaryResponse.schema',
|
||||
'language': language
|
||||
})
|
||||
return rendered_prompt
|
||||
|
||||
@@ -388,24 +395,26 @@ async def render_memory_insight_prompt(
|
||||
return rendered_prompt
|
||||
|
||||
|
||||
async def render_episodic_title_and_type_prompt(content: str) -> str:
|
||||
async def render_episodic_title_and_type_prompt(content: str, language: str = "zh") -> str:
|
||||
"""
|
||||
Renders the episodic title and type classification prompt using the episodic_type_classification.jinja2 template.
|
||||
|
||||
Args:
|
||||
content: The content of the episodic memory summary to analyze
|
||||
language: The language to use for title generation ("zh" for Chinese, "en" for English)
|
||||
|
||||
Returns:
|
||||
Rendered prompt content as string
|
||||
"""
|
||||
template = prompt_env.get_template("episodic_type_classification.jinja2")
|
||||
rendered_prompt = template.render(content=content)
|
||||
rendered_prompt = template.render(content=content, language=language)
|
||||
|
||||
# 记录渲染结果到提示日志
|
||||
log_prompt_rendering('episodic title and type classification', rendered_prompt)
|
||||
# 可选:记录模板渲染信息
|
||||
log_template_rendering('episodic_type_classification.jinja2', {
|
||||
'content_len': len(content) if content else 0
|
||||
'content_len': len(content) if content else 0,
|
||||
'language': language
|
||||
})
|
||||
|
||||
return rendered_prompt
|
||||
|
||||
@@ -1,8 +1,19 @@
|
||||
=== Task ===
|
||||
Generate a concise title and classify the episodic memory into the most appropriate category.
|
||||
|
||||
{% if language == "zh" %}
|
||||
**重要:请使用中文生成标题和分类。**
|
||||
{% else %}
|
||||
**Important: Please generate the title and classification in English.**
|
||||
{% endif %}
|
||||
|
||||
=== Requirements ===
|
||||
- Extract a clear, concise title (10-20 characters) that captures the core content
|
||||
{% if language == "zh" %}
|
||||
- 标题必须使用中文
|
||||
{% else %}
|
||||
- Title must be in English
|
||||
{% endif %}
|
||||
- Classify into exactly one category based on the primary theme
|
||||
- Be specific and avoid ambiguity
|
||||
- Output must be valid JSON conforming to the schema below
|
||||
|
||||
@@ -5,6 +5,12 @@
|
||||
===Task===
|
||||
Extract entities and knowledge triplets from the given statement.
|
||||
|
||||
{% if language == "zh" %}
|
||||
**重要:请使用中文生成实体描述(description)和示例(example)。**
|
||||
{% else %}
|
||||
**Important: Please generate entity descriptions and examples in English.**
|
||||
{% endif %}
|
||||
|
||||
===Inputs===
|
||||
**Chunk Content:** "{{ chunk_content }}"
|
||||
**Statement:** "{{ statement }}"
|
||||
@@ -13,6 +19,13 @@ Extract entities and knowledge triplets from the given statement.
|
||||
|
||||
**Entity Extraction:**
|
||||
- Extract entities with their types, context-independent descriptions, **concise examples**, aliases, and semantic memory classification
|
||||
{% if language == "zh" %}
|
||||
- **实体描述(description)必须使用中文**
|
||||
- **示例(example)必须使用中文**
|
||||
{% else %}
|
||||
- **Entity descriptions must be in English**
|
||||
- **Examples must be in English**
|
||||
{% endif %}
|
||||
- **Semantic Memory Classification (is_explicit_memory):**
|
||||
* Set to `true` if the entity represents **explicit/semantic memory**:
|
||||
- **Concepts:** "Machine Learning", "Photosynthesis", "Democracy", "人工智能", "光合作用", "民主"
|
||||
@@ -334,9 +347,11 @@ Output:
|
||||
- Escape quotation marks in text with backslashes (\")
|
||||
- Ensure proper string closure and comma separation
|
||||
- No line breaks within JSON string values
|
||||
- The output language should ALWAYS match the input language
|
||||
- If input is in English, extract statements in English
|
||||
- If input is in Chinese, extract statements in Chinese
|
||||
{% if language == "zh" %}
|
||||
- **语言要求:实体描述(description)和示例(example)必须使用中文**
|
||||
{% else %}
|
||||
- **Language Requirement: Entity descriptions and examples must be in English**
|
||||
{% endif %}
|
||||
- Preserve the original language and do not translate
|
||||
|
||||
{{ json_schema }}
|
||||
@@ -5,10 +5,21 @@
|
||||
=== Task ===
|
||||
Summarize the provided conversation chunks into a concise Memory summary.
|
||||
|
||||
{% if language == "zh" %}
|
||||
**重要:请使用中文生成摘要内容。**
|
||||
{% else %}
|
||||
**Important: Please generate the summary content in English.**
|
||||
{% endif %}
|
||||
|
||||
=== Requirements ===
|
||||
- Focus on factual statements, user preferences, relationships, and salient temporal context.
|
||||
- Avoid repetition and filler; be specific.
|
||||
- Keep it under {{ max_words or 200 }} words.
|
||||
{% if language == "zh" %}
|
||||
- 摘要内容必须使用中文
|
||||
{% else %}
|
||||
- Summary content must be in English
|
||||
{% endif %}
|
||||
- Output must be valid JSON conforming to the schema below.
|
||||
|
||||
=== Input ===
|
||||
@@ -24,6 +35,11 @@ Summarize the provided conversation chunks into a concise Memory summary.
|
||||
4. Do not include line breaks within JSON string values
|
||||
5. Example of proper escaping: "statement": "张曼婷说:\"我很喜欢这本书。\""
|
||||
|
||||
The output language should always be the same as the input language.
|
||||
{% if language == "zh" %}
|
||||
**语言要求:输出内容必须使用中文。**
|
||||
{% else %}
|
||||
**Language Requirement: The output content must be in English.**
|
||||
{% endif %}
|
||||
|
||||
Return only a list of extracted labelled statements in the JSON ARRAY of objects that match the schema below:
|
||||
{{ json_schema }}
|
||||
@@ -20,6 +20,9 @@ class MemoryConfig(Base):
|
||||
end_user_id = Column(String, nullable=True, comment="组ID")
|
||||
user_id = Column(String, nullable=True, comment="用户ID")
|
||||
apply_id = Column(String, nullable=True, comment="应用ID")
|
||||
|
||||
# 本体场景关联
|
||||
scene_id = Column(UUID(as_uuid=True), nullable=True, comment="本体场景ID,关联ontology_scene表")
|
||||
|
||||
# 模型选择(从workspace继承)
|
||||
llm_id = Column(String, nullable=True, comment="LLM模型配置ID")
|
||||
|
||||
@@ -229,6 +229,7 @@ class MemoryConfigRepository:
|
||||
config_name=params.config_name,
|
||||
config_desc=params.config_desc,
|
||||
workspace_id=params.workspace_id,
|
||||
scene_id=params.scene_id,
|
||||
llm_id=params.llm_id,
|
||||
embedding_id=params.embedding_id,
|
||||
rerank_id=params.rerank_id,
|
||||
|
||||
@@ -229,6 +229,9 @@ class ConfigParamsCreate(BaseModel): # 创建配置参数模型(仅 body,
|
||||
config_desc: str = Field("配置描述", description="配置描述(字符串)")
|
||||
workspace_id: Optional[uuid.UUID] = Field(None, description="工作空间ID(UUID)")
|
||||
|
||||
# 本体场景关联(可选)
|
||||
scene_id: Optional[uuid.UUID] = Field(None, description="本体场景ID(UUID),关联ontology_scene表")
|
||||
|
||||
# 模型配置字段(可选,用于手动指定或自动填充)
|
||||
llm_id: Optional[str] = Field(None, description="LLM模型配置ID")
|
||||
embedding_id: Optional[str] = Field(None, description="嵌入模型配置ID")
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
|
||||
# Language Configuration
|
||||
# Supported values: "zh" (Chinese), "en" (English)
|
||||
# This controls the language used for memory summary titles and other generated content
|
||||
DEFAULT_LANGUAGE=zh
|
||||
|
||||
# Neo4j Configuration (记忆系统数据库)
|
||||
NEO4J_URI=
|
||||
NEO4J_USERNAME=
|
||||
|
||||
Reference in New Issue
Block a user