Merge branch 'refs/heads/develop' into feature/agent-tool_xjn
This commit is contained in:
@@ -153,7 +153,7 @@ class PerceptualSearchService:
|
||||
return []
|
||||
try:
|
||||
r = await search_perceptual(
|
||||
connector=connector, q=escaped,
|
||||
connector=connector, query=escaped,
|
||||
end_user_id=self.end_user_id,
|
||||
limit=limit * 5, # 多查一些以提高命中率
|
||||
)
|
||||
@@ -178,7 +178,7 @@ class PerceptualSearchService:
|
||||
if not escaped.strip():
|
||||
return []
|
||||
r = await search_perceptual(
|
||||
connector=connector, q=escaped,
|
||||
connector=connector, query=escaped,
|
||||
end_user_id=self.end_user_id, limit=limit,
|
||||
)
|
||||
return r.get("perceptuals", [])
|
||||
|
||||
@@ -58,6 +58,14 @@ from app.core.memory.models.triplet_models import (
|
||||
TripletExtractionResponse,
|
||||
)
|
||||
|
||||
# User metadata models
|
||||
from app.core.memory.models.metadata_models import (
|
||||
UserMetadata,
|
||||
UserMetadataBehavioralHints,
|
||||
UserMetadataProfile,
|
||||
MetadataExtractionResponse,
|
||||
)
|
||||
|
||||
# Ontology scenario models (LLM extracted from scenarios)
|
||||
from app.core.memory.models.ontology_scenario_models import (
|
||||
OntologyClass,
|
||||
@@ -124,6 +132,10 @@ __all__ = [
|
||||
"Entity",
|
||||
"Triplet",
|
||||
"TripletExtractionResponse",
|
||||
"UserMetadata",
|
||||
"UserMetadataBehavioralHints",
|
||||
"UserMetadataProfile",
|
||||
"MetadataExtractionResponse",
|
||||
# Ontology models
|
||||
"OntologyClass",
|
||||
"OntologyExtractionResponse",
|
||||
|
||||
@@ -364,12 +364,14 @@ class ChunkNode(Node):
|
||||
Attributes:
|
||||
dialog_id: ID of the parent dialog
|
||||
content: The text content of the chunk
|
||||
speaker: Speaker identifier ('user' or 'assistant')
|
||||
chunk_embedding: Optional embedding vector for the chunk
|
||||
sequence_number: Order of this chunk within the dialog
|
||||
metadata: Additional chunk metadata as key-value pairs
|
||||
"""
|
||||
dialog_id: str = Field(..., description="ID of the parent dialog")
|
||||
content: str = Field(..., description="The text content of the chunk")
|
||||
speaker: Optional[str] = Field(None, description="Speaker identifier: 'user' for user messages, 'assistant' for AI responses")
|
||||
chunk_embedding: Optional[List[float]] = Field(None, description="Chunk embedding vector")
|
||||
sequence_number: int = Field(..., description="Order of this chunk within the dialog")
|
||||
metadata: dict = Field(default_factory=dict, description="Additional chunk metadata")
|
||||
|
||||
57
api/app/core/memory/models/metadata_models.py
Normal file
57
api/app/core/memory/models/metadata_models.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Models for user metadata extraction.
|
||||
|
||||
Independent from triplet_models.py - these models are used by the
|
||||
standalone metadata extraction pipeline (post-dedup async Celery task).
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class UserMetadataProfile(BaseModel):
|
||||
"""用户画像信息"""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
role: str = Field(default="", description="用户职业或角色")
|
||||
domain: str = Field(default="", description="用户所在领域")
|
||||
expertise: List[str] = Field(
|
||||
default_factory=list, description="用户擅长的技能或工具"
|
||||
)
|
||||
interests: List[str] = Field(
|
||||
default_factory=list, description="用户关注的话题或领域标签"
|
||||
)
|
||||
|
||||
|
||||
class UserMetadataBehavioralHints(BaseModel):
|
||||
"""行为偏好"""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
learning_stage: str = Field(default="", description="学习阶段")
|
||||
preferred_depth: str = Field(default="", description="偏好深度")
|
||||
tone_preference: str = Field(default="", description="语气偏好")
|
||||
|
||||
|
||||
class UserMetadata(BaseModel):
|
||||
"""用户元数据顶层结构"""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
profile: UserMetadataProfile = Field(default_factory=UserMetadataProfile)
|
||||
behavioral_hints: UserMetadataBehavioralHints = Field(
|
||||
default_factory=UserMetadataBehavioralHints
|
||||
)
|
||||
knowledge_tags: List[str] = Field(default_factory=list, description="知识标签")
|
||||
|
||||
|
||||
class MetadataExtractionResponse(BaseModel):
|
||||
"""元数据提取 LLM 响应结构"""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
user_metadata: UserMetadata = Field(default_factory=UserMetadata)
|
||||
aliases_to_add: List[str] = Field(
|
||||
default_factory=list,
|
||||
description="本次新发现的用户别名(用户自我介绍或他人对用户的称呼)",
|
||||
)
|
||||
aliases_to_remove: List[str] = Field(
|
||||
default_factory=list, description="用户明确否认的别名(如'我不叫XX了')"
|
||||
)
|
||||
@@ -1,4 +1,3 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import math
|
||||
@@ -6,7 +5,6 @@ import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.schemas.memory_config_schema import MemoryConfig
|
||||
@@ -23,7 +21,7 @@ from app.core.memory.utils.config.config_utils import (
|
||||
)
|
||||
from app.core.memory.utils.data.text_utils import extract_plain_query
|
||||
from app.core.memory.utils.data.time_utils import normalize_date_safe
|
||||
from app.core.memory.utils.llm.llm_utils import get_reranker_client
|
||||
# from app.core.memory.utils.llm.llm_utils import get_reranker_client
|
||||
from app.core.models.base import RedBearModelConfig
|
||||
from app.db import get_db_context
|
||||
from app.repositories.neo4j.graph_search import (
|
||||
@@ -748,11 +746,10 @@ async def run_hybrid_search(
|
||||
if search_type in ["keyword", "hybrid"]:
|
||||
# Keyword-based search
|
||||
logger.info("[PERF] Starting keyword search...")
|
||||
keyword_start = time.time()
|
||||
keyword_task = asyncio.create_task(
|
||||
search_graph(
|
||||
connector=connector,
|
||||
q=query_text,
|
||||
query=query_text,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
include=include
|
||||
@@ -762,7 +759,6 @@ async def run_hybrid_search(
|
||||
if search_type in ["embedding", "hybrid"]:
|
||||
# Embedding-based search
|
||||
logger.info("[PERF] Starting embedding search...")
|
||||
embedding_start = time.time()
|
||||
|
||||
# 从数据库读取嵌入器配置(按 ID)并构建 RedBearModelConfig
|
||||
config_load_start = time.time()
|
||||
@@ -904,10 +900,10 @@ async def run_hybrid_search(
|
||||
else:
|
||||
results["latency_metrics"] = latency_metrics
|
||||
|
||||
logger.info(f"[PERF] ===== SEARCH PERFORMANCE SUMMARY =====")
|
||||
logger.info("[PERF] ===== SEARCH PERFORMANCE SUMMARY =====")
|
||||
logger.info(f"[PERF] Total search completed in {total_latency:.4f}s")
|
||||
logger.info(f"[PERF] Latency breakdown: {json.dumps(latency_metrics, indent=2)}")
|
||||
logger.info(f"[PERF] =========================================")
|
||||
logger.info("[PERF] =========================================")
|
||||
|
||||
# Sanitize results: drop large/unused fields
|
||||
_remove_keys_recursive(results, ["name_embedding"]) # drop entity name embeddings from outputs
|
||||
|
||||
@@ -311,10 +311,53 @@ class ExtractionOrchestrator:
|
||||
dialog_data_list,
|
||||
)
|
||||
|
||||
# 步骤 7: 同步用户别名到数据库表(仅正式模式)
|
||||
# 步骤 7: 触发异步元数据和别名提取(仅正式模式)
|
||||
if not is_pilot_run:
|
||||
logger.info("步骤 7: 同步用户别名到 end_user 和 end_user_info 表")
|
||||
await self._update_end_user_other_name(entity_nodes, dialog_data_list)
|
||||
try:
|
||||
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.metadata_extractor import (
|
||||
MetadataExtractor,
|
||||
)
|
||||
|
||||
metadata_extractor = MetadataExtractor(
|
||||
llm_client=self.llm_client, language=self.language
|
||||
)
|
||||
user_statements = (
|
||||
metadata_extractor.collect_user_related_statements(
|
||||
entity_nodes, statement_nodes, statement_entity_edges
|
||||
)
|
||||
)
|
||||
if user_statements:
|
||||
end_user_id = (
|
||||
dialog_data_list[0].end_user_id
|
||||
if dialog_data_list
|
||||
else None
|
||||
)
|
||||
config_id = (
|
||||
dialog_data_list[0].config_id
|
||||
if dialog_data_list
|
||||
and hasattr(dialog_data_list[0], "config_id")
|
||||
else None
|
||||
)
|
||||
if end_user_id:
|
||||
from app.tasks import extract_user_metadata_task
|
||||
|
||||
extract_user_metadata_task.delay(
|
||||
end_user_id=str(end_user_id),
|
||||
statements=user_statements,
|
||||
config_id=str(config_id) if config_id else None,
|
||||
language=self.language,
|
||||
)
|
||||
logger.info(
|
||||
f"已触发异步元数据提取任务,共 {len(user_statements)} 条用户相关 statement"
|
||||
)
|
||||
else:
|
||||
logger.info("未找到用户相关 statement,跳过元数据提取")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"触发元数据提取任务失败(不影响主流程): {e}", exc_info=True
|
||||
)
|
||||
|
||||
# 别名同步已迁移到 Celery 元数据提取任务中,不再在此处执行
|
||||
|
||||
logger.info(f"知识提取流水线运行完成({mode_str})")
|
||||
return (
|
||||
@@ -1107,6 +1150,7 @@ class ExtractionOrchestrator:
|
||||
end_user_id=dialog_data.end_user_id,
|
||||
run_id=dialog_data.run_id, # 使用 dialog_data 的 run_id
|
||||
content=chunk.content,
|
||||
speaker=getattr(chunk, 'speaker', None),
|
||||
chunk_embedding=chunk.chunk_embedding,
|
||||
sequence_number=chunk_idx, # 添加必需的 sequence_number 字段
|
||||
created_at=dialog_data.created_at,
|
||||
@@ -1342,7 +1386,7 @@ class ExtractionOrchestrator:
|
||||
async def _update_end_user_other_name(
|
||||
self,
|
||||
entity_nodes: List[ExtractedEntityNode],
|
||||
dialog_data_list: List[DialogData]
|
||||
dialog_data_list: List[DialogData],
|
||||
) -> None:
|
||||
"""
|
||||
将本轮提取的用户别名同步到 end_user 和 end_user_info 表。
|
||||
@@ -1470,7 +1514,6 @@ class ExtractionOrchestrator:
|
||||
end_user_id=end_user_uuid,
|
||||
other_name=first_alias,
|
||||
aliases=merged_aliases,
|
||||
meta_data={}
|
||||
))
|
||||
logger.info(f"创建 end_user_info 记录,other_name={first_alias}, aliases={merged_aliases}")
|
||||
|
||||
@@ -1478,9 +1521,6 @@ class ExtractionOrchestrator:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"更新 end_user other_name 失败: {e}", exc_info=True)
|
||||
|
||||
|
||||
|
||||
# 用户实体占位名称,不允许作为 other_name 或出现在 aliases 中
|
||||
# 复用 deduped_and_disamb 模块级常量,避免重复维护
|
||||
USER_PLACEHOLDER_NAMES = _USER_PLACEHOLDER_NAMES
|
||||
@@ -1587,7 +1627,6 @@ class ExtractionOrchestrator:
|
||||
if candidate and candidate.lower() in self.USER_PLACEHOLDER_NAMES:
|
||||
return None
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
async def _run_dedup_and_write_summary(
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Metadata extractor module.
|
||||
|
||||
Collects user-related statements from post-dedup graph data and
|
||||
extracts user metadata via an independent LLM call.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from app.core.memory.models.graph_models import (
|
||||
ExtractedEntityNode,
|
||||
StatementEntityEdge,
|
||||
StatementNode,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Reuse the same user-entity detection logic from dedup module
|
||||
_USER_NAMES = {"用户", "我", "user", "i"}
|
||||
_CANONICAL_USER_TYPE = "用户"
|
||||
|
||||
|
||||
def _is_user_entity(ent: ExtractedEntityNode) -> bool:
|
||||
"""判断实体是否为用户实体"""
|
||||
name = (getattr(ent, "name", "") or "").strip().lower()
|
||||
etype = (getattr(ent, "entity_type", "") or "").strip()
|
||||
return name in _USER_NAMES or etype == _CANONICAL_USER_TYPE
|
||||
|
||||
|
||||
class MetadataExtractor:
|
||||
"""Extracts user metadata from post-dedup graph data via independent LLM call."""
|
||||
|
||||
def __init__(self, llm_client, language: Optional[str] = None):
|
||||
self.llm_client = llm_client
|
||||
self.language = language
|
||||
|
||||
@staticmethod
|
||||
def detect_language(statements: List[str]) -> str:
|
||||
"""根据 statement 文本内容检测语言。
|
||||
如果文本中包含中文字符则返回 "zh",否则返回 "en"。
|
||||
"""
|
||||
import re
|
||||
|
||||
combined = " ".join(statements)
|
||||
if re.search(r"[\u4e00-\u9fff]", combined):
|
||||
return "zh"
|
||||
return "en"
|
||||
|
||||
def collect_user_related_statements(
|
||||
self,
|
||||
entity_nodes: List[ExtractedEntityNode],
|
||||
statement_nodes: List[StatementNode],
|
||||
statement_entity_edges: List[StatementEntityEdge],
|
||||
) -> List[str]:
|
||||
"""
|
||||
从去重后的数据中筛选与用户直接相关且由用户发言的 statement 文本。
|
||||
|
||||
筛选逻辑:
|
||||
1. 用户实体 → StatementEntityEdge → statement(直接关联)
|
||||
2. 只保留 speaker="user" 的 statement(过滤 assistant 回复的噪声)
|
||||
|
||||
Returns:
|
||||
用户发言的 statement 文本列表
|
||||
"""
|
||||
# Find user entity IDs
|
||||
user_entity_ids = set()
|
||||
for ent in entity_nodes:
|
||||
if _is_user_entity(ent):
|
||||
user_entity_ids.add(ent.id)
|
||||
|
||||
if not user_entity_ids:
|
||||
logger.debug("未找到用户实体节点,跳过 statement 收集")
|
||||
return []
|
||||
|
||||
# 用户实体 → StatementEntityEdge → statement
|
||||
target_stmt_ids = set()
|
||||
for edge in statement_entity_edges:
|
||||
if edge.target in user_entity_ids:
|
||||
target_stmt_ids.add(edge.source)
|
||||
|
||||
# Collect: only speaker="user" statements, preserving order
|
||||
result = []
|
||||
seen = set()
|
||||
total_associated = 0
|
||||
skipped_non_user = 0
|
||||
for stmt_node in statement_nodes:
|
||||
if stmt_node.id in target_stmt_ids and stmt_node.id not in seen:
|
||||
total_associated += 1
|
||||
speaker = getattr(stmt_node, "speaker", None) or "unknown"
|
||||
if speaker == "user":
|
||||
text = (stmt_node.statement or "").strip()
|
||||
if text:
|
||||
result.append(text)
|
||||
else:
|
||||
skipped_non_user += 1
|
||||
seen.add(stmt_node.id)
|
||||
|
||||
logger.info(
|
||||
f"收集到 {len(result)} 条用户发言 statement "
|
||||
f"(直接关联: {total_associated}, speaker=user: {len(result)}, "
|
||||
f"跳过非user: {skipped_non_user})"
|
||||
)
|
||||
if result:
|
||||
for i, text in enumerate(result):
|
||||
logger.info(f" [user statement {i + 1}] {text}")
|
||||
if total_associated > 0 and len(result) == 0:
|
||||
logger.warning(
|
||||
f"有 {total_associated} 条直接关联 statement 但全部被 speaker 过滤,"
|
||||
f"可能本次写入不包含 user 消息"
|
||||
)
|
||||
return result
|
||||
|
||||
async def extract_metadata(
|
||||
self,
|
||||
statements: List[str],
|
||||
existing_metadata: Optional[dict] = None,
|
||||
existing_aliases: Optional[List[str]] = None,
|
||||
) -> Optional[tuple]:
|
||||
"""
|
||||
对筛选后的 statement 列表调用 LLM 提取元数据和用户别名。
|
||||
|
||||
Args:
|
||||
statements: 用户发言的 statement 文本列表
|
||||
existing_metadata: 数据库已有的元数据(可选)
|
||||
existing_aliases: 数据库已有的用户别名列表(可选)
|
||||
|
||||
Returns:
|
||||
(UserMetadata, List[str], List[str]) tuple: (metadata, aliases_to_add, aliases_to_remove) on success, None on failure
|
||||
"""
|
||||
if not statements:
|
||||
return None
|
||||
|
||||
try:
|
||||
from app.core.memory.utils.prompt.prompt_utils import prompt_env
|
||||
|
||||
if self.language:
|
||||
detected_language = self.language
|
||||
logger.info(f"元数据提取使用显式指定语言: {detected_language}")
|
||||
else:
|
||||
detected_language = self.detect_language(statements)
|
||||
logger.info(f"元数据提取语言自动检测结果: {detected_language}")
|
||||
|
||||
template = prompt_env.get_template("extract_user_metadata.jinja2")
|
||||
prompt = template.render(
|
||||
statements=statements,
|
||||
language=detected_language,
|
||||
existing_metadata=existing_metadata,
|
||||
existing_aliases=existing_aliases,
|
||||
json_schema="",
|
||||
)
|
||||
|
||||
from app.core.memory.models.metadata_models import (
|
||||
MetadataExtractionResponse,
|
||||
)
|
||||
|
||||
response = await self.llm_client.response_structured(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
response_model=MetadataExtractionResponse,
|
||||
)
|
||||
|
||||
if response:
|
||||
metadata = response.user_metadata if response.user_metadata else None
|
||||
to_add = response.aliases_to_add if response.aliases_to_add else []
|
||||
to_remove = (
|
||||
response.aliases_to_remove if response.aliases_to_remove else []
|
||||
)
|
||||
return metadata, to_add, to_remove
|
||||
|
||||
logger.warning("LLM 返回的响应为空")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"元数据提取 LLM 调用失败: {e}", exc_info=True)
|
||||
return None
|
||||
@@ -1,6 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@@ -82,6 +81,7 @@ class StatementExtractor:
|
||||
logger.warning(f"Chunk {getattr(chunk, 'id', 'unknown')} has no speaker field or is empty")
|
||||
return None
|
||||
|
||||
|
||||
async def _extract_statements(self, chunk, end_user_id: Optional[str] = None, dialogue_content: str = None) -> List[Statement]:
|
||||
"""Process a single chunk and return extracted statements
|
||||
|
||||
@@ -94,7 +94,8 @@ class StatementExtractor:
|
||||
List of ExtractedStatement objects extracted from the chunk
|
||||
"""
|
||||
chunk_content = chunk.content
|
||||
|
||||
chunk_speaker = self._get_speaker_from_chunk(chunk)
|
||||
|
||||
if not chunk_content or len(chunk_content.strip()) < 5:
|
||||
logger.warning(f"Chunk {chunk.id} content too short or empty, skipping")
|
||||
return []
|
||||
@@ -149,8 +150,6 @@ class StatementExtractor:
|
||||
relevence_info = RelevenceInfo[relevence_str] if relevence_str in RelevenceInfo.__members__ else RelevenceInfo.RELEVANT
|
||||
except (KeyError, ValueError):
|
||||
relevence_info = RelevenceInfo.RELEVANT
|
||||
|
||||
chunk_speaker = self._get_speaker_from_chunk(chunk)
|
||||
|
||||
chunk_statement = Statement(
|
||||
statement=extracted_stmt.statement,
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import asyncio
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
使用Neo4j的全文索引进行高效的文本匹配。
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import List, Optional
|
||||
from app.core.logging_config import get_memory_logger
|
||||
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
|
||||
from app.core.memory.storage_services.search.search_strategy import SearchStrategy, SearchResult
|
||||
@@ -74,7 +74,7 @@ class KeywordSearchStrategy(SearchStrategy):
|
||||
# 调用底层的关键词搜索函数
|
||||
results_dict = await search_graph(
|
||||
connector=self.connector,
|
||||
q=query_text,
|
||||
query=query_text,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
include=include_list
|
||||
|
||||
@@ -22,7 +22,9 @@ def escape_lucene_query(query: str) -> str:
|
||||
s = s.replace("\r", " ").replace("\n", " ").strip()
|
||||
|
||||
# Lucene reserved tokens/special characters
|
||||
specials = ['&&', '||', '\\', '+', '-', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':']
|
||||
# NOTE: '/' is the regex delimiter in Lucene — must be escaped to prevent
|
||||
# TokenMgrError when the query contains unmatched slashes.
|
||||
specials = ['&&', '||', '\\', '+', '-', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/']
|
||||
# Replace longer tokens first to avoid partial double-escaping
|
||||
for token in sorted(specials, key=len, reverse=True):
|
||||
s = s.replace(token, f"\\{token}")
|
||||
|
||||
@@ -43,8 +43,9 @@ Each statement must be labeled as per the criteria mentioned below.
|
||||
|
||||
对话上下文和共指消解:
|
||||
- 将每个陈述句归属于说出它的参与者。
|
||||
- 如果参与者列表为说话者提供了名称(例如,"李雪(用户)"),请在提取的陈述句中使用具体名称("李雪"),而不是通用角色("用户")。
|
||||
- 将所有代词解析为对话上下文中的具体人物或实体。
|
||||
- **对于用户的发言:必须使用"用户"作为主语**,禁止将"用户"或"我"替换为用户的真实姓名或别名。例如,用户说"我叫张三"应提取为"用户叫张三",而不是"张三叫张三"。
|
||||
- 对于 AI 助手的发言:使用"助手"或"AI助手"作为主语。
|
||||
- 将所有代词解析为对话上下文中的具体人物或实体,但"我"必须解析为"用户"。
|
||||
- 识别并将抽象引用解析为其具体名称(如果提到)。
|
||||
- 将缩写和首字母缩略词扩展为其完整形式。
|
||||
{% else %}
|
||||
@@ -68,8 +69,9 @@ Context Resolution Requirements:
|
||||
|
||||
Conversational Context & Co-reference Resolution:
|
||||
- Attribute every statement to the participant who uttered it.
|
||||
- If the participant list provides a name for a speaker (e.g., "李雪 (用户)"), use the specific name ("李雪") in the extracted statement, not the generic role ("用户").
|
||||
- Resolve all pronouns to the specific person or entity from the conversation's context.
|
||||
- **For user's statements: always use "用户" (User) as the subject**. Do NOT replace "用户" or "I" with the user's real name or alias. For example, if the user says "I'm John", extract as "用户 is John", not "John is John".
|
||||
- For AI assistant's statements: use "助手" or "AI助手" as the subject.
|
||||
- Resolve all pronouns to the specific person or entity from the conversation's context, but "I"/"我" must always resolve to "用户".
|
||||
- Identify and resolve abstract references to their specific names if mentioned.
|
||||
- Expand abbreviations and acronyms to their full form.
|
||||
{% endif %}
|
||||
@@ -139,13 +141,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合
|
||||
示例输出: {
|
||||
"statements": [
|
||||
{
|
||||
"statement": "Sarah Chen 最近一直在尝试水彩画。",
|
||||
"statement": "用户最近一直在尝试水彩画。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen 画了一些花朵。",
|
||||
"statement": "用户画了一些花朵。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -157,13 +159,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料与阿拉伯树胶等粘合
|
||||
"relevance": "IRRELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen 认为她的水彩画中的色彩组合可以改进。",
|
||||
"statement": "用户认为她的水彩画中的色彩组合可以改进。",
|
||||
"statement_type": "OPINION",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen 真的很喜欢玫瑰和百合。",
|
||||
"statement": "用户真的很喜欢玫瑰和百合。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -186,13 +188,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
|
||||
示例输出: {
|
||||
"statements": [
|
||||
{
|
||||
"statement": "张曼婷最近在尝试水彩画。",
|
||||
"statement": "用户最近在尝试水彩画。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷画了一些花朵。",
|
||||
"statement": "用户画了一些花朵。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -204,13 +206,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
|
||||
"relevance": "IRRELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。",
|
||||
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
|
||||
"statement_type": "OPINION",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷很喜欢玫瑰和百合。",
|
||||
"statement": "用户很喜欢玫瑰和百合。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -233,13 +235,13 @@ User: "I think the color combinations could use some improvement, but I really l
|
||||
Example Output: {
|
||||
"statements": [
|
||||
{
|
||||
"statement": "Sarah Chen has been trying watercolor painting recently.",
|
||||
"statement": "用户 has been trying watercolor painting recently.",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen painted some flowers.",
|
||||
"statement": "用户 painted some flowers.",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -251,13 +253,13 @@ Example Output: {
|
||||
"relevance": "IRRELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen thinks the color combinations in her watercolor paintings could use some improvement.",
|
||||
"statement": "用户 thinks the color combinations in her watercolor paintings could use some improvement.",
|
||||
"statement_type": "OPINION",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "Sarah Chen really likes roses and lilies.",
|
||||
"statement": "用户 really likes roses and lilies.",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -280,13 +282,13 @@ AI: "水彩画很有趣!水彩颜料通常由颜料和阿拉伯树胶等粘合
|
||||
Example Output: {
|
||||
"statements": [
|
||||
{
|
||||
"statement": "张曼婷最近在尝试水彩画。",
|
||||
"statement": "用户最近在尝试水彩画。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷画了一些花朵。",
|
||||
"statement": "用户画了一些花朵。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "DYNAMIC",
|
||||
"relevance": "RELEVANT"
|
||||
@@ -298,13 +300,13 @@ Example Output: {
|
||||
"relevance": "IRRELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷觉得水彩画的色彩搭配还有提升的空间。",
|
||||
"statement": "用户觉得水彩画的色彩搭配还有提升的空间。",
|
||||
"statement_type": "OPINION",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
},
|
||||
{
|
||||
"statement": "张曼婷很喜欢玫瑰和百合。",
|
||||
"statement": "用户很喜欢玫瑰和百合。",
|
||||
"statement_type": "FACT",
|
||||
"temporal_type": "STATIC",
|
||||
"relevance": "RELEVANT"
|
||||
|
||||
@@ -406,4 +406,12 @@ Output:
|
||||
- **⚠️ ALIASES ORDER: preserve temporal order of appearance**
|
||||
- **🚨 MANDATORY FIELD: EVERY entity MUST include "aliases" field, even if empty array []**
|
||||
|
||||
**Output JSON structure:**
|
||||
```json
|
||||
{
|
||||
"triplets": [...],
|
||||
"entities": [...]
|
||||
}
|
||||
```
|
||||
|
||||
{{ json_schema }}
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
===Task===
|
||||
Extract user metadata from the following conversation statements spoken by the user.
|
||||
|
||||
{% if language == "zh" %}
|
||||
**"三度原则"判断标准:**
|
||||
- 复用度:该信息是否会被多个功能模块使用?
|
||||
- 约束度:该信息是否会影响系统行为?
|
||||
- 时效性:该信息是长期稳定的还是临时的?仅提取长期稳定信息。
|
||||
|
||||
**提取规则:**
|
||||
- **只提取关于"用户本人"的画像信息**,忽略用户提到的第三方人物(如朋友、同事、家人)的信息
|
||||
- 仅提取文本中明确提到的信息,不要推测
|
||||
- 如果文本中没有可提取的用户画像信息,返回空的 user_metadata 对象
|
||||
- **输出语言必须与输入文本的语言一致**(输入中文则输出中文值,输入英文则输出英文值)
|
||||
|
||||
{% if existing_metadata %}
|
||||
**重要:合并已有元数据**
|
||||
下方提供了数据库中已有的用户元数据。请结合用户最新发言,输出**合并后的完整元数据**:
|
||||
- 如果用户明确否定了已有信息(如"我不再教高中物理了"),在输出中**移除**该信息
|
||||
- 如果用户提到了新信息,**添加**到对应字段中
|
||||
- 如果已有信息未被用户否定,**保留**在输出中
|
||||
- 标量字段(如 role、domain):如果用户提到了新值,用新值替换;否则保留已有值
|
||||
- 最终输出应该是完整的、合并后的元数据,不是增量
|
||||
{% endif %}
|
||||
|
||||
**字段说明:**
|
||||
- profile.role:用户的职业或角色,如 教师、医生、后端工程师
|
||||
- profile.domain:用户所在领域,如 教育、医疗、软件开发
|
||||
- profile.expertise:用户擅长的技能或工具(通用,不限于编程),如 Python、心理咨询、高中物理
|
||||
- profile.interests:用户主动表达兴趣的话题或领域标签
|
||||
- behavioral_hints.learning_stage:学习阶段(初学者/中级/高级)
|
||||
- behavioral_hints.preferred_depth:偏好深度(概览/技术细节/深入探讨)
|
||||
- behavioral_hints.tone_preference:语气偏好(轻松随意/专业简洁/学术严谨)
|
||||
- knowledge_tags:用户涉及的知识领域标签
|
||||
|
||||
**用户别名变更(增量模式):**
|
||||
- **aliases_to_add**:本次新发现的用户别名,包括:
|
||||
* 用户主动自我介绍:如"我叫张三"、"我的名字是XX"、"我的网名是XX"
|
||||
* 他人对用户的称呼:如"同事叫我陈哥"、"大家叫我小张"、"领导叫我老陈"
|
||||
* 只提取原文中逐字出现的名字,严禁推测或创造
|
||||
* 禁止提取:用户给 AI 取的名字、第三方人物自身的名字、"用户"/"我" 等占位词
|
||||
* 如果没有新别名,返回空数组 `[]`
|
||||
- **aliases_to_remove**:用户明确否认的别名,包括:
|
||||
* 用户说"我不叫XX了"、"别叫我XX"、"我改名了,不叫XX" → 将 XX 放入此数组
|
||||
* **严格限制**:只将用户原文中**逐字提到**的被否认名字放入,不要推断关联的其他别名
|
||||
* 例如:用户说"我不叫陈小刀了" → 只移除"陈小刀",不要移除"陈哥"、"老陈"等未被提及的别名
|
||||
* 如果没有要移除的别名,返回空数组 `[]`
|
||||
{% if existing_aliases %}
|
||||
- 已有别名:{{ existing_aliases | tojson }}(仅供参考,不需要在输出中重复)
|
||||
{% endif %}
|
||||
{% else %}
|
||||
**"Three-Degree Principle" criteria:**
|
||||
- Reusability: Will this information be used by multiple functional modules?
|
||||
- Constraint: Will this information affect system behavior?
|
||||
- Timeliness: Is this information long-term stable or temporary? Only extract long-term stable information.
|
||||
|
||||
**Extraction rules:**
|
||||
- **Only extract profile information about the user themselves**, ignore information about third parties (friends, colleagues, family) mentioned by the user
|
||||
- Only extract information explicitly mentioned in the text, do not speculate
|
||||
- If no user profile information can be extracted, return an empty user_metadata object
|
||||
- **Output language must match the input text language**
|
||||
|
||||
{% if existing_metadata %}
|
||||
**Important: Merge with existing metadata**
|
||||
Existing user metadata from the database is provided below. Combine with the user's latest statements to output the **complete merged metadata**:
|
||||
- If the user explicitly negates existing info (e.g. "I no longer teach high school physics"), **remove** it from output
|
||||
- If the user mentions new info, **add** it to the corresponding field
|
||||
- If existing info is not negated by the user, **keep** it in the output
|
||||
- Scalar fields (e.g. role, domain): replace with new value if user mentions one; otherwise keep existing
|
||||
- The final output should be the complete, merged metadata — not an incremental update
|
||||
{% endif %}
|
||||
|
||||
**Field descriptions:**
|
||||
- profile.role: User's occupation or role, e.g. teacher, doctor, software engineer
|
||||
- profile.domain: User's domain, e.g. education, healthcare, software development
|
||||
- profile.expertise: User's skills or tools (general, not limited to programming)
|
||||
- profile.interests: Topics or domain tags the user actively expressed interest in
|
||||
- behavioral_hints.learning_stage: Learning stage (beginner/intermediate/advanced)
|
||||
- behavioral_hints.preferred_depth: Preferred depth (overview/detailed/deep dive)
|
||||
- behavioral_hints.tone_preference: Tone preference (casual/professional/academic)
|
||||
- knowledge_tags: Knowledge domain tags related to the user
|
||||
|
||||
**User alias changes (incremental mode):**
|
||||
- **aliases_to_add**: Newly discovered user aliases from this conversation, including:
|
||||
* User self-introductions: e.g. "I'm John", "My name is XX", "My username is XX"
|
||||
* How others address the user: e.g. "My colleagues call me Johnny", "People call me Mike"
|
||||
* Only extract names that appear VERBATIM in the text — never infer or fabricate
|
||||
* Do NOT extract: names the user gives to the AI, third-party people's own names, placeholder words like "User"/"I"
|
||||
* If no new aliases, return empty array `[]`
|
||||
- **aliases_to_remove**: Aliases the user explicitly denies, including:
|
||||
* User says "Don't call me XX anymore", "I'm not called XX", "I changed my name from XX" → put XX in this array
|
||||
* **Strict rule**: Only include the exact name the user **verbatim mentions** as denied. Do NOT infer or remove related aliases
|
||||
* Example: User says "I'm not called John anymore" → only remove "John", do NOT remove "Johnny", "J" or other related aliases not mentioned
|
||||
* If no aliases to remove, return empty array `[]`
|
||||
{% if existing_aliases %}
|
||||
- Existing aliases: {{ existing_aliases | tojson }} (for reference only, do not repeat in output)
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
===User Statements===
|
||||
{% for stmt in statements %}
|
||||
- {{ stmt }}
|
||||
{% endfor %}
|
||||
|
||||
{% if existing_metadata %}
|
||||
===Existing User Metadata===
|
||||
```json
|
||||
{{ existing_metadata | tojson }}
|
||||
```
|
||||
{% endif %}
|
||||
|
||||
===Output Format===
|
||||
Return a JSON object with the following structure:
|
||||
```json
|
||||
{
|
||||
"user_metadata": {
|
||||
"profile": {
|
||||
"role": "",
|
||||
"domain": "",
|
||||
"expertise": [],
|
||||
"interests": []
|
||||
},
|
||||
"behavioral_hints": {
|
||||
"learning_stage": "",
|
||||
"preferred_depth": "",
|
||||
"tone_preference": ""
|
||||
},
|
||||
"knowledge_tags": []
|
||||
},
|
||||
"aliases_to_add": [],
|
||||
"aliases_to_remove": []
|
||||
}
|
||||
```
|
||||
|
||||
{{ json_schema }}
|
||||
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
|
||||
# Example:
|
||||
# "Hello {{user.name}}!" ->
|
||||
# ["Hello ", "{{user.name}}", "!"]
|
||||
_OUTPUT_PATTERN = re.compile(r'\{\{.*?}}|[^{}]+')
|
||||
_OUTPUT_PATTERN = re.compile(r'\{\{.*?}}|[^{]+|{')
|
||||
# Strict variable format: {{ node_id.field_name }}
|
||||
_VARIABLE_PATTERN = re.compile(r'\{\{\s*[a-zA-Z0-9_]+\.[a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)?\s*}}')
|
||||
|
||||
|
||||
@@ -55,9 +55,9 @@ class CycleGraphNode(BaseNode):
|
||||
if config.output_type in [
|
||||
VariableType.ARRAY_FILE,
|
||||
VariableType.ARRAY_STRING,
|
||||
VariableType.NUMBER,
|
||||
VariableType.ARRAY_NUMBER,
|
||||
VariableType.ARRAY_OBJECT,
|
||||
VariableType.BOOLEAN
|
||||
VariableType.ARRAY_BOOLEAN
|
||||
]:
|
||||
if config.flatten:
|
||||
outputs['output'] = config.output_type
|
||||
|
||||
Reference in New Issue
Block a user