Merge branch 'release/v0.3.0' into develop

* release/v0.3.0: (44 commits)
  Revert "fix(web): prompt editor"
  fix(web): prompt editor
  fix(prompt-optimizer): handle escaped quotes in JSON parsing
  fix(custom-tools): remove parameter coercion in custom tool base class
  fix(core): conditionally apply thinking parameters based on model support
  refactor(custom-tools): coerce query and request body parameters to schema types
  fix(prompt-optimizer): support list content type in prompt optimizer
  refactor(memory): unify user placeholder names and harden alias sync logic
  fix(rag): replace semicolon separators with newlines in Excel parser output
  fix(web): Compatible with Windows whitespace
  fix(memory): make PgSQL the single source of truth for user entity aliases
  refactor(rag): simplify Excel parsing logic and remove redundant chunk_token_num assignment
  fix(web): Hide error message when workflow node error message equals empty string
  ci(wechat-notify): add Sourcery summary extraction with Qwen fallback
  fix(http-request,embedding,naive): tighten form-data validation, reduce truncation length to 8000, and disable chunking for Excel
  fix(web): adjust the value of End User Name
  fix(http-request): support array and file variables in form-data files upload
  fix(web): change http body key name
  fix(web): header user name
  fix(web): calculate using the filtered breadcrumbs length
  ...

# Conflicts:
#	web/src/views/UserMemoryDetail/Neo4j.tsx
#	web/src/views/UserMemoryDetail/components/EndUserProfile.tsx
#	web/src/views/UserMemoryDetail/types.ts
This commit is contained in:
Mark
2026-04-15 19:31:38 +08:00
48 changed files with 702 additions and 452 deletions

View File

@@ -124,10 +124,11 @@ async def get_prompt_opt(
skill=data.skill
):
# chunk 是 prompt 的增量内容
yield f"event:message\ndata: {json.dumps(chunk)}\n\n"
yield f"event:message\ndata: {json.dumps(chunk, ensure_ascii=False)}\n\n"
except Exception as e:
yield f"event:error\ndata: {json.dumps(
{"error": str(e)}
{"error": str(e)},
ensure_ascii=False
)}\n\n"
yield "event:end\ndata: {}\n\n"

View File

@@ -14,6 +14,7 @@ from dotenv import load_dotenv
from app.core.logging_config import get_agent_logger
from app.core.memory.agent.utils.get_dialogs import get_chunked_dialogs
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import _USER_PLACEHOLDER_NAMES
from app.core.memory.storage_services.extraction_engine.extraction_orchestrator import ExtractionOrchestrator
from app.core.memory.storage_services.extraction_engine.knowledge_extraction.memory_summary import \
memory_summary_generation
@@ -191,15 +192,37 @@ async def write(
if success:
logger.info("Successfully saved all data to Neo4j")
# 使用 Celery 异步任务触发聚类(不阻塞主流程)
if all_entity_nodes:
end_user_id = all_entity_nodes[0].end_user_id
# Neo4j 写入完成后,用 PgSQL 权威 aliases 覆盖 Neo4j 用户实体
try:
from app.repositories.end_user_info_repository import EndUserInfoRepository
if end_user_id:
with get_db_context() as db_session:
info = EndUserInfoRepository(db_session).get_by_end_user_id(uuid.UUID(end_user_id))
pg_aliases = info.aliases if info and info.aliases else []
if info is not None:
# 将 Python 侧占位名集合作为参数传入,避免 Cypher 硬编码
placeholder_names = list(_USER_PLACEHOLDER_NAMES)
await neo4j_connector.execute_query(
"""
MATCH (e:ExtractedEntity)
WHERE e.end_user_id = $end_user_id AND toLower(e.name) IN $placeholder_names
SET e.aliases = $aliases
""",
end_user_id=end_user_id, aliases=pg_aliases,
placeholder_names=placeholder_names,
)
logger.info(f"[AliasSync] Neo4j 用户实体 aliases 已用 PgSQL 权威源覆盖: {pg_aliases}")
except Exception as sync_err:
logger.warning(f"[AliasSync] PgSQL→Neo4j aliases 同步失败(不影响主流程): {sync_err}")
# 使用 Celery 异步任务触发聚类(不阻塞主流程)
try:
from app.tasks import run_incremental_clustering
end_user_id = all_entity_nodes[0].end_user_id
new_entity_ids = [e.id for e in all_entity_nodes]
# 异步提交 Celery 任务
task = run_incremental_clustering.apply_async(
kwargs={
"end_user_id": end_user_id,
@@ -207,7 +230,6 @@ async def write(
"llm_model_id": str(memory_config.llm_model_id) if memory_config.llm_model_id else None,
"embedding_model_id": str(memory_config.embedding_model_id) if memory_config.embedding_model_id else None,
},
# 设置任务优先级(低优先级,不影响主业务)
priority=3,
)
logger.info(
@@ -215,7 +237,6 @@ async def write(
f"task_id={task.id}, end_user_id={end_user_id}, entity_count={len(new_entity_ids)}"
)
except Exception as e:
# 聚类任务提交失败不影响主流程
logger.error(f"[Clustering] 提交聚类任务失败(不影响主流程): {e}", exc_info=True)
break

View File

@@ -82,51 +82,38 @@ def _merge_attribute(canonical: ExtractedEntityNode, ent: ExtractedEntityNode):
canonical.connect_strength = next(iter(pair))
# 别名合并(去重保序,使用标准化工具)
# 用户实体的 aliases 由 PgSQL end_user_info 作为唯一权威源,去重合并时不修改
try:
canonical_name = (getattr(canonical, "name", "") or "").strip()
incoming_name = (getattr(ent, "name", "") or "").strip()
# 收集所有需要合并的别名
all_aliases = []
# 1. 添加canonical现有的别名
existing = getattr(canonical, "aliases", []) or []
all_aliases.extend(existing)
# 2. 添加incoming实体的名称如果不同于canonical的名称
if incoming_name and incoming_name != canonical_name:
all_aliases.append(incoming_name)
# 3. 添加incoming实体的所有别名
incoming = getattr(ent, "aliases", []) or []
all_aliases.extend(incoming)
# 4. 标准化并去重优先使用alias_utils工具函数
try:
from app.core.memory.utils.alias_utils import normalize_aliases
canonical.aliases = normalize_aliases(canonical_name, all_aliases)
except Exception:
# 如果导入失败,使用增强的去重逻辑
seen_normalized = set()
unique_aliases = []
if canonical_name.lower() not in _USER_PLACEHOLDER_NAMES:
incoming_name = (getattr(ent, "name", "") or "").strip()
for alias in all_aliases:
if not alias:
continue
alias_stripped = str(alias).strip()
if not alias_stripped or alias_stripped == canonical_name:
continue
# 标准化:转小写用于去重判断
alias_normalized = alias_stripped.lower()
if alias_normalized not in seen_normalized:
seen_normalized.add(alias_normalized)
unique_aliases.append(alias_stripped)
# 收集所有需要合并的别名,过滤掉用户占位名避免污染非用户实体
all_aliases = list(getattr(canonical, "aliases", []) or [])
if incoming_name and incoming_name != canonical_name and incoming_name.lower() not in _USER_PLACEHOLDER_NAMES:
all_aliases.append(incoming_name)
all_aliases.extend(
a for a in (getattr(ent, "aliases", []) or [])
if a and a.strip().lower() not in _USER_PLACEHOLDER_NAMES
)
# 排序并赋值
canonical.aliases = sorted(unique_aliases)
try:
from app.core.memory.utils.alias_utils import normalize_aliases
canonical.aliases = normalize_aliases(canonical_name, all_aliases)
except Exception:
seen_normalized = set()
unique_aliases = []
for alias in all_aliases:
if not alias:
continue
alias_stripped = str(alias).strip()
if not alias_stripped or alias_stripped == canonical_name:
continue
alias_normalized = alias_stripped.lower()
if alias_normalized not in seen_normalized:
seen_normalized.add(alias_normalized)
unique_aliases.append(alias_stripped)
canonical.aliases = sorted(unique_aliases)
except Exception:
pass
@@ -733,66 +720,37 @@ def fuzzy_match(
def _merge_entities_with_aliases(canonical: ExtractedEntityNode, losing: ExtractedEntityNode):
""" 模糊匹配中的实体合并。
"""模糊匹配中的实体合并(别名部分)
合并策略:
1. 保留canonical的主名称不变
2. 将losing的主名称添加为alias如果不同
3. 合并两个实体的所有aliases
4. 自动去重case-insensitive并排序
Args:
canonical: 规范实体(保留)
losing: 被合并实体(删除)
Note:
使用alias_utils.normalize_aliases进行标准化去重
用户实体的 aliases 由 PgSQL end_user_info 作为唯一权威源,跳过合并。
"""
# 获取规范实体的名称
canonical_name = (getattr(canonical, "name", "") or "").strip()
if canonical_name.lower() in _USER_PLACEHOLDER_NAMES:
return
losing_name = (getattr(losing, "name", "") or "").strip()
# 收集所有需要合并的别名
all_aliases = []
# 1. 添加canonical现有的别名
current_aliases = getattr(canonical, "aliases", []) or []
all_aliases.extend(current_aliases)
# 2. 添加losing实体的名称如果不同于canonical的名称
all_aliases = list(getattr(canonical, "aliases", []) or [])
if losing_name and losing_name != canonical_name:
all_aliases.append(losing_name)
all_aliases.extend(getattr(losing, "aliases", []) or [])
# 3. 添加losing实体的所有别名
losing_aliases = getattr(losing, "aliases", []) or []
all_aliases.extend(losing_aliases)
# 4. 标准化并去重(使用标准化后的字符串进行去重)
try:
from app.core.memory.utils.alias_utils import normalize_aliases
canonical.aliases = normalize_aliases(canonical_name, all_aliases)
except Exception:
# 如果导入失败,使用增强的去重逻辑
# 使用标准化后的字符串作为key进行去重
seen_normalized = set()
unique_aliases = []
for alias in all_aliases:
if not alias:
continue
alias_stripped = str(alias).strip()
if not alias_stripped or alias_stripped == canonical_name:
continue
# 标准化:转小写用于去重判断
alias_normalized = alias_stripped.lower()
if alias_normalized not in seen_normalized:
seen_normalized.add(alias_normalized)
unique_aliases.append(alias_stripped)
# 排序并赋值
canonical.aliases = sorted(unique_aliases)
# ========== 主循环:遍历所有实体对进行模糊匹配 ==========

View File

@@ -1391,18 +1391,18 @@ class ExtractionOrchestrator:
"""
将本轮提取的用户别名同步到 end_user 和 end_user_info 表。
注意:此方法在 Neo4j 写入之前调用,因此不能依赖 Neo4j 作为别名的权威数据源。
改为直接使用内存中去重后的 entity_nodes 的 aliases与 PgSQL 已有的 aliases 合并。
PgSQL end_user_info.aliases 是用户别名的唯一权威源。
此方法仅将本轮 LLM 从对话中新提取的别名增量追加到 PgSQL
不再从 Neo4j 二层去重合并历史别名,避免脏数据反向污染 PgSQL。
策略:
1. 从内存中的 entity_nodes 提取本轮用户别名current_aliases
2. 从去重后的 entity_nodes 中提取完整别名(含 Neo4j 二层去重合并的历史别名
3. 从 PgSQL end_user_info 读取已有的 aliasesdb_aliases
4. 合并 db_aliases + deduped_aliases + current_aliases去重保序
5. 写回 PgSQL
1. 从本轮对话原始发言中提取用户别名current_aliases
2. 从 PgSQL end_user_info 读取已有的 aliasesdb_aliases
3. 合并 db_aliases + current_aliases,去重保序
4. 写回 PgSQL
Args:
entity_nodes: 去重后的实体节点列表(内存中,含二层去重合并结果
entity_nodes: 去重后的实体节点列表(内存中)
dialog_data_list: 对话数据列表
"""
try:
@@ -1418,11 +1418,6 @@ class ExtractionOrchestrator:
# 1. 提取本轮对话的用户别名(保持 LLM 提取的原始顺序,不排序)
current_aliases = self._extract_current_aliases(entity_nodes, dialog_data_list)
# 1.5 从去重后的 entity_nodes 中提取完整别名
# 二层去重会将 Neo4j 中已有的历史别名合并到 entity_nodes 中,
# 这里提取出来确保 PgSQL 与 Neo4j 的别名保持同步
deduped_aliases = self._extract_deduped_entity_aliases(entity_nodes)
# 1.6 从 Neo4j 查询已有的 AI 助手别名,作为额外的排除源
# (防止 LLM 未提取出 AI 助手实体时AI 别名泄漏到用户别名中)
neo4j_assistant_aliases = await self._fetch_neo4j_assistant_aliases(end_user_id)
@@ -1434,19 +1429,12 @@ class ExtractionOrchestrator:
]
if len(current_aliases) < before_count:
logger.info(f"通过 Neo4j AI 助手别名排除了 {before_count - len(current_aliases)} 个误归属别名")
# 同样过滤 deduped_aliases
deduped_aliases = [
a for a in deduped_aliases
if a.strip().lower() not in neo4j_assistant_aliases
]
if not current_aliases and not deduped_aliases:
if not current_aliases:
logger.debug(f"本轮未提取到用户别名,跳过同步: end_user_id={end_user_id}")
return
logger.info(f"本轮对话提取的 aliases: {current_aliases}")
if deduped_aliases:
logger.info(f"去重后实体的完整 aliases含历史: {deduped_aliases}")
# 2. 同步到数据库
end_user_uuid = uuid.UUID(end_user_id)
@@ -1457,21 +1445,15 @@ class ExtractionOrchestrator:
logger.warning(f"未找到 end_user_id={end_user_id} 的用户记录")
return
# 3. 从 PgSQL 读取已有 aliases 并与本轮合并
# 3. 从 PgSQL 读取已有 aliases 并与本轮新增合并
info = EndUserInfoRepository(db).get_by_end_user_id(end_user_uuid)
db_aliases = (info.aliases if info and info.aliases else [])
# 过滤掉占位名称
db_aliases = [a for a in db_aliases if a.strip().lower() not in self.USER_PLACEHOLDER_NAMES]
# 合并:已有 + 去重后完整别名 + 本轮新增,去重保序
# 合并:PgSQL 已有 + 本轮新增,去重保序(不再合并 Neo4j 历史别名)
merged_aliases = list(db_aliases)
seen_lower = {a.strip().lower() for a in merged_aliases}
# 先合并去重后实体的完整别名(含 Neo4j 历史别名)
for alias in deduped_aliases:
if alias.strip().lower() not in seen_lower:
merged_aliases.append(alias)
seen_lower.add(alias.strip().lower())
# 再合并本轮新提取的别名
for alias in current_aliases:
if alias.strip().lower() not in seen_lower:
merged_aliases.append(alias)
@@ -1505,9 +1487,7 @@ class ExtractionOrchestrator:
info.aliases = merged_aliases
logger.info(f"同步合并后 aliases 到 end_user_info: {merged_aliases}")
else:
first_alias = current_aliases[0].strip() if current_aliases else (
deduped_aliases[0].strip() if deduped_aliases else ""
)
first_alias = current_aliases[0].strip() if current_aliases else ""
# 确保 first_alias 不是占位名称
if first_alias and first_alias.lower() not in self.USER_PLACEHOLDER_NAMES:
db.add(EndUserInfo(

View File

@@ -112,22 +112,23 @@ class RedBearModelFactory:
params["stream_usage"] = True
# 深度思考模式
is_streaming = bool(config.extra_params.get("streaming"))
if is_streaming and not config.is_omni:
if provider == ModelProvider.VOLCANO:
# 火山引擎深度思考仅流式调用支持,非流式时不传 thinking 参数
thinking_config: Dict[str, Any] = {
"type": "enabled" if config.deep_thinking else "disabled"
}
if config.deep_thinking and config.thinking_budget_tokens:
thinking_config["budget_tokens"] = config.thinking_budget_tokens
params["extra_body"] = {"thinking": thinking_config}
else:
# 始终显式传递 enable_thinking不支持该参数的模型如 DeepSeek-R1会直接忽略
model_kwargs: Dict[str, Any] = config.extra_params.get("model_kwargs", {})
model_kwargs["enable_thinking"] = config.deep_thinking
if config.deep_thinking and config.thinking_budget_tokens:
model_kwargs["thinking_budget"] = config.thinking_budget_tokens
params["model_kwargs"] = model_kwargs
if config.support_thinking:
if is_streaming and not config.is_omni:
if provider == ModelProvider.VOLCANO:
# 火山引擎深度思考仅流式调用支持,非流式时不传 thinking 参数
thinking_config: Dict[str, Any] = {
"type": "enabled" if config.deep_thinking else "disabled"
}
if config.deep_thinking and config.thinking_budget_tokens:
thinking_config["budget_tokens"] = config.thinking_budget_tokens
params["extra_body"] = {"thinking": thinking_config}
else:
# 始终显式传递 enable_thinking不支持该参数的模型如 DeepSeek-R1会直接忽略
model_kwargs: Dict[str, Any] = config.extra_params.get("model_kwargs", {})
model_kwargs["enable_thinking"] = config.deep_thinking
if config.deep_thinking and config.thinking_budget_tokens:
model_kwargs["thinking_budget"] = config.thinking_budget_tokens
params["model_kwargs"] = model_kwargs
return params
elif provider == ModelProvider.DASHSCOPE:
params = {

View File

@@ -672,10 +672,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
excel_parser = ExcelParser()
if parser_config.get("html4excel") and parser_config.get("html4excel").lower() == "true":
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
parser_config["chunk_token_num"] = 0
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
callback(0.8, "Finish parsing.")
# Excel 每行直接作为一个 chunk不经过 naive_merge 避免被 delimiter 拆分
chunks = [s for s, _ in sections]
res.extend(tokenize_chunks(chunks, doc, is_english, None))
res.extend(embed_res)
res.extend(url_res)
return res
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")

View File

@@ -232,14 +232,14 @@ class RAGExcelParser:
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
line = "\n".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
line += "\n——" + sheetname
res.append(line)
else:
# 只有表头的情况
if header_fields:
line = "; ".join(header_fields)
line = "\n".join(header_fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)

View File

@@ -50,7 +50,9 @@ class OpenAIEmbed(Base):
def encode(self, texts: list):
# OpenAI requires batch size <=16
batch_size = 16
texts = [truncate(t, 8191) for t in texts]
# Use 8000 instead of 8191 to leave safety margin for tokenizer differences
# between cl100k_base (used by truncate) and the actual embedding model
texts = [truncate(t, 8000) for t in texts]
ress = []
total_tokens = 0
for i in range(0, len(texts), batch_size):
@@ -63,7 +65,7 @@ class OpenAIEmbed(Base):
return np.array(ress), total_tokens
def encode_queries(self, text):
res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
return np.array(res.data[0].embedding), self.total_token_count(res)
@@ -79,6 +81,7 @@ class LocalAIEmbed(Base):
def encode(self, texts: list):
batch_size = 16
texts = [truncate(t, 8000) for t in texts]
ress = []
for i in range(0, len(texts), batch_size):
res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name)
@@ -173,6 +176,7 @@ class XinferenceEmbed(Base):
def encode(self, texts: list):
batch_size = 16
texts = [truncate(t, 8000) for t in texts]
ress = []
total_tokens = 0
for i in range(0, len(texts), batch_size):
@@ -188,7 +192,7 @@ class XinferenceEmbed(Base):
def encode_queries(self, text):
res = None
try:
res = self.client.embeddings.create(input=[text], model=self.model_name)
res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name)
return np.array(res.data[0].embedding), self.total_token_count(res)
except Exception as _e:
log_exception(_e, res)

View File

@@ -72,8 +72,9 @@ class HttpContentTypeConfig(BaseModel):
@classmethod
def validate_data(cls, v, info):
content_type = info.data.get("content_type")
if content_type == HttpContentType.FROM_DATA and not isinstance(v, HttpFormData):
raise ValueError("When content_type is 'form-data', data must be of type HttpFormData")
if content_type == HttpContentType.FROM_DATA and (
not isinstance(v, list) or not all(isinstance(item, HttpFormData) for item in v)):
raise ValueError("When content_type is 'form-data', data must be a list of HttpFormData")
elif content_type in [HttpContentType.JSON] and not isinstance(v, str):
raise ValueError("When content_type is JSON, data must be of type str")
elif content_type in [HttpContentType.WWW_FORM] and not isinstance(v, dict):

View File

@@ -260,17 +260,22 @@ class HttpRequestNode(BaseNode):
))
case HttpContentType.FROM_DATA:
data = {}
content["files"] = {}
files = []
for item in self.typed_config.body.data:
key = self._render_template(item.key, variable_pool)
if item.type == "text":
data[self._render_template(item.key, variable_pool)] = self._render_template(item.value,
variable_pool)
data[key] = self._render_template(item.value, variable_pool)
elif item.type == "file":
content["files"][self._render_template(item.key, variable_pool)] = (
uuid.uuid4().hex,
await variable_pool.get_instance(item.value).get_content()
)
file_instance = variable_pool.get_instance(item.value)
if isinstance(file_instance, ArrayVariable):
for v in file_instance.value:
if isinstance(v, FileVariable):
files.append((key, (uuid.uuid4().hex, await v.get_content())))
elif isinstance(file_instance, FileVariable):
files.append((key, (uuid.uuid4().hex, await file_instance.get_content())))
content["data"] = data
if files:
content["files"] = files
case HttpContentType.BINARY:
content["files"] = []
file_instence = variable_pool.get_instance(self.typed_config.body.data)

View File

@@ -84,7 +84,7 @@ class FileVariable(BaseVariable):
total_bytes = 0
chunks = []
async with httpx.AsyncClient() as client:
async with httpx.AsyncClient(follow_redirects=True) as client:
async with client.stream("GET", self.value.url) as resp:
resp.raise_for_status()
async for chunk in resp.aiter_bytes(8192):

View File

@@ -5,16 +5,9 @@ Implicit Emotions Storage Repository
事务由调用方控制,仓储层只使用 flush/refresh
"""
import logging
from datetime import date, datetime, timezone
from datetime import datetime, timedelta, timezone
from typing import Generator, Optional
class TimeFilterUnavailableError(Exception):
"""redis_client 不可用,无法执行时间轴筛选。
调用方捕获此异常后可选择回退到 get_all_user_ids 进行全量处理。
"""
import redis
from sqlalchemy import exists, not_, select
from sqlalchemy.orm import Session
@@ -25,6 +18,13 @@ from app.models.implicit_emotions_storage_model import ImplicitEmotionsStorage
logger = logging.getLogger(__name__)
class TimeFilterUnavailableError(Exception):
    """Signals that redis_client is unavailable, so time-axis filtering cannot run.

    Callers that catch this exception may choose to fall back to
    get_all_user_ids and process every user instead.
    """
class ImplicitEmotionsStorageRepository:
"""隐性记忆和情绪存储仓储类"""
@@ -216,9 +216,7 @@ class ImplicitEmotionsStorageRepository:
"""
from sqlalchemy import String as SAString
from sqlalchemy import cast
CST = timezone(timedelta(hours=8))
now_cst = datetime.now(CST)
today_start = now_cst.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(timezone.utc).replace(tzinfo=None)
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
tomorrow_start = today_start + timedelta(days=1)
offset = 0
while True:

View File

@@ -93,6 +93,8 @@ SET e.name = CASE WHEN entity.name IS NOT NULL AND entity.name <> '' THEN entity
END,
e.statement_id = CASE WHEN entity.statement_id IS NOT NULL AND entity.statement_id <> '' THEN entity.statement_id ELSE e.statement_id END,
e.aliases = CASE
// 用户实体的 aliases 由 PgSQL end_user_info 作为唯一权威源,知识抽取完全不写入
WHEN entity.name IN ['用户', '', 'User', 'I'] THEN e.aliases
WHEN entity.aliases IS NOT NULL AND size(entity.aliases) > 0
THEN CASE
WHEN e.aliases IS NULL THEN entity.aliases

View File

@@ -77,11 +77,11 @@ class Neo4jConnector:
"""
await self.driver.close()
async def execute_query(self, query: str, json_format=False, **kwargs: Any) -> List[Dict[str, Any]]:
async def execute_query(self, cypher: str, json_format=False, **kwargs: Any) -> List[Dict[str, Any]]:
"""执行Cypher查询
Args:
query: Cypher查询语句
cypher: Cypher查询语句
json_format: json格式化
**kwargs: 查询参数将作为参数传递给Cypher查询
@@ -92,7 +92,7 @@ class Neo4jConnector:
"""
result = await self.driver.execute_query(
query,
cypher,
database="neo4j",
**kwargs
)

View File

@@ -297,6 +297,10 @@ def get_user_by_id(db: Session, user_id: uuid.UUID) -> Optional[User]:
"""根据ID获取用户"""
return UserRepository(db).get_user_by_id(user_id)
def get_user_by_id_regardless_active(db: Session, user_id: uuid.UUID) -> Optional[User]:
    """Fetch a user by ID without filtering on is_active.

    Intended for enable/disable scenarios. Returns the first ``User`` row
    whose ``id`` equals ``user_id``, or ``None`` when the query yields nothing.
    """
    matching_users = db.query(User).filter(User.id == user_id)
    return matching_users.first()
def get_user_by_email(db: Session, email: str) -> Optional[User]:
    """Fetch a user by email address by delegating to ``UserRepository``."""
    repository = UserRepository(db)
    return repository.get_user_by_email(email)

View File

@@ -73,15 +73,14 @@ class AppDslService:
AppType.MULTI_AGENT: "multi_agent_config",
AppType.WORKFLOW: "workflow"
}.get(app.type, "config")
config_data = self._enrich_release_config(app.type, release.config or {})
config_data = self._enrich_release_config(app.type, release.config or {}, release.default_model_config_id)
dsl = {**meta, "app": app_meta, config_key: config_data}
return yaml.dump(dsl, default_flow_style=False, allow_unicode=True), f"{release.name}_v{release.version_name}.yaml"
def _enrich_release_config(self, app_type: str, cfg: dict) -> dict:
def _enrich_release_config(self, app_type: str, cfg: dict, default_model_config_id=None) -> dict:
if app_type == AppType.AGENT:
enriched = {**cfg}
if "default_model_config_id" in cfg:
enriched["default_model_config_ref"] = self._model_ref(cfg["default_model_config_id"])
enriched["default_model_config_ref"] = self._model_ref(default_model_config_id)
if "knowledge_retrieval" in cfg:
enriched["knowledge_retrieval"] = self._enrich_knowledge_retrieval(cfg["knowledge_retrieval"])
if "tools" in cfg:
@@ -91,8 +90,7 @@ class AppDslService:
return enriched
if app_type == AppType.MULTI_AGENT:
enriched = {**cfg}
if "default_model_config_id" in cfg:
enriched["default_model_config_ref"] = self._model_ref(cfg["default_model_config_id"])
enriched["default_model_config_ref"] = self._model_ref(default_model_config_id)
if "master_agent_id" in cfg:
enriched["master_agent_ref"] = self._release_ref(cfg["master_agent_id"])
if "sub_agents" in cfg:

View File

@@ -679,9 +679,9 @@ class EmotionAnalyticsService:
# 查询用户的实体和标签
query = """
MATCH (e:Entity)
MATCH (e:ExtractedEntity)
WHERE e.end_user_id = $end_user_id
RETURN e.name as name, e.type as type
RETURN e.name as name, e.entity_type as type
ORDER BY e.created_at DESC
LIMIT 20
"""

View File

@@ -34,6 +34,7 @@ from app.schemas.implicit_memory_schema import (
UserMemorySummary,
)
from app.schemas.memory_config_schema import MemoryConfig
from app.services.memory_base_service import MIN_MEMORY_SUMMARY_COUNT
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@@ -379,12 +380,59 @@ class ImplicitMemoryService:
raise
def _build_empty_profile(self) -> dict:
"""构建 MemorySummary 不足时返回的固定空白画像数据"""
now_ms = int(datetime.utcnow().timestamp() * 1000)
insufficient = "Insufficient data for analysis"
def _empty_dimension(name: str) -> dict:
return {
"evidence": [insufficient],
"reasoning": f"No clear evidence found for {name} dimension",
"percentage": 0.0,
"dimension_name": name,
"confidence_level": 20,
}
def _empty_category(name: str) -> dict:
return {
"evidence": [insufficient],
"percentage": 25.0,
"category_name": name,
"trending_direction": None,
}
return {
"habits": [],
"portrait": {
"aesthetic": _empty_dimension("aesthetic"),
"creativity": _empty_dimension("creativity"),
"literature": _empty_dimension("literature"),
"technology": _empty_dimension("technology"),
"historical_trends": None,
"analysis_timestamp": now_ms,
"total_summaries_analyzed": 0,
},
"preferences": [],
"interest_areas": {
"art": _empty_category("art"),
"tech": _empty_category("tech"),
"music": _empty_category("music"),
"lifestyle": _empty_category("lifestyle"),
"analysis_timestamp": now_ms,
"total_summaries_analyzed": 0,
},
}
async def generate_complete_profile(
self,
user_id: str
) -> dict:
"""生成完整的用户画像包含所有4个模块
需要该用户的 MemorySummary 节点数量 >= 5 才会真正调用 LLM 生成画像,
否则返回固定的空白画像数据。
Args:
user_id: 用户ID
@@ -394,6 +442,16 @@ class ImplicitMemoryService:
logger.info(f"生成完整用户画像: user={user_id}")
try:
# 前置检查:查询该用户有效的 MemorySummary 节点数量(排除孤立节点)
from app.services.memory_base_service import MemoryBaseService
base_service = MemoryBaseService()
memory_summary_count = await base_service.get_valid_memory_summary_count(user_id)
logger.info(f"用户 MemorySummary 节点数量: {memory_summary_count} (user={user_id})")
if memory_summary_count < MIN_MEMORY_SUMMARY_COUNT:
logger.info(f"MemorySummary 数量不足 {MIN_MEMORY_SUMMARY_COUNT}(当前 {memory_summary_count}),返回空白画像: user={user_id}")
return self._build_empty_profile()
# 并行调用4个分析方法
preferences, portrait, interest_areas, habits = await asyncio.gather(
self.get_preference_tags(user_id=user_id),

View File

@@ -265,12 +265,50 @@ async def Translation_English(modid, text, fields=None):
# 其他类型数字、布尔值、None等原样返回
else:
return text
# 隐性记忆画像生成所需的最低 MemorySummary 节点数量
MIN_MEMORY_SUMMARY_COUNT = 5
class MemoryBaseService:
"""记忆服务基类,提供共享的辅助方法"""
def __init__(self):
self.neo4j_connector = Neo4jConnector()
async def get_valid_memory_summary_count(
    self,
    end_user_id: str
) -> int:
    """Count the user's valid MemorySummary nodes, excluding orphan nodes.

    Only MemorySummary nodes that have a DERIVED_FROM_STATEMENT relationship
    to a Statement node are counted.

    Args:
        end_user_id: ID of the end user.

    Returns:
        The number of valid MemorySummary nodes, or 0 when the query returns
        no rows or raises.
    """
    # Same query text as before, assembled from adjacent literals.
    cypher = (
        "\n"
        "MATCH (n:MemorySummary)-[:DERIVED_FROM_STATEMENT]->(:Statement)\n"
        "WHERE n.end_user_id = $end_user_id\n"
        "RETURN count(DISTINCT n) as count\n"
    )
    try:
        rows = await self.neo4j_connector.execute_query(
            cypher, end_user_id=end_user_id
        )
        if rows and len(rows) > 0:
            valid_count = rows[0]["count"]
        else:
            valid_count = 0
        logger.debug(
            f"有效 MemorySummary 节点数量: {valid_count} (end_user_id={end_user_id})"
        )
        return valid_count
    except Exception as exc:
        # Best effort: any failure is logged and reported as zero.
        logger.error(
            f"获取有效 MemorySummary 数量失败: {str(exc)}", exc_info=True
        )
        return 0
@staticmethod
def parse_timestamp(timestamp_value) -> Optional[int]:
"""

View File

@@ -227,10 +227,20 @@ class PromptOptimizerService:
content = getattr(chunk, "content", chunk)
if not content:
continue
buffer += content
if isinstance(content, str):
buffer += content
elif isinstance(content, list):
for _ in content:
buffer += _["text"]
else:
logger.error(f"Unsupported content type - {content}")
raise Exception("Unsupported content type")
cache = buffer[:-20]
last_idx = 19
while cache and cache[-1] == '\\' and last_idx > 0:
cache = buffer[:-last_idx]
last_idx -= 1
# 尝试找到 "prompt": " 开始位置
if prompt_finished:
continue
@@ -272,7 +282,7 @@ class PromptOptimizerService:
def parser_prompt_variables(prompt: str):
try:
pattern = r'\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}'
matches = re.findall(pattern, prompt)
matches = re.findall(pattern, str(prompt))
variables = list(set(matches))
return variables
except Exception as e:

View File

@@ -14,6 +14,7 @@ from pydantic import BaseModel, Field
from sqlalchemy.orm import Session
from app.core.logging_config import get_logger
from app.core.memory.storage_services.extraction_engine.deduplication.deduped_and_disamb import _USER_PLACEHOLDER_NAMES
from app.core.memory.utils.llm.llm_utils import MemoryClientFactory
from app.db import get_db_context
from app.repositories.conversation_repository import ConversationRepository
@@ -21,7 +22,7 @@ from app.repositories.end_user_repository import EndUserRepository
from app.repositories.neo4j.cypher_queries import Graph_Node_query
from app.repositories.neo4j.neo4j_connector import Neo4jConnector
from app.schemas.memory_episodic_schema import EmotionSubject, EmotionType, type_mapping
from app.services.memory_base_service import MemoryBaseService
from app.services.memory_base_service import MemoryBaseService, MIN_MEMORY_SUMMARY_COUNT
from app.services.memory_config_service import MemoryConfigService
from app.services.memory_perceptual_service import MemoryPerceptualService
from app.services.memory_short_service import ShortService
@@ -477,7 +478,7 @@ class UserMemoryService:
allowed_fields = {'other_name', 'aliases', 'meta_data'}
# 用户占位名称黑名单,不允许作为 other_name 或出现在 aliases 中
_user_placeholder_names = {'用户', '', 'User', 'I'}
_user_placeholder_names = _USER_PLACEHOLDER_NAMES
# 过滤 other_name不允许设置为占位名称
if 'other_name' in update_data and update_data['other_name'] and update_data['other_name'].strip() in _user_placeholder_names:
@@ -1504,7 +1505,7 @@ async def analytics_memory_types(
2. 工作记忆 (WORKING_MEMORY) = 会话数量(通过 ConversationRepository.get_conversation_by_user_id 获取)
3. 短期记忆 (SHORT_TERM_MEMORY) = /short_term 接口返回的问答对数量
4. 显性记忆 (EXPLICIT_MEMORY) = 情景记忆 + 语义记忆(通过 MemoryBaseService.get_explicit_memory_count 获取)
5. 隐性记忆 (IMPLICIT_MEMORY) = Statement 节点数量的三分之一
5. 隐性记忆 (IMPLICIT_MEMORY) = MemorySummary 节点数量(需 >= MIN_MEMORY_SUMMARY_COUNT 才显示,否则为 0
6. 情绪记忆 (EMOTIONAL_MEMORY) = 情绪标签统计总数(通过 MemoryBaseService.get_emotional_memory_count 获取)
7. 情景记忆 (EPISODIC_MEMORY) = memory_summary通过 MemoryBaseService.get_episodic_memory_count 获取)
8. 遗忘记忆 (FORGET_MEMORY) = 激活值低于阈值的节点数(通过 MemoryBaseService.get_forget_memory_count 获取)
@@ -1561,23 +1562,15 @@ async def analytics_memory_types(
logger.warning(f"获取会话数量失败工作记忆数量设为0: {str(e)}")
work_count = 0
# 获取隐性记忆数量(基于 Statement 节点数量的三分之一
# 获取隐性记忆数量(基于有关联关系的 MemorySummary 节点数量,需 >= MIN_MEMORY_SUMMARY_COUNT 才计入
implicit_count = 0
if end_user_id:
try:
# 查询 Statement 节点数量
query = """
MATCH (n:Statement)
WHERE n.end_user_id = $end_user_id
RETURN count(n) as count
"""
result = await _neo4j_connector.execute_query(query, end_user_id=end_user_id)
statement_count = result[0]["count"] if result and len(result) > 0 else 0
# 取三分之一作为隐性记忆数量
implicit_count = round(statement_count / 3)
logger.debug(f"隐性记忆数量Statement数量的1/3: {implicit_count} (Statement总数={statement_count}, end_user_id={end_user_id})")
memory_summary_count = await base_service.get_valid_memory_summary_count(end_user_id)
implicit_count = memory_summary_count if memory_summary_count >= MIN_MEMORY_SUMMARY_COUNT else 0
logger.debug(f"隐性记忆数量有效MemorySummary节点数: {implicit_count} (有效MemorySummary总数={memory_summary_count}, end_user_id={end_user_id})")
except Exception as e:
logger.warning(f"获取Statement数量失败隐性记忆数量设为0: {str(e)}")
logger.warning(f"获取MemorySummary数量失败隐性记忆数量设为0: {str(e)}")
implicit_count = 0
# 原有的基于行为习惯的统计方式(已注释)
@@ -1643,7 +1636,7 @@ async def analytics_memory_types(
"WORKING_MEMORY": work_count, # 工作记忆(基于会话数量)
"SHORT_TERM_MEMORY": short_term_count, # 短期记忆(基于问答对数量)
"EXPLICIT_MEMORY": explicit_count, # 显性记忆(情景记忆 + 语义记忆)
"IMPLICIT_MEMORY": implicit_count, # 隐性记忆(Statement数量的1/3
"IMPLICIT_MEMORY": implicit_count, # 隐性记忆(MemorySummary节点数需>=MIN_MEMORY_SUMMARY_COUNT
"EMOTIONAL_MEMORY": emotion_count, # 情绪记忆(使用情绪标签统计)
"EPISODIC_MEMORY": episodic_count, # 情景记忆
"FORGET_MEMORY": forget_count # 遗忘记忆(激活值低于阈值)

View File

@@ -285,7 +285,7 @@ def activate_user(db: Session, user_id_to_activate: uuid.UUID, current_user: Use
try:
# 查找用户
business_logger.debug(f"查找待激活用户: {user_id_to_activate}")
db_user = user_repository.get_user_by_id(db, user_id=user_id_to_activate)
db_user = user_repository.get_user_by_id_regardless_active(db, user_id=user_id_to_activate)
if not db_user:
business_logger.warning(f"用户不存在: {user_id_to_activate}")
raise BusinessException("用户不存在", code=BizCode.USER_NOT_FOUND)