feat(memory): add async user metadata extraction pipeline
- Add MetadataExtractor to collect user-related statements post-dedup and extract profile/behavioral metadata via an independent LLM call
- Add Celery task (extract_user_metadata) routed to the memory_tasks queue
- Add metadata models (UserMetadata, UserMetadataProfile, etc.)
- Add metadata utility functions (clean, validate, merge with _op support)
- Add Jinja2 prompt template for metadata extraction (zh/en)
- Fix Lucene query parameter naming: rename `q` to `query` across all Cypher queries, graph_search functions, and callers
- Escape `/` in Lucene queries to prevent TokenMgrError
- Add `speaker` field to ChunkNode and persist it in Neo4j
- Remove unused imports (argparse, os, UUID) in search.py
- Fix unnecessary db context nesting in interest distribution task
This commit is contained in:
@@ -23,6 +23,7 @@ SET s += {
|
||||
end_user_id: statement.end_user_id,
|
||||
stmt_type: statement.stmt_type,
|
||||
statement: statement.statement,
|
||||
speaker: statement.speaker,
|
||||
emotion_intensity: statement.emotion_intensity,
|
||||
emotion_target: statement.emotion_target,
|
||||
emotion_subject: statement.emotion_subject,
|
||||
@@ -56,6 +57,7 @@ SET c += {
|
||||
expired_at: chunk.expired_at,
|
||||
dialog_id: chunk.dialog_id,
|
||||
content: chunk.content,
|
||||
speaker: chunk.speaker,
|
||||
chunk_embedding: chunk.chunk_embedding,
|
||||
sequence_number: chunk.sequence_number,
|
||||
start_index: chunk.start_index,
|
||||
@@ -283,7 +285,7 @@ LIMIT $limit
|
||||
"""
|
||||
|
||||
SEARCH_STATEMENTS_BY_KEYWORD = """
|
||||
CALL db.index.fulltext.queryNodes("statementsFulltext", $q) YIELD node AS s, score
|
||||
CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score
|
||||
WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
|
||||
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
|
||||
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
|
||||
@@ -307,7 +309,7 @@ LIMIT $limit
|
||||
"""
|
||||
# 查询实体名称包含指定字符串的实体
|
||||
SEARCH_ENTITIES_BY_NAME = """
|
||||
CALL db.index.fulltext.queryNodes("entitiesFulltext", $q) YIELD node AS e, score
|
||||
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
|
||||
WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
|
||||
OPTIONAL MATCH (s:Statement)-[:REFERENCES_ENTITY]->(e)
|
||||
OPTIONAL MATCH (c:Chunk)-[:CONTAINS]->(s)
|
||||
@@ -337,21 +339,21 @@ LIMIT $limit
|
||||
"""
|
||||
|
||||
SEARCH_ENTITIES_BY_NAME_OR_ALIAS = """
|
||||
CALL db.index.fulltext.queryNodes("entitiesFulltext", $q) YIELD node AS e, score
|
||||
CALL db.index.fulltext.queryNodes("entitiesFulltext", $query) YIELD node AS e, score
|
||||
WHERE ($end_user_id IS NULL OR e.end_user_id = $end_user_id)
|
||||
WITH e, score
|
||||
WITH collect({entity: e, score: score}) AS fulltextResults
|
||||
With collect({entity: e, score: score}) AS fulltextResults
|
||||
|
||||
OPTIONAL MATCH (ae:ExtractedEntity)
|
||||
WHERE ($end_user_id IS NULL OR ae.end_user_id = $end_user_id)
|
||||
AND ae.aliases IS NOT NULL
|
||||
AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($q))
|
||||
AND ANY(alias IN ae.aliases WHERE toLower(alias) CONTAINS toLower($query))
|
||||
WITH fulltextResults, collect(ae) AS aliasEntities
|
||||
|
||||
UNWIND (fulltextResults + [x IN aliasEntities | {entity: x, score:
|
||||
CASE
|
||||
WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($q)) THEN 1.0
|
||||
WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($q)) THEN 0.9
|
||||
WHEN ANY(alias IN x.aliases WHERE toLower(alias) = toLower($query)) THEN 1.0
|
||||
WHEN ANY(alias IN x.aliases WHERE toLower(alias) STARTS WITH toLower($query)) THEN 0.9
|
||||
ELSE 0.8
|
||||
END
|
||||
}]) AS row
|
||||
@@ -384,7 +386,7 @@ LIMIT $limit
|
||||
|
||||
|
||||
SEARCH_CHUNKS_BY_CONTENT = """
|
||||
CALL db.index.fulltext.queryNodes("chunksFulltext", $q) YIELD node AS c, score
|
||||
CALL db.index.fulltext.queryNodes("chunksFulltext", $query) YIELD node AS c, score
|
||||
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
|
||||
OPTIONAL MATCH (c)-[:CONTAINS]->(s:Statement)
|
||||
OPTIONAL MATCH (s)-[:REFERENCES_ENTITY]->(e:ExtractedEntity)
|
||||
@@ -501,7 +503,7 @@ LIMIT $limit
|
||||
"""
|
||||
|
||||
SEARCH_STATEMENTS_BY_KEYWORD_TEMPORAL = """
|
||||
CALL db.index.fulltext.queryNodes("statementsFulltext", $q) YIELD node AS s, score
|
||||
CALL db.index.fulltext.queryNodes("statementsFulltext", $query) YIELD node AS s, score
|
||||
WHERE ($end_user_id IS NULL OR s.end_user_id = $end_user_id)
|
||||
AND ((($start_date IS NULL OR (s.created_at IS NOT NULL AND datetime(s.created_at) >= datetime($start_date)))
|
||||
AND ($end_date IS NULL OR (s.created_at IS NOT NULL AND datetime(s.created_at) <= datetime($end_date))))
|
||||
@@ -677,7 +679,7 @@ SET n.invalid_at = $new_invalid_at
|
||||
|
||||
# MemorySummary keyword search using fulltext index
|
||||
SEARCH_MEMORY_SUMMARIES_BY_KEYWORD = """
|
||||
CALL db.index.fulltext.queryNodes("summariesFulltext", $q) YIELD node AS m, score
|
||||
CALL db.index.fulltext.queryNodes("summariesFulltext", $query) YIELD node AS m, score
|
||||
WHERE ($end_user_id IS NULL OR m.end_user_id = $end_user_id)
|
||||
OPTIONAL MATCH (m)-[:DERIVED_FROM_STATEMENT]->(s:Statement)
|
||||
RETURN m.id AS id,
|
||||
@@ -1363,7 +1365,7 @@ RETURN c.community_id AS community_id
|
||||
|
||||
# Community keyword search: matches name or summary via fulltext index
|
||||
SEARCH_COMMUNITIES_BY_KEYWORD = """
|
||||
CALL db.index.fulltext.queryNodes("communitiesFulltext", $q) YIELD node AS c, score
|
||||
CALL db.index.fulltext.queryNodes("communitiesFulltext", $query) YIELD node AS c, score
|
||||
WHERE ($end_user_id IS NULL OR c.end_user_id = $end_user_id)
|
||||
RETURN c.community_id AS id,
|
||||
c.name AS name,
|
||||
@@ -1451,7 +1453,7 @@ RETURN elementId(r) AS uuid
|
||||
"""
|
||||
|
||||
SEARCH_PERCEPTUAL_BY_KEYWORD = """
|
||||
CALL db.index.fulltext.queryNodes("perceptualFulltext", $q) YIELD node AS p, score
|
||||
CALL db.index.fulltext.queryNodes("perceptualFulltext", $query) YIELD node AS p, score
|
||||
WHERE p.end_user_id = $end_user_id
|
||||
RETURN p.id AS id,
|
||||
p.end_user_id AS end_user_id,
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.memory.utils.data.text_utils import escape_lucene_query
|
||||
from app.repositories.neo4j.cypher_queries import (
|
||||
CHUNK_EMBEDDING_SEARCH,
|
||||
COMMUNITY_EMBEDDING_SEARCH,
|
||||
@@ -87,7 +88,7 @@ async def _update_activation_values_batch(
|
||||
unique_node_ids.append(node_id)
|
||||
|
||||
if not unique_node_ids:
|
||||
logger.warning(f"批量更新激活值:没有有效的节点ID")
|
||||
logger.warning("批量更新激活值:没有有效的节点ID")
|
||||
return nodes
|
||||
|
||||
# 记录去重信息(仅针对具有有效 ID 的节点)
|
||||
@@ -223,7 +224,7 @@ async def _update_search_results_activation(
|
||||
|
||||
async def search_graph(
|
||||
connector: Neo4jConnector,
|
||||
q: str,
|
||||
query: str,
|
||||
end_user_id: Optional[str] = None,
|
||||
limit: int = 50,
|
||||
include: List[str] = None,
|
||||
@@ -234,14 +235,14 @@ async def search_graph(
|
||||
OPTIMIZED: Runs all queries in parallel using asyncio.gather()
|
||||
INTEGRATED: Updates activation values for knowledge nodes before returning results
|
||||
|
||||
- Statements: matches s.statement CONTAINS q
|
||||
- Entities: matches e.name CONTAINS q
|
||||
- Chunks: matches s.content CONTAINS q (from Statement nodes)
|
||||
- Summaries: matches ms.content CONTAINS q
|
||||
- Statements: matches s.statement CONTAINS query
|
||||
- Entities: matches e.name CONTAINS query
|
||||
- Chunks: matches s.content CONTAINS query (from Statement nodes)
|
||||
- Summaries: matches ms.content CONTAINS query
|
||||
|
||||
Args:
|
||||
connector: Neo4j connector
|
||||
q: Query text
|
||||
query: Query text for full-text search
|
||||
end_user_id: Optional group filter
|
||||
limit: Max results per category
|
||||
include: List of categories to search (default: all)
|
||||
@@ -252,6 +253,9 @@ async def search_graph(
|
||||
if include is None:
|
||||
include = ["statements", "chunks", "entities", "summaries"]
|
||||
|
||||
# Escape Lucene special characters to prevent query parse errors
|
||||
escaped_query = escape_lucene_query(query)
|
||||
|
||||
# Prepare tasks for parallel execution
|
||||
tasks = []
|
||||
task_keys = []
|
||||
@@ -260,7 +264,7 @@ async def search_graph(
|
||||
tasks.append(connector.execute_query(
|
||||
SEARCH_STATEMENTS_BY_KEYWORD,
|
||||
json_format=True,
|
||||
q=q,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
))
|
||||
@@ -270,7 +274,7 @@ async def search_graph(
|
||||
tasks.append(connector.execute_query(
|
||||
SEARCH_ENTITIES_BY_NAME_OR_ALIAS,
|
||||
json_format=True,
|
||||
q=q,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
))
|
||||
@@ -280,7 +284,7 @@ async def search_graph(
|
||||
tasks.append(connector.execute_query(
|
||||
SEARCH_CHUNKS_BY_CONTENT,
|
||||
json_format=True,
|
||||
q=q,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
))
|
||||
@@ -290,7 +294,7 @@ async def search_graph(
|
||||
tasks.append(connector.execute_query(
|
||||
SEARCH_MEMORY_SUMMARIES_BY_KEYWORD,
|
||||
json_format=True,
|
||||
q=q,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
))
|
||||
@@ -300,7 +304,7 @@ async def search_graph(
|
||||
tasks.append(connector.execute_query(
|
||||
SEARCH_COMMUNITIES_BY_KEYWORD,
|
||||
json_format=True,
|
||||
q=q,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
))
|
||||
@@ -482,7 +486,7 @@ async def search_graph_by_embedding(
|
||||
update_time = time.time() - update_start
|
||||
logger.info(f"[PERF] Activation value updates took: {update_time:.4f}s")
|
||||
else:
|
||||
logger.info(f"[PERF] Skipping activation updates (only summaries)")
|
||||
logger.info("[PERF] Skipping activation updates (only summaries)")
|
||||
|
||||
return results
|
||||
|
||||
@@ -520,7 +524,7 @@ async def get_dedup_candidates_for_entities( # 适配新版查询:使用全
|
||||
# 全文索引按名称检索(包含 CONTAINS 语义)
|
||||
rows = await connector.execute_query(
|
||||
SEARCH_ENTITIES_BY_NAME,
|
||||
q=name,
|
||||
query=escape_lucene_query(name),
|
||||
end_user_id=end_user_id,
|
||||
limit=100,
|
||||
)
|
||||
@@ -544,7 +548,7 @@ async def get_dedup_candidates_for_entities( # 适配新版查询:使用全
|
||||
try:
|
||||
rows = await connector.execute_query(
|
||||
SEARCH_ENTITIES_BY_NAME,
|
||||
q=name.lower(),
|
||||
query=escape_lucene_query(name.lower()),
|
||||
end_user_id=end_user_id,
|
||||
limit=100,
|
||||
)
|
||||
@@ -593,11 +597,12 @@ async def search_graph_by_keyword_temporal(
|
||||
- Returns up to 'limit' statements
|
||||
"""
|
||||
if not query_text:
|
||||
logger.warning(f"query_text不能为空")
|
||||
logger.warning("query_text不能为空")
|
||||
return {"statements": []}
|
||||
escaped_query = escape_lucene_query(query_text)
|
||||
statements = await connector.execute_query(
|
||||
SEARCH_STATEMENTS_BY_KEYWORD_TEMPORAL,
|
||||
q=query_text,
|
||||
query=escaped_query,
|
||||
end_user_id=end_user_id,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
@@ -671,7 +676,7 @@ async def search_graph_by_dialog_id(
|
||||
- Returns up to 'limit' dialogues
|
||||
"""
|
||||
if not dialog_id:
|
||||
logger.warning(f"dialog_id不能为空")
|
||||
logger.warning("dialog_id不能为空")
|
||||
return {"dialogues": []}
|
||||
|
||||
dialogues = await connector.execute_query(
|
||||
@@ -690,7 +695,7 @@ async def search_graph_by_chunk_id(
|
||||
limit: int = 1,
|
||||
) -> Dict[str, List[Dict[str, Any]]]:
|
||||
if not chunk_id:
|
||||
logger.warning(f"chunk_id不能为空")
|
||||
logger.warning("chunk_id不能为空")
|
||||
return {"chunks": []}
|
||||
chunks = await connector.execute_query(
|
||||
SEARCH_CHUNK_BY_CHUNK_ID,
|
||||
@@ -968,7 +973,7 @@ async def search_graph_l_valid_at(
|
||||
|
||||
async def search_perceptual(
|
||||
connector: Neo4jConnector,
|
||||
q: str,
|
||||
query: str,
|
||||
end_user_id: Optional[str] = None,
|
||||
limit: int = 10,
|
||||
) -> Dict[str, List[Dict[str, Any]]]:
|
||||
@@ -979,7 +984,7 @@ async def search_perceptual(
|
||||
|
||||
Args:
|
||||
connector: Neo4j connector
|
||||
q: Query text
|
||||
query: Query text for full-text search
|
||||
end_user_id: Optional user filter
|
||||
limit: Max results
|
||||
|
||||
@@ -989,7 +994,7 @@ async def search_perceptual(
|
||||
try:
|
||||
perceptuals = await connector.execute_query(
|
||||
SEARCH_PERCEPTUAL_BY_KEYWORD,
|
||||
q=q,
|
||||
query=escape_lucene_query(query),
|
||||
end_user_id=end_user_id,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user