config_id字段改成UUID
This commit is contained in:
@@ -1,44 +1,54 @@
|
||||
|
||||
from app.core.memory.agent.utils.llm_tools import WriteState
|
||||
from app.core.memory.agent.utils.llm_tools import WriteState
|
||||
from app.core.memory.agent.utils.write_tools import write
|
||||
from app.core.logging_config import get_agent_logger
|
||||
|
||||
logger = get_agent_logger(__name__)
|
||||
|
||||
|
||||
async def write_node(state: WriteState) -> WriteState:
|
||||
"""
|
||||
Write data to the database/file system.
|
||||
|
||||
Args:
|
||||
content: Data content to write
|
||||
end_user_id: End user identifier
|
||||
memory_config: MemoryConfig object containing all configuration
|
||||
state: WriteState containing messages, end_user_id, and memory_config
|
||||
|
||||
Returns:
|
||||
dict: Contains 'status', 'saved_to', and 'data' fields
|
||||
dict: Contains 'write_result' with status and data fields
|
||||
"""
|
||||
content=state.get('data','')
|
||||
end_user_id=state.get('end_user_id','')
|
||||
memory_config=state.get('memory_config', '')
|
||||
messages = state.get('messages', [])
|
||||
end_user_id = state.get('end_user_id', '')
|
||||
memory_config = state.get('memory_config', '')
|
||||
|
||||
# Convert LangChain messages to structured format expected by write()
|
||||
structured_messages = []
|
||||
for msg in messages:
|
||||
if hasattr(msg, 'type') and hasattr(msg, 'content'):
|
||||
# Map LangChain message types to role names
|
||||
role = 'user' if msg.type == 'human' else 'assistant' if msg.type == 'ai' else msg.type
|
||||
structured_messages.append({
|
||||
"role": role,
|
||||
"content": msg.content # content is now guaranteed to be a string
|
||||
})
|
||||
|
||||
try:
|
||||
result=await write(
|
||||
result = await write(
|
||||
messages=structured_messages,
|
||||
end_user_id=end_user_id,
|
||||
memory_config=memory_config,
|
||||
messages=content, # 修复:使用正确的参数名 messages
|
||||
)
|
||||
logger.info(f"Write completed successfully! Config: {memory_config.config_name}")
|
||||
|
||||
write_result= {
|
||||
write_result = {
|
||||
"status": "success",
|
||||
"data": content,
|
||||
"data": structured_messages,
|
||||
"config_id": memory_config.config_id,
|
||||
"config_name": memory_config.config_name,
|
||||
}
|
||||
return {"write_result":write_result}
|
||||
|
||||
return {"write_result": write_result}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Data_write failed: {e}", exc_info=True)
|
||||
write_result= {
|
||||
write_result = {
|
||||
"status": "error",
|
||||
"message": str(e),
|
||||
}
|
||||
|
||||
@@ -10,55 +10,58 @@ from app.core.memory.models.message_models import DialogData, ConversationContex
|
||||
async def get_chunked_dialogs(
|
||||
chunker_strategy: str = "RecursiveChunker",
|
||||
end_user_id: str = "group_1",
|
||||
content: str = "这是用户的输入",
|
||||
messages: list = None,
|
||||
ref_id: str = "wyl_20251027",
|
||||
config_id: str = None
|
||||
) -> List[DialogData]:
|
||||
"""Generate chunks from all test data entries using the specified chunker strategy.
|
||||
"""Generate chunks from structured messages using the specified chunker strategy.
|
||||
|
||||
Args:
|
||||
chunker_strategy: The chunking strategy to use (default: RecursiveChunker)
|
||||
end_user_id: End user identifier
|
||||
content: Dialog content
|
||||
group_id: Group identifier
|
||||
messages: Structured message list [{"role": "user", "content": "..."}, ...]
|
||||
ref_id: Reference identifier
|
||||
config_id: Configuration ID for processing
|
||||
|
||||
Returns:
|
||||
List of DialogData objects with generated chunks for each test entry
|
||||
List of DialogData objects with generated chunks
|
||||
"""
|
||||
dialog_data_list = []
|
||||
messages = []
|
||||
from app.core.logging_config import get_agent_logger
|
||||
logger = get_agent_logger(__name__)
|
||||
|
||||
messages.append(ConversationMessage(role="用户", msg=content))
|
||||
if not messages or not isinstance(messages, list) or len(messages) == 0:
|
||||
raise ValueError("messages parameter must be a non-empty list")
|
||||
|
||||
# Create DialogData
|
||||
conversation_context = ConversationContext(msgs=messages)
|
||||
# Create DialogData with end_user_id
|
||||
conversation_messages = []
|
||||
|
||||
for idx, msg in enumerate(messages):
|
||||
if not isinstance(msg, dict) or 'role' not in msg or 'content' not in msg:
|
||||
raise ValueError(f"Message {idx} format error: must contain 'role' and 'content' fields")
|
||||
|
||||
role = msg['role']
|
||||
content = msg['content']
|
||||
|
||||
if role not in ['user', 'assistant']:
|
||||
raise ValueError(f"Message {idx} role must be 'user' or 'assistant', got: {role}")
|
||||
|
||||
if content.strip():
|
||||
conversation_messages.append(ConversationMessage(role=role, msg=content.strip()))
|
||||
|
||||
if not conversation_messages:
|
||||
raise ValueError("Message list cannot be empty after filtering")
|
||||
|
||||
conversation_context = ConversationContext(msgs=conversation_messages)
|
||||
dialog_data = DialogData(
|
||||
context=conversation_context,
|
||||
ref_id=ref_id,
|
||||
end_user_id=end_user_id,
|
||||
config_id=config_id
|
||||
)
|
||||
# Create DialogueChunker and process the dialogue
|
||||
|
||||
chunker = DialogueChunker(chunker_strategy)
|
||||
extracted_chunks = await chunker.process_dialogue(dialog_data)
|
||||
dialog_data.chunks = extracted_chunks
|
||||
|
||||
dialog_data_list.append(dialog_data)
|
||||
logger.info(f"DialogData created with {len(extracted_chunks)} chunks")
|
||||
|
||||
# Convert to dict with datetime serialized
|
||||
def serialize_datetime(obj):
|
||||
if isinstance(obj, datetime):
|
||||
return obj.isoformat()
|
||||
raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
|
||||
|
||||
combined_output = [dd.model_dump() for dd in dialog_data_list]
|
||||
|
||||
print(dialog_data_list)
|
||||
|
||||
# with open(os.path.join(os.path.dirname(__file__), "chunker_test_output.txt"), "w", encoding="utf-8") as f:
|
||||
# json.dump(combined_output, f, ensure_ascii=False, indent=4, default=serialize_datetime)
|
||||
|
||||
|
||||
return dialog_data_list
|
||||
return [dialog_data]
|
||||
|
||||
@@ -36,9 +36,11 @@ async def write(
|
||||
) -> None:
|
||||
"""
|
||||
Execute the complete knowledge extraction pipeline.
|
||||
|
||||
|
||||
Args:
|
||||
end_user_id: End user identifier
|
||||
user_id: User identifier
|
||||
apply_id: Application identifier
|
||||
group_id: Group identifier
|
||||
memory_config: MemoryConfig object containing all configuration
|
||||
messages: Structured message list [{"role": "user", "content": "..."}, ...]
|
||||
ref_id: Reference ID, defaults to "wyl20251027"
|
||||
@@ -47,14 +49,14 @@ async def write(
|
||||
embedding_model_id = str(memory_config.embedding_model_id)
|
||||
chunker_strategy = memory_config.chunker_strategy
|
||||
config_id = str(memory_config.config_id)
|
||||
|
||||
|
||||
logger.info("=== MemSci Knowledge Extraction Pipeline ===")
|
||||
logger.info(f"Config: {memory_config.config_name} (ID: {config_id})")
|
||||
logger.info(f"Workspace: {memory_config.workspace_name}")
|
||||
logger.info(f"LLM model: {memory_config.llm_model_name}")
|
||||
logger.info(f"Embedding model: {memory_config.embedding_model_name}")
|
||||
logger.info(f"Chunker strategy: {chunker_strategy}")
|
||||
logger.info(f"End User ID: {end_user_id}")
|
||||
logger.info(f"end_user_id ID: {end_user_id}")
|
||||
|
||||
# Construct clients from memory_config using factory pattern with db session
|
||||
with get_db_context() as db:
|
||||
@@ -77,25 +79,10 @@ async def write(
|
||||
|
||||
# Step 1: Load and chunk data
|
||||
step_start = time.time()
|
||||
|
||||
# Convert messages list to content string
|
||||
# messages format: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
|
||||
if isinstance(messages, list) and len(messages) > 0:
|
||||
# Extract content from the last user message or concatenate all messages
|
||||
if isinstance(messages[-1], dict) and 'content' in messages[-1]:
|
||||
content = messages[-1]['content']
|
||||
else:
|
||||
# Fallback: concatenate all message contents
|
||||
content = " ".join([msg.get('content', '') for msg in messages if isinstance(msg, dict)])
|
||||
elif isinstance(messages, str):
|
||||
content = messages
|
||||
else:
|
||||
content = str(messages)
|
||||
|
||||
chunked_dialogs = await get_chunked_dialogs(
|
||||
chunker_strategy=chunker_strategy,
|
||||
end_user_id=end_user_id,
|
||||
content=content, # 修复:使用 content 参数而不是 messages
|
||||
messages=messages,
|
||||
ref_id=ref_id,
|
||||
config_id=config_id,
|
||||
)
|
||||
|
||||
@@ -187,11 +187,11 @@ class ChunkerClient:
|
||||
async def generate_chunks(self, dialogue: DialogData):
|
||||
"""
|
||||
Generate chunks following 1 Message = 1 Chunk strategy.
|
||||
|
||||
|
||||
Each message creates one chunk, directly inheriting role information.
|
||||
If a message is too long, it will be split into multiple sub-chunks,
|
||||
each maintaining the same speaker.
|
||||
|
||||
|
||||
Raises:
|
||||
ValueError: If dialogue has no messages or chunking fails
|
||||
"""
|
||||
@@ -201,9 +201,9 @@ class ChunkerClient:
|
||||
f"Dialogue {dialogue.ref_id} has no messages. "
|
||||
f"Cannot generate chunks from empty dialogue."
|
||||
)
|
||||
|
||||
|
||||
dialogue.chunks = []
|
||||
|
||||
|
||||
# 按消息分块:每个消息创建一个或多个 chunk,直接继承角色
|
||||
for msg_idx, msg in enumerate(dialogue.context.msgs):
|
||||
# Validate message has required attributes
|
||||
@@ -212,13 +212,13 @@ class ChunkerClient:
|
||||
f"Message {msg_idx} in dialogue {dialogue.ref_id} "
|
||||
f"missing 'role' or 'msg' attribute"
|
||||
)
|
||||
|
||||
|
||||
msg_content = msg.msg.strip()
|
||||
|
||||
|
||||
# Skip empty messages
|
||||
if not msg_content:
|
||||
continue
|
||||
|
||||
|
||||
# 如果消息太长,可以进一步分块
|
||||
if len(msg_content) > self.chunk_size:
|
||||
# 对单个消息的内容进行分块
|
||||
@@ -228,14 +228,14 @@ class ChunkerClient:
|
||||
raise ValueError(
|
||||
f"Failed to chunk long message {msg_idx} in dialogue {dialogue.ref_id}: {e}"
|
||||
)
|
||||
|
||||
|
||||
for idx, sub_chunk in enumerate(sub_chunks):
|
||||
sub_chunk_text = sub_chunk.text if hasattr(sub_chunk, 'text') else str(sub_chunk)
|
||||
sub_chunk_text = sub_chunk_text.strip()
|
||||
|
||||
|
||||
if len(sub_chunk_text) < (self.min_characters_per_chunk or 50):
|
||||
continue
|
||||
|
||||
|
||||
chunk = Chunk(
|
||||
content=f"{msg.role}: {sub_chunk_text}",
|
||||
speaker=msg.role, # 直接继承角色
|
||||
@@ -260,7 +260,7 @@ class ChunkerClient:
|
||||
},
|
||||
)
|
||||
dialogue.chunks.append(chunk)
|
||||
|
||||
|
||||
# Validate we generated at least one chunk
|
||||
if not dialogue.chunks:
|
||||
raise ValueError(
|
||||
@@ -268,7 +268,7 @@ class ChunkerClient:
|
||||
f"All messages were either empty or too short. "
|
||||
f"Messages count: {len(dialogue.context.msgs)}"
|
||||
)
|
||||
|
||||
|
||||
return dialogue
|
||||
|
||||
def evaluate_chunking(self, dialogue: DialogData) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user