Initial commit

This commit is contained in:
Ke Sun
2025-11-30 18:22:17 +08:00
commit aea2fe391e
449 changed files with 83030 additions and 0 deletions

View File

@@ -0,0 +1,115 @@
"""Data models for the Memory module.
This package contains all Pydantic models used in the memory system,
including models for messages, dialogues, statements, entities, triplets,
graph nodes/edges, configurations, and deduplication decisions.
"""
# Base response models
from app.core.memory.models.base_response import RobustLLMResponse
# Configuration models
from app.core.memory.models.config_models import (
LLMConfig,
ChunkerConfig,
PruningConfig,
TemporalSearchParams,
)
# Deduplication models
from app.core.memory.models.dedup_models import (
EntityDedupDecision,
EntityDisambDecision,
)
# Graph models (nodes and edges)
from app.core.memory.models.graph_models import (
# Edges
Edge,
ChunkEdge,
ChunkEntityEdge,
ChunkDialogEdge,
StatementChunkEdge,
StatementEntityEdge,
EntityEntityEdge,
# Nodes
Node,
DialogueNode,
StatementNode,
ChunkNode,
ExtractedEntityNode,
MemorySummaryNode,
)
# Message and dialogue models
from app.core.memory.models.message_models import (
ConversationMessage,
TemporalValidityRange,
Statement,
ConversationContext,
Chunk,
DialogData,
)
# Triplet and entity models
from app.core.memory.models.triplet_models import (
Entity,
Triplet,
TripletExtractionResponse,
)
# Variable configuration models
from app.core.memory.models.variate_config import (
StatementExtractionConfig,
ForgettingEngineConfig,
TripletExtractionConfig,
TemporalExtractionConfig,
DedupConfig,
ExtractionPipelineConfig,
)
# Public API of the models package. Every name listed here is imported above
# from its defining submodule, so callers can import it from the package root.
__all__ = [
# Base response model shared by LLM-facing response schemas
"RobustLLMResponse",
# Component configuration models
"LLMConfig",
"ChunkerConfig",
"PruningConfig",
"TemporalSearchParams",
# Entity deduplication / disambiguation decisions
"EntityDedupDecision",
"EntityDisambDecision",
# Graph edges (base class first, then concrete edge types)
"Edge",
"ChunkEdge",
"ChunkEntityEdge",
"ChunkDialogEdge",
"StatementChunkEdge",
"StatementEntityEdge",
"EntityEntityEdge",
# Graph nodes (base class first, then concrete node types)
"Node",
"DialogueNode",
"StatementNode",
"ChunkNode",
"ExtractedEntityNode",
"MemorySummaryNode",
# Messages, statements, chunks and dialogues
"ConversationMessage",
"TemporalValidityRange",
"Statement",
"ConversationContext",
"Chunk",
"DialogData",
# Triplets and entities
"Entity",
"Triplet",
"TripletExtractionResponse",
# Extraction pipeline configuration (variate_config)
"StatementExtractionConfig",
"ForgettingEngineConfig",
"TripletExtractionConfig",
"TemporalExtractionConfig",
"DedupConfig",
"ExtractionPipelineConfig",
]

View File

@@ -0,0 +1,59 @@
"""Base classes for LLM response models with common validators.
This module provides reusable base classes for Pydantic models that handle
common LLM response patterns and edge cases.
Classes:
RobustLLMResponse: Base class for LLM response models with robust validation
"""
from typing import Any
from pydantic import BaseModel, ConfigDict, model_validator
class RobustLLMResponse(BaseModel):
    """Base class for LLM response models with forgiving input handling.

    Subclasses inherit three behaviours:
    - A response wrapped in a list (e.g. ``[{"field": "value"}]``) is
      unwrapped to its first element before validation.
    - Fields not declared on the model are ignored instead of rejected.
    - Attribute assignments are validated.

    Usage:
        class MyResponse(RobustLLMResponse):
            field1: str
            field2: int
    """

    model_config = ConfigDict(
        extra="ignore",  # Drop undeclared fields instead of failing
        validate_assignment=True,  # Re-validate whenever an attribute is set
    )

    @model_validator(mode='before')
    @classmethod
    def handle_list_input(cls, data: Any) -> Any:
        """Normalise raw LLM output before field validation runs.

        Args:
            data: The raw input handed to the model. Usually a dict, sometimes
                a list wrapping a dict, and possibly an existing instance of
                this model.

        Returns:
            The dict to validate (the first element when a list was given),
            or the untouched model instance when one was supplied.

        Raises:
            ValueError: If the input is an empty list, or if the (unwrapped)
                input is neither a dict nor an instance of this model.
        """
        if isinstance(data, list):
            if not data:
                raise ValueError("Received empty list from LLM")
            # Only the first element is used; any further items are discarded.
            data = data[0]

        # A "before" model validator sees whatever object was handed in, and
        # Pydantic may pass an already-built instance here (for example when
        # an instance is supplied for a field of this type, or while an
        # attribute assignment is being validated). Rejecting it as "not a
        # dict" would break those paths, so let it through unchanged.
        if isinstance(data, cls):
            return data

        if not isinstance(data, dict):
            raise ValueError(f"Expected dict or list, got {type(data).__name__}")
        return data

View File

@@ -0,0 +1,93 @@
"""Configuration models for Memory module components.
This module contains Pydantic models for configuring various components
of the memory system including LLM, chunking, pruning, and search.
Classes:
LLMConfig: Configuration for LLM client
ChunkerConfig: Configuration for dialogue chunking
PruningConfig: Configuration for semantic pruning
TemporalSearchParams: Parameters for temporal search queries
"""
from typing import Optional
from pydantic import BaseModel, Field
class LLMConfig(BaseModel):
    """Settings describing which LLM to call and how often to retry.

    Attributes:
        llm_name: Name of the LLM model to use (required).
        api_base: Base URL of the API endpoint; defaults to None.
        max_retries: Maximum number of retries for API calls; must be
            non-negative, defaults to 3.
    """

    llm_name: str = Field(
        default=...,
        description="The name of the LLM model to use.",
    )
    api_base: Optional[str] = Field(
        default=None,
        description="The base URL for the API endpoint.",
    )
    max_retries: Optional[int] = Field(
        default=3,
        ge=0,
        description="The maximum number of retries for API calls.",
    )
class ChunkerConfig(BaseModel):
    """Settings that select and tune the dialogue chunking strategy.

    Attributes:
        chunker_strategy: Name of the chunker strategy to use (required).
        embedding_model: Name of the embedding model to use (required).
        chunk_size: Size of each chunk; non-negative, defaults to 2048.
        threshold: Similarity threshold in the range 0-1; defaults to 0.8.
        language: Language of the text; defaults to 'zh'.
        skip_window: Window for skip-and-merge; non-negative, defaults to 0.
        min_sentences: Minimum number of sentences per chunk; non-negative,
            defaults to 1.
        min_characters_per_chunk: Minimum number of characters per chunk;
            non-negative, defaults to 24.
    """

    chunker_strategy: str = Field(
        default=...,
        description="The name of the chunker strategy to use.",
    )
    embedding_model: str = Field(
        default=...,
        description="The name of the embedding model to use.",
    )
    chunk_size: Optional[int] = Field(
        default=2048,
        ge=0,
        description="The size of each chunk.",
    )
    threshold: Optional[float] = Field(
        default=0.8,
        ge=0,
        le=1,
        description="The threshold for similarity.",
    )
    language: Optional[str] = Field(
        default="zh",
        description="The language of the text.",
    )
    skip_window: Optional[int] = Field(
        default=0,
        ge=0,
        description="The window for skip-and-merge.",
    )
    min_sentences: Optional[int] = Field(
        default=1,
        ge=0,
        description="The minimum number of sentences in each chunk.",
    )
    min_characters_per_chunk: Optional[int] = Field(
        default=24,
        ge=0,
        description="The minimum number of characters in each chunk.",
    )
class PruningConfig(BaseModel):
    """Settings controlling semantic pruning of dialogue content.

    Attributes:
        pruning_switch: Turns semantic pruning on when True; defaults to False.
        pruning_scene: Scene name for pruning; defaults to 'education'. The
            field is a free-form string, the documented choices are
            'education', 'online_service' and 'outbound'.
        pruning_threshold: Pruning ratio, constrained to 0.0-0.9; defaults
            to 0.5.
    """

    pruning_switch: bool = Field(
        default=False,
        description="Enable semantic pruning when True.",
    )
    pruning_scene: str = Field(
        default="education",
        description="Scene for pruning: one of 'education', 'online_service', 'outbound'.",
    )
    pruning_threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=0.9,
        description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).",
    )
class TemporalSearchParams(BaseModel):
    """Filter parameters for a temporal search query.

    All date fields are plain optional strings; this model does not parse or
    validate their format.

    Attributes:
        group_id: Group ID to filter the search; defaults to 'test'.
        apply_id: Apply ID to filter the search; defaults to None.
        user_id: User ID to filter the search; defaults to None.
        start_date: Start date for the search; defaults to None.
        end_date: End date for the search; defaults to None.
        valid_date: Valid date for the search; defaults to None.
        invalid_date: Invalid date for the search; defaults to None.
        limit: Maximum number of results to return; defaults to 3.
    """

    group_id: Optional[str] = Field(
        default="test",
        description="The group ID to filter the search.",
    )
    apply_id: Optional[str] = Field(
        default=None,
        description="The apply ID to filter the search.",
    )
    user_id: Optional[str] = Field(
        default=None,
        description="The user ID to filter the search.",
    )
    start_date: Optional[str] = Field(
        default=None,
        description="The start date for the search.",
    )
    end_date: Optional[str] = Field(
        default=None,
        description="The end date for the search.",
    )
    valid_date: Optional[str] = Field(
        default=None,
        description="The valid date for the search.",
    )
    invalid_date: Optional[str] = Field(
        default=None,
        description="The invalid date for the search.",
    )
    limit: int = Field(
        3,
        description="The maximum number of results to return.",
    )

View File

@@ -0,0 +1,52 @@
"""Models for entity deduplication and disambiguation decisions.
This module contains Pydantic models for structured LLM responses
during entity deduplication and disambiguation processes.
Classes:
EntityDedupDecision: Decision model for entity deduplication
EntityDisambDecision: Decision model for entity disambiguation
"""
from typing import Optional
from pydantic import BaseModel, Field
class EntityDedupDecision(BaseModel):
    """Structured verdict on whether a pair of entities is the same entity.

    All four fields are required.

    Attributes:
        same_entity: True when the two entities refer to the same entity.
        confidence: Confidence of the decision, constrained to 0.0-1.0.
        canonical_idx: Index of the canonical entity within the pair. The
            documented values are 0, 1, or -1 when not applicable; the field
            itself accepts any integer.
        reason: Short rationale for the decision.
    """

    same_entity: bool = Field(
        default=...,
        description="Two entities refer to the same entity",
    )
    confidence: float = Field(
        default=...,
        ge=0.0,
        le=1.0,
        description="Confidence of the decision",
    )
    canonical_idx: int = Field(
        default=...,
        description="Index of canonical entity among the pair: 0 or 1; -1 if not applicable",
    )
    reason: str = Field(
        default=...,
        description="Short rationale, 1-3 sentences",
    )
class EntityDisambDecision(BaseModel):
    """Structured verdict on merging two entities whose types differ.

    Attributes:
        should_merge: True when the pair should be merged despite the type
            difference (required).
        confidence: Confidence of the decision, constrained to 0.0-1.0
            (required).
        canonical_idx: Index of the canonical entity within the pair
            (required). The documented values are 0, 1, or -1 when not
            applicable; the field itself accepts any integer.
        block_pair: When True, the pair is to be blocked from fuzzy or
            heuristic merges; defaults to False.
        suggested_type: Unified entity type to use when merging; defaults
            to None.
        reason: Short rationale for the decision (required).
    """

    should_merge: bool = Field(
        default=...,
        description="Merge the pair despite type difference",
    )
    confidence: float = Field(
        default=...,
        ge=0.0,
        le=1.0,
        description="Confidence of the decision",
    )
    canonical_idx: int = Field(
        default=...,
        description="Index of canonical entity among the pair: 0 or 1; -1 if not applicable",
    )
    block_pair: bool = Field(
        default=False,
        description="Block this pair from fuzzy or heuristic merges",
    )
    suggested_type: Optional[str] = Field(
        default=None,
        description="Unified entity type when merging",
    )
    reason: str = Field(
        default=...,
        description="Short rationale, 1-3 sentences",
    )

View File

@@ -0,0 +1,304 @@
"""Graph models for Neo4j knowledge graph nodes and edges.
This module contains Pydantic models representing nodes and edges
in the Neo4j knowledge graph, including dialogues, statements,
chunks, entities, and their relationships.
Classes:
Edge: Base class for all graph edges
ChunkEdge: Edge connecting chunks
ChunkEntityEdge: Edge connecting chunks to entities
ChunkDialogEdge: Edge connecting chunks to dialogues
StatementChunkEdge: Edge connecting statements to chunks
StatementEntityEdge: Edge connecting statements to entities
EntityEntityEdge: Edge connecting related entities
Node: Base class for all graph nodes
DialogueNode: Node representing a dialogue
StatementNode: Node representing a statement
ChunkNode: Node representing a conversation chunk
ExtractedEntityNode: Node representing an extracted entity
MemorySummaryNode: Node representing a memory summary
"""
from uuid import uuid4
from datetime import datetime, timezone
from typing import List, Optional
from pydantic import BaseModel, Field, field_validator
import re
from app.core.memory.utils.data.ontology import TemporalInfo
def parse_historical_datetime(v):
    """Parse a date/datetime value, including years with fewer than 4 digits.

    ISO 8601-like strings are parsed by hand so that historical years such as
    ``755-12-16`` are accepted. Supported shapes are ``Y-MM-DD`` optionally
    followed by ``T`` or a space and ``HH:MM[:SS[.fraction]]``, optionally
    followed by ``Z`` or a ``±HH:MM`` offset. Only the leading part of the
    string has to match; trailing text is ignored.

    Args:
        v: ``None``, a ``datetime``, a string, or any other value.

    Returns:
        - ``None`` or a ``datetime`` input is returned unchanged.
        - A string yields a ``datetime`` (timezone-aware when ``Z`` or an
          offset is present, naive otherwise), or ``None`` when it cannot be
          parsed or names an impossible date/time.
        - Any other type is returned unchanged so the caller's own validation
          can deal with it.
    """
    # Local import: only this function needs timedelta, and it keeps the
    # module's import block untouched.
    from datetime import timedelta

    if v is None or isinstance(v, datetime):
        return v
    if not isinstance(v, str):
        return v

    # Year is 1-4 digits; seconds, fraction and zone are all optional.
    pattern = (
        r'^(\d{1,4})-(\d{2})-(\d{2})'
        r'(?:[T ](\d{2}):(\d{2})(?::(\d{2})(?:\.(\d+))?)?(Z|[+-]\d{2}:\d{2})?)?'
    )
    match = re.match(pattern, v)
    if match:
        year, month, day, hour, minute, second, fraction, zone = match.groups()
        try:
            tzinfo = None
            if zone == 'Z':
                tzinfo = timezone.utc
            elif zone:
                # Keep the stated offset instead of relabelling the wall-clock
                # time as UTC, which would shift the instant.
                sign = -1 if zone[0] == '-' else 1
                offset = timedelta(hours=int(zone[1:3]), minutes=int(zone[4:6]))
                tzinfo = timezone(sign * offset)

            # Pad or truncate the fractional part to exactly microseconds.
            microsecond = int(fraction.ljust(6, '0')[:6]) if fraction else 0

            return datetime(
                int(year),
                int(month),
                int(day),
                int(hour or 0),
                int(minute or 0),
                int(second or 0),
                microsecond,
                tzinfo=tzinfo,
            )
        except (ValueError, OverflowError):
            # Out-of-range component (month 13, day 32, offset >= 24h, ...).
            return None

    # Not in the hand-parsed shape: let the standard parser have a try.
    try:
        return datetime.fromisoformat(v.replace('Z', '+00:00'))
    except ValueError:
        return None
class Edge(BaseModel):
    """Common fields shared by every edge type in the knowledge graph.

    Attributes:
        id: Unique identifier for the edge; a fresh UUID hex string when
            not supplied.
        source: ID of the source node (required).
        target: ID of the target node (required).
        group_id: Group ID of the edge (required).
        user_id: User ID of the edge (required).
        apply_id: Apply ID of the edge (required).
        run_id: Identifier of the pipeline run; a fresh UUID hex string when
            not supplied.
        created_at: System-side creation time of the edge (required).
        expired_at: System-side expiry time of the edge; defaults to None.
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the edge.",
    )
    source: str = Field(default=..., description="The ID of the source node.")
    target: str = Field(default=..., description="The ID of the target node.")
    group_id: str = Field(default=..., description="The group ID of the edge.")
    user_id: str = Field(default=..., description="The user ID of the edge.")
    apply_id: str = Field(default=..., description="The apply ID of the edge.")
    run_id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for this pipeline run.",
    )
    created_at: datetime = Field(
        default=...,
        description="The valid time of the edge from system perspective.",
    )
    expired_at: Optional[datetime] = Field(
        default=None,
        description="The expired time of the edge from system perspective.",
    )
class ChunkEdge(Edge):
    """Edge linking one chunk to another; adds no fields beyond ``Edge``."""
class ChunkEntityEdge(Edge):
    """Edge linking a chunk to an entity; adds no fields beyond ``Edge``."""
class ChunkDialogEdge(Edge):
    """Edge linking a chunk to the dialog it belongs to.

    Attributes:
        sequence_number: Position of the chunk within the dialog (required).
    """

    sequence_number: int = Field(
        default=...,
        description="Order of this chunk within the dialog",
    )
class StatementChunkEdge(Edge):
    """Edge linking a statement to its chunk; adds no fields beyond ``Edge``."""
class StatementEntityEdge(Edge):
    """Edge linking a statement to an entity extracted from it.

    Attributes:
        connect_strength: Strength label for the link (required). It is a
            free-form string; the documented values are 'Strong' and 'Weak'.
    """

    connect_strength: str = Field(
        default=...,
        description="Strong VS Weak about this statement-entity edge",
    )
class EntityEntityEdge(Edge):
    """Edge describing a relationship between two entities.

    Attributes:
        relation_type: Relation type as defined in the ontology (required).
        relation_value: Value of the relation; defaults to None.
        statement: Statement text of the edge (required).
        source_statement_id: ID of the statement the relationship was
            extracted from (required).
        valid_at: Start of temporal validity; defaults to None.
        invalid_at: End of temporal validity; defaults to None.
    """

    relation_type: str = Field(
        default=...,
        description="Relation type as defined in ontology",
    )
    relation_value: Optional[str] = Field(
        default=None,
        description="Value of the relation",
    )
    statement: str = Field(
        default=...,
        description='The statement of the edge.',
    )
    source_statement_id: str = Field(
        default=...,
        description="Statement where this relationship was extracted",
    )
    valid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity start",
    )
    invalid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity end",
    )

    @field_validator('valid_at', 'invalid_at', mode='before')
    @classmethod
    def validate_datetime(cls, v):
        """Pre-parse both validity bounds with the shared historical-date parser."""
        parsed = parse_historical_datetime(v)
        return parsed
class Node(BaseModel):
    """Common fields shared by every node type in the knowledge graph.

    Attributes:
        id: Unique identifier for the node (required).
        name: Name of the node (required).
        group_id: Group ID of the node (required).
        user_id: User ID of the node (required).
        apply_id: Apply ID of the node (required).
        run_id: Identifier of the pipeline run; a fresh UUID hex string when
            not supplied.
        created_at: System-side creation time of the node (required).
        expired_at: System-side expiry time of the node; defaults to None.
    """

    id: str = Field(..., description="The unique identifier for the node.")
    name: str = Field(..., description="The name of the node.")
    group_id: str = Field(..., description="The group ID of the node.")
    # These two descriptions previously said "of the edge" (copied from the
    # Edge model); they surface in the generated schema, so name the node.
    user_id: str = Field(..., description="The user ID of the node.")
    apply_id: str = Field(..., description="The apply ID of the node.")
    run_id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for this pipeline run.",
    )
    created_at: datetime = Field(
        ...,
        description="The valid time of the node from system perspective.",
    )
    expired_at: Optional[datetime] = Field(
        None,
        description="The expired time of the node from system perspective.",
    )
class DialogueNode(Node):
    """Graph node holding one dialogue.

    Attributes:
        ref_id: Reference identifier of the dialog (required).
        content: Dialogue content (required).
        dialog_embedding: Embedding vector of the dialog; defaults to None.
        config_id: ID of the configuration used to process the dialogue,
            as an integer or a string; defaults to None.
    """

    ref_id: str = Field(
        default=...,
        description="Reference identifier of the dialog",
    )
    content: str = Field(default=..., description="Dialogue content")
    dialog_embedding: Optional[List[float]] = Field(
        default=None,
        description="Dialog embedding vector",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this dialogue (integer or string)",
    )
class StatementNode(Node):
    """Graph node holding one statement.

    Attributes:
        chunk_id: ID of the parent chunk (required).
        stmt_type: Type of the statement (required).
        temporal_info: Temporal information of the statement (required).
        statement: The statement text (required).
        connect_strength: Strength label of the statement (required). It is
            a free-form string; the documented values are 'Strong' and 'Weak'.
        valid_at: Start of temporal validity; defaults to None.
        invalid_at: End of temporal validity; defaults to None.
        statement_embedding: Embedding vector of the statement; defaults
            to None.
        chunk_embedding: Embedding vector of the chunk; defaults to None.
        config_id: ID of the configuration used to process the statement,
            as an integer or a string; defaults to None.
    """

    chunk_id: str = Field(default=..., description="ID of the parent chunk")
    stmt_type: str = Field(default=..., description="Type of the statement")
    temporal_info: TemporalInfo = Field(
        default=...,
        description="Temporal information",
    )
    statement: str = Field(
        default=...,
        description="The statement text content",
    )
    connect_strength: str = Field(
        default=...,
        description="Strong VS Weak classification of this statement",
    )
    valid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity start",
    )
    invalid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity end",
    )
    statement_embedding: Optional[List[float]] = Field(
        default=None,
        description="Statement embedding vector",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="Chunk embedding vector",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this statement (integer or string)",
    )

    @field_validator('valid_at', 'invalid_at', mode='before')
    @classmethod
    def validate_datetime(cls, v):
        """Pre-parse both validity bounds with the shared historical-date parser."""
        parsed = parse_historical_datetime(v)
        return parsed
class ChunkNode(Node):
    """Graph node holding one chunk of a conversation.

    Attributes:
        dialog_id: ID of the parent dialog (required).
        content: Text content of the chunk (required).
        chunk_embedding: Embedding vector of the chunk; defaults to None.
        sequence_number: Position of the chunk within the dialog (required).
        metadata: Additional chunk metadata; a new empty dict when not
            supplied.
    """

    dialog_id: str = Field(default=..., description="ID of the parent dialog")
    content: str = Field(
        default=...,
        description="The text content of the chunk",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="Chunk embedding vector",
    )
    sequence_number: int = Field(
        default=...,
        description="Order of this chunk within the dialog",
    )
    metadata: dict = Field(
        default_factory=dict,
        description="Additional chunk metadata",
    )
class ExtractedEntityNode(Node):
    """Graph node holding one extracted entity.

    Attributes:
        entity_idx: Integer identifier of the entity (required).
        statement_id: ID of the statement the entity was extracted from
            (required).
        entity_type: Type of the entity (required).
        description: Description of the entity (required).
        aliases: Aliases of the entity; a new empty list when not supplied.
        name_embedding: Embedding vector of the name; a new empty list when
            not supplied.
        fact_summary: Summary of the fact about the entity (required).
        connect_strength: Strength label of the entity (required). It is a
            free-form string; the documented values are 'Strong' and 'Weak'.
        config_id: ID of the configuration used to process the entity, as an
            integer or a string; defaults to None.
    """

    entity_idx: int = Field(
        default=...,
        description="Unique identifier for the entity",
    )
    statement_id: str = Field(
        default=...,
        description="Statement this entity was extracted from",
    )
    entity_type: str = Field(default=..., description="Type of the entity")
    description: str = Field(default=..., description="Entity description")
    aliases: Optional[List[str]] = Field(
        default_factory=list,
        description="Entity aliases",
    )
    name_embedding: Optional[List[float]] = Field(
        default_factory=list,
        description="Name embedding vector",
    )
    fact_summary: str = Field(
        default=...,
        description="Summary of the fact about this entity",
    )
    connect_strength: str = Field(
        default=...,
        description="Strong VS Weak about this entity",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this entity (integer or string)",
    )
class MemorySummaryNode(Node):
    """Graph node holding a memory summary and its embedding.

    Attributes:
        summary_id: Unique identifier of the summary; a fresh UUID hex
            string when not supplied.
        dialog_id: ID of the parent dialog (required).
        chunk_ids: IDs of the chunks used in the summary; a new empty list
            when not supplied.
        content: Summary text (required).
        summary_embedding: Embedding vector of the summary; defaults to None.
        metadata: Additional metadata; a new empty dict when not supplied.
        config_id: ID of the configuration used to process the summary, as
            an integer or a string; defaults to None.
    """

    summary_id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the summary",
    )
    dialog_id: str = Field(default=..., description="ID of the parent dialog")
    chunk_ids: List[str] = Field(
        default_factory=list,
        description="List of chunk IDs used in the summary",
    )
    content: str = Field(default=..., description="Summary text content")
    summary_embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding vector for the summary",
    )
    metadata: dict = Field(
        default_factory=dict,
        description="Additional metadata for the summary",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this summary (integer or string)",
    )

View File

@@ -0,0 +1,247 @@
"""Models for dialogue messages, conversations, and statements.
This module contains Pydantic models for representing dialogue data,
including messages, conversation context, chunks, and statements.
Classes:
ConversationMessage: Single message in a conversation
TemporalValidityRange: Temporal validity range for statements
Statement: Statement extracted from dialogue with metadata
ConversationContext: Full conversation history
Chunk: Chunk of conversation text
DialogData: Complete dialogue data structure
"""
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from uuid import uuid4
from datetime import datetime
from app.core.memory.utils.data.ontology import StatementType, TemporalInfo, RelevenceInfo
from app.core.memory.models.triplet_models import TripletExtractionResponse, Triplet
class ConversationMessage(BaseModel):
    """One message of a conversation: who spoke and what was said.

    Attributes:
        role: Role of the speaker, e.g. '用户' or 'AI' (required).
        msg: Text content of the message (required).
    """

    role: str = Field(
        default=...,
        description="The role of the speaker (e.g., '用户', 'AI').",
    )
    msg: str = Field(
        default=...,
        description="The text content of the message.",
    )
class TemporalValidityRange(BaseModel):
    """Start and end of the period during which a statement is valid.

    Both bounds are optional strings and are stored as given; this model
    does not check their format.

    Attributes:
        valid_at: Start date of validity; defaults to None.
        invalid_at: End date of validity; defaults to None.
    """

    valid_at: Optional[str] = Field(
        default=None,
        description="The start date of the statement's validity, in 'YYYY-MM-DD' format or 'None'.",
    )
    invalid_at: Optional[str] = Field(
        default=None,
        description="The end date of the statement's validity, in 'YYYY-MM-DD' format or 'None'.",
    )
class Statement(BaseModel):
    """A statement extracted from dialogue, together with its metadata.

    Attributes:
        id: Unique identifier; a fresh UUID hex string when not supplied.
        chunk_id: ID of the parent chunk (required).
        group_id: ID of the group the statement belongs to; defaults to None.
        statement: Text content of the statement (required).
        statement_embedding: Embedding vector of the statement; defaults
            to None.
        stmt_type: Type of the statement (required).
        temporal_info: Temporal information of the statement (required).
        relevence_info: Relevance information; defaults to
            ``RelevenceInfo.RELEVANT``.
        connect_strength: Optional strength label; defaults to None.
        temporal_validity: Optional validity range; defaults to None.
        triplet_extraction_info: Optional triplet extraction result;
            defaults to None.
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the statement.",
    )
    chunk_id: str = Field(
        default=...,
        description="ID of the parent chunk this statement belongs to.",
    )
    group_id: Optional[str] = Field(
        default=None,
        description="ID of the group this statement belongs to.",
    )
    statement: str = Field(
        default=...,
        description="The text content of the statement.",
    )
    statement_embedding: Optional[List[float]] = Field(
        default=None,
        description="The embedding vector of the statement.",
    )
    stmt_type: StatementType = Field(
        default=...,
        description="The type of the statement.",
    )
    temporal_info: TemporalInfo = Field(
        default=...,
        description="The temporal information of the statement.",
    )
    relevence_info: RelevenceInfo = Field(
        default=RelevenceInfo.RELEVANT,
        description="The relevence information of the statement.",
    )
    connect_strength: Optional[str] = Field(
        default=None,
        description="Strong VS Weak about this entity",
    )
    temporal_validity: Optional[TemporalValidityRange] = Field(
        default=None,
        description="The temporal validity range of the statement.",
    )
    triplet_extraction_info: Optional[TripletExtractionResponse] = Field(
        default=None,
        description="The triplet extraction information of the statement.",
    )
class ConversationContext(BaseModel):
    """The full history of a conversation as an ordered list of messages.

    Attributes:
        msgs: Messages of the conversation (required).

    Properties:
        content: The conversation rendered as text, one message per line.
    """

    msgs: List[ConversationMessage] = Field(
        default=...,
        description="A list of messages in the conversation.",
    )

    @property
    def content(self) -> str:
        """Render the conversation as text.

        Returns:
            One "role: message" line per message, joined with newlines.
        """
        lines = (f"{entry.role}: {entry.msg}" for entry in self.msgs)
        return "\n".join(lines)
class Chunk(BaseModel):
    """A slice of the conversation together with what was extracted from it.

    Attributes:
        id: Unique identifier; a fresh UUID hex string when not supplied.
        text: Messages contained in the chunk; a new empty list when not
            supplied.
        content: The chunk rendered as a single string (required).
        statements: Statements belonging to the chunk; a new empty list when
            not supplied.
        chunk_embedding: Embedding vector of the chunk; defaults to None.
        metadata: Additional metadata; a new empty dict when not supplied.
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the chunk.",
    )
    text: List[ConversationMessage] = Field(
        default_factory=list,
        description="A list of messages in the chunk.",
    )
    content: str = Field(
        default=...,
        description="The content of the chunk as a string.",
    )
    statements: List[Statement] = Field(
        default_factory=list,
        description="A list of statements in the chunk.",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="The embedding vector of the chunk.",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata for the chunk.",
    )

    @classmethod
    def from_messages(cls, messages: List[ConversationMessage], metadata: Optional[Dict[str, Any]] = None):
        """Build a chunk whose content is rendered from the given messages.

        Args:
            messages: Conversation messages to store in the chunk.
            metadata: Optional metadata; an empty dict is used when None.

        Returns:
            A new instance whose ``content`` holds one "role: message" line
            per message, joined with newlines.
        """
        rendered = "\n".join(f"{entry.role}: {entry.msg}" for entry in messages)
        return cls(
            text=messages,
            content=rendered,
            metadata={} if metadata is None else metadata,
        )
class DialogData(BaseModel):
"""Represents the complete data structure for a dialog record.
Attributes:
id: Unique identifier for the dialog
context: Full conversation context
dialog_embedding: Optional embedding vector for the entire dialog
ref_id: Reference ID linking to external dialog system
group_id: Group ID for multi-tenancy
user_id: User ID for user-specific data
apply_id: Application ID for application-specific data
created_at: Timestamp when the dialog was created
expired_at: Timestamp when the dialog expires (default: far future)
metadata: Additional metadata as key-value pairs
chunks: List of chunks from the conversation
config_id: Configuration ID used to process this dialog
Properties:
content: Formatted string representation of the dialog
"""
id: str = Field(default_factory=lambda: uuid4().hex, description="A unique identifier for the dialog.")
context: ConversationContext = Field(..., description="The full conversation context as a single string.")
dialog_embedding: Optional[List[float]] = Field(None, description="The embedding vector of the dialog.")
ref_id: str = Field(..., description="Refer to external dialog id. This is used to link to the original dialog.")
group_id: str = Field(default=..., description="Group ID of dialogue data")
user_id: str = Field(..., description="USER ID of dialogue data")
apply_id: str = Field(..., description="APPLY ID of dialogue data")
run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the dialog was created.")
expired_at: datetime = Field(default_factory=lambda: datetime(9999, 12, 31), description="The timestamp when the dialog expires.")
metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata for the dialog.")
chunks: List[Chunk] = Field(default_factory=list, description="A list of chunks from the conversation context.")
config_id: Optional[int | str] = Field(None, description="Configuration ID used to process this dialog (integer or string)")
@property
def content(self) -> str:
"""Get the content of the dialog as a formatted string.
Returns:
String representation of the conversation context
"""
return self.context.content
def get_statement_chunk(self, statement_id: str) -> Optional[Chunk]:
"""Find the chunk containing a specific statement.
Args:
statement_id: ID of the statement to find
Returns:
Chunk containing the statement, or None if not found
"""
for chunk in self.chunks:
for statement in chunk.statements:
if statement.id == statement_id:
return chunk
return None
def get_all_statements(self) -> List[Statement]:
"""Get all statements from all chunks.
Returns:
List of all statements in the dialog
"""
all_statements = []
for chunk in self.chunks:
all_statements.extend(chunk.statements)
return all_statements
def get_statement_by_id(self, statement_id: str) -> Optional[Statement]:
    """Look up a statement by its ID across all chunks.

    Chunks and their statements are scanned in order; the first statement
    whose ``id`` equals ``statement_id`` is returned.

    Args:
        statement_id: ID of the statement to look for

    Returns:
        The matching statement, or None when nothing matches
    """
    candidates = (
        item
        for piece in self.chunks
        for item in piece.statements
        if item.id == statement_id
    )
    return next(candidates, None)
def get_triplets_for_statement(self, statement_id: str) -> List[Triplet]:
    """Return the triplets extracted from one statement.

    Args:
        statement_id: ID of the statement

    Returns:
        The triplets stored in the statement's ``triplet_extraction_info``;
        an empty list when the statement is not found or carries no
        extraction info
    """
    found = self.get_statement_by_id(statement_id)
    if not found:
        return []
    extraction = found.triplet_extraction_info
    if not extraction:
        return []
    return extraction.triplets
def assign_group_id_to_statements(self) -> None:
    """Propagate this dialog's group_id to statements that lack one.

    Only statements whose ``group_id`` is None are updated; any other
    value is left untouched.
    """
    dialog_group = self.group_id
    for piece in self.chunks:
        for item in piece.statements:
            if item.group_id is not None:
                continue
            item.group_id = dialog_group

View File

@@ -0,0 +1,85 @@
"""Models for knowledge triplets and entities.
This module contains Pydantic models for representing extracted knowledge
in the form of entities and triplets (subject-predicate-object relationships).
Classes:
Entity: Represents an extracted entity
Triplet: Represents a knowledge triplet (subject-predicate-object)
TripletExtractionResponse: Response model containing extracted triplets and entities
"""
from typing import List, Optional
from pydantic import BaseModel, Field, ConfigDict
from uuid import uuid4
class Entity(BaseModel):
    """An entity extracted from dialogue.

    Attributes:
        id: String identifier; a fresh ``uuid4().hex`` is generated when omitted
        entity_idx: Required integer index of the entity
        name: Required name of the entity
        name_embedding: Optional embedding vector of the name (defaults to None)
        type: Required type/category label (e.g., 'Person', 'Organization')
        description: Required textual description of the entity

    Config:
        extra: Unknown fields (e.g. from LLM output) are ignored
    """

    model_config = ConfigDict(extra="ignore")

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the entity.",
    )
    # NOTE(review): this description repeats the one used for `id`, although
    # the field holds a numeric index — left unchanged because it is runtime text.
    entity_idx: int = Field(
        default=...,
        description="Unique identifier for the entity",
    )
    name: str = Field(default=..., description="Name of the entity")
    name_embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding vector for the entity name",
    )
    type: str = Field(default=..., description="Type/category of the entity")
    description: str = Field(default=..., description="Description of the entity")
class Triplet(BaseModel):
    """A knowledge triplet (subject-predicate-object) extracted from a statement.

    Each triplet links a subject entity to an object entity through a
    predicate and serves as the basic unit of the knowledge graph.

    Attributes:
        id: String identifier; a fresh ``uuid4().hex`` is generated when omitted
        statement_id: Optional ID of the parent statement (set programmatically)
        subject_name: Required name of the subject entity
        subject_id: Required integer ID of the subject entity
        predicate: Required relationship between subject and object
        object_name: Required name of the object entity
        object_id: Required integer ID of the object entity
        value: Optional extra value or context for the relationship

    Config:
        extra: Unknown fields (e.g. from LLM output) are ignored
    """

    model_config = ConfigDict(extra="ignore")

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the triplet.",
    )
    statement_id: Optional[str] = Field(
        default=None,
        description="ID of the parent statement this triplet was extracted from.",
    )

    # Subject side of the relationship.
    subject_name: str = Field(default=..., description="Name of the subject entity")
    subject_id: int = Field(default=..., description="ID of the subject entity")

    predicate: str = Field(
        default=...,
        description="Relationship/predicate between subject and object",
    )

    # Object side of the relationship.
    object_name: str = Field(default=..., description="Name of the object entity")
    object_id: int = Field(default=..., description="ID of the object entity")

    value: Optional[str] = Field(default=None, description="Additional value or context")
class TripletExtractionResponse(BaseModel):
    """Structured LLM output for triplet extraction.

    Bundles the knowledge triplets and the entities the LLM extracted
    from statements.

    Attributes:
        triplets: Extracted knowledge triplets (empty list by default)
        entities: Extracted entities (empty list by default)

    Config:
        extra: Unknown fields (e.g. from LLM output) are ignored
    """

    model_config = ConfigDict(extra="ignore")

    triplets: List[Triplet] = Field(
        default_factory=list,
        description="List of extracted triplets",
    )
    entities: List[Entity] = Field(
        default_factory=list,
        description="List of extracted entities",
    )

View File

@@ -0,0 +1,151 @@
"""Variable configuration models for extraction pipeline components.
This module contains Pydantic models for configuring various aspects
of the extraction pipeline, including statement extraction, triplet extraction,
temporal extraction, deduplication, and forgetting mechanisms.
Classes:
StatementExtractionConfig: Configuration for statement extraction
ForgettingEngineConfig: Configuration for forgetting engine
TripletExtractionConfig: Configuration for triplet extraction
TemporalExtractionConfig: Configuration for temporal extraction
DedupConfig: Configuration for entity deduplication
ExtractionPipelineConfig: Combined configuration for entire pipeline
"""
from typing import Optional
from pydantic import BaseModel, Field
class StatementExtractionConfig(BaseModel):
    """Settings that control statement extraction.

    Attributes:
        statement_granularity: Optional granularity level, 1 to 3 (default: None):
            - 1: Split sentences into different statements
            - 2: Sentence-level statements
            - 3: Combine sentences, shorten long statements
        temperature: LLM temperature for statement extraction (0-2, default: 0.1)
        include_dialogue_context: Whether the full dialogue context is included (default: True)
        max_dialogue_context_chars: Character budget for dialogue context (at least 100, default: 2000)
    """

    statement_granularity: Optional[int] = Field(
        default=None,
        ge=1,
        le=3,
        description="Granularity of statements to extract, level 1 to 3",
    )
    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for statement extraction",
    )
    include_dialogue_context: bool = Field(
        default=True,
        description="Whether to include full dialogue context in extraction",
    )
    max_dialogue_context_chars: Optional[int] = Field(
        default=2000,
        ge=100,
        description="Maximum number of characters to include from dialogue context",
    )
class ForgettingEngineConfig(BaseModel):
    """Settings for the forgetting engine.

    The forgetting engine models memory decay driven by elapsed time and
    memory strength.

    Attributes:
        offset: Minimum retention level that prevents complete forgetting (0-1, default: 0.1)
        lambda_time: Strictly positive lambda for the time decay effect (default: 0.1)
        lambda_mem: Strictly positive lambda for the memory strength effect (default: 1.0)
    """

    offset: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum retention level (prevents complete forgetting).",
    )
    lambda_time: float = Field(
        default=0.1,
        gt=0.0,
        description="Lambda parameter controlling time effect.",
    )
    lambda_mem: float = Field(
        default=1.0,
        gt=0.0,
        description="Lambda parameter controlling memory strength effect.",
    )
class TripletExtractionConfig(BaseModel):
    """Settings that control triplet extraction.

    Attributes:
        temperature: LLM temperature for triplet extraction (0-2, default: 0.1)
        enable_entity_normalization: Whether entity names are normalized (default: True)
        confidence_threshold: Minimum confidence for extracted triplets (0-1, default: 0.7)
    """

    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for triplet extraction",
    )
    enable_entity_normalization: bool = Field(
        default=True,
        description="Whether to normalize entity names",
    )
    confidence_threshold: Optional[float] = Field(
        default=0.7,
        ge=0,
        le=1,
        description="Minimum confidence threshold for extracted triplets",
    )
class TemporalExtractionConfig(BaseModel):
    """Settings that control temporal extraction.

    Attributes:
        temperature: LLM temperature for temporal extraction (0-2, default: 0.1)
    """

    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for temporal extraction",
    )
class DedupConfig(BaseModel):
    """Settings that control entity deduplication.

    Covers the multi-stage deduplication process: fuzzy matching,
    LLM-based deduplication, and disambiguation.

    Attributes:
        enable_llm_dedup_blockwise: Turn on blockwise LLM-driven deduplication (default: False)
        enable_llm_disambiguation: Turn on LLM disambiguation for same-name different-type entities (default: False)
        enable_llm_fallback_only_on_borderline: Trigger the LLM only when borderline pairs exist (default: True)
        fuzzy_name_threshold_strict: Strict name similarity threshold (0-1, default: 0.90)
        fuzzy_type_threshold_strict: Strict type similarity threshold (0-1, default: 0.75)
        fuzzy_overall_threshold: Overall similarity needed to merge (0-1, default: 0.82)
        fuzzy_unknown_type_name_threshold: Name threshold when an entity type is UNKNOWN (0-1, default: 0.92)
        fuzzy_unknown_type_type_threshold: Type threshold when an entity type is UNKNOWN (0-1, default: 0.50)
        name_weight: Share of name similarity in the overall score (0-1, default: 0.50)
        desc_weight: Share of description similarity in the overall score (0-1, default: 0.30)
        type_weight: Share of type similarity in the overall score (0-1, default: 0.20)
        context_bonus: Bonus when entities co-occur in the same statements (0-0.2, default: 0.03)
        llm_fallback_floor: Lower bound of the borderline score range (0-1, default: 0.76)
        llm_fallback_ceiling: Upper bound of the borderline score range (0-1, default: 0.82)
        llm_block_size: Entities per block for LLM dedup (1-500, default: 50)
        llm_block_concurrency: Blocks processed concurrently by the LLM (1-64, default: 4)
        llm_pair_concurrency: Concurrent pairwise decisions per block (1-64, default: 4)
        llm_max_rounds: Maximum number of iterative LLM dedup rounds (1-10, default: 3)
    """

    # --- Switches for the LLM-driven stages ---
    enable_llm_dedup_blockwise: bool = Field(
        default=False,
        description="Toggle blockwise LLM-driven deduplication",
    )
    enable_llm_disambiguation: bool = Field(
        default=False,
        description="Toggle LLM-driven disambiguation for same-name different-type entities",
    )
    enable_llm_fallback_only_on_borderline: bool = Field(
        default=True,
        description="Trigger LLM dedup only when borderline pairs are detected in fuzzy stage",
    )

    # --- Thresholds for the fuzzy matching stage ---
    fuzzy_name_threshold_strict: float = Field(
        default=0.90,
        ge=0,
        le=1,
        description="Strict threshold for name similarity",
    )
    fuzzy_type_threshold_strict: float = Field(
        default=0.75,
        ge=0,
        le=1,
        description="Strict threshold for type similarity",
    )
    fuzzy_overall_threshold: float = Field(
        default=0.82,
        ge=0,
        le=1,
        description="Overall similarity threshold to merge",
    )

    # --- Thresholds used when an entity type is UNKNOWN ---
    fuzzy_unknown_type_name_threshold: float = Field(
        default=0.92,
        ge=0,
        le=1,
        description="Name threshold when any entity type is UNKNOWN",
    )
    fuzzy_unknown_type_type_threshold: float = Field(
        default=0.50,
        ge=0,
        le=1,
        description="Type threshold when any entity type is UNKNOWN",
    )

    # --- Weights of the components of the overall similarity score ---
    name_weight: float = Field(
        default=0.50,
        ge=0,
        le=1,
        description="Weight of name similarity in overall score",
    )
    desc_weight: float = Field(
        default=0.30,
        ge=0,
        le=1,
        description="Weight of description similarity in overall score",
    )
    type_weight: float = Field(
        default=0.20,
        ge=0,
        le=1,
        description="Weight of type similarity in overall score",
    )
    context_bonus: float = Field(
        default=0.03,
        ge=0,
        le=0.2,
        description="Bonus added to score when entities co-occur in same statements",
    )

    # --- Score range treated as borderline, which triggers the LLM fallback ---
    llm_fallback_floor: float = Field(
        default=0.76,
        ge=0,
        le=1,
        description="Lower bound of overall score to consider as borderline for LLM fallback",
    )
    llm_fallback_ceiling: float = Field(
        default=0.82,
        ge=0,
        le=1,
        description="Upper bound (below merge threshold) of overall score for LLM fallback",
    )

    # --- Sizing of the iterative LLM dedup ---
    llm_block_size: int = Field(
        default=50,
        ge=1,
        le=500,
        description="Entities per block for LLM dedup",
    )
    llm_block_concurrency: int = Field(
        default=4,
        ge=1,
        le=64,
        description="Concurrent blocks processed by LLM",
    )
    llm_pair_concurrency: int = Field(
        default=4,
        ge=1,
        le=64,
        description="Concurrent pairwise decisions per block",
    )
    llm_max_rounds: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Maximum LLM iterative dedup rounds",
    )
class ExtractionPipelineConfig(BaseModel):
    """Aggregate configuration for the whole extraction pipeline.

    Groups the per-stage configuration models; every section falls back to
    a default-constructed instance of its model when not supplied.

    Attributes:
        statement_extraction: Settings for statement extraction
        triplet_extraction: Settings for triplet extraction
        temporal_extraction: Settings for temporal extraction
        deduplication: Settings for entity deduplication
        forgetting_engine: Settings for the forgetting engine
    """

    statement_extraction: StatementExtractionConfig = Field(
        default_factory=StatementExtractionConfig,
    )
    triplet_extraction: TripletExtractionConfig = Field(
        default_factory=TripletExtractionConfig,
    )
    temporal_extraction: TemporalExtractionConfig = Field(
        default_factory=TemporalExtractionConfig,
    )
    deduplication: DedupConfig = Field(
        default_factory=DedupConfig,
    )
    forgetting_engine: ForgettingEngineConfig = Field(
        default_factory=ForgettingEngineConfig,
    )