Initial commit
This commit is contained in:
115
app/core/memory/models/__init__.py
Normal file
115
app/core/memory/models/__init__.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""Data models for the Memory module.
|
||||
|
||||
This package contains all Pydantic models used in the memory system,
|
||||
including models for messages, dialogues, statements, entities, triplets,
|
||||
graph nodes/edges, configurations, and deduplication decisions.
|
||||
"""
|
||||
|
||||
# Base response models
|
||||
from app.core.memory.models.base_response import RobustLLMResponse
|
||||
|
||||
# Configuration models
|
||||
from app.core.memory.models.config_models import (
|
||||
LLMConfig,
|
||||
ChunkerConfig,
|
||||
PruningConfig,
|
||||
TemporalSearchParams,
|
||||
)
|
||||
|
||||
# Deduplication models
|
||||
from app.core.memory.models.dedup_models import (
|
||||
EntityDedupDecision,
|
||||
EntityDisambDecision,
|
||||
)
|
||||
|
||||
# Graph models (nodes and edges)
|
||||
from app.core.memory.models.graph_models import (
|
||||
# Edges
|
||||
Edge,
|
||||
ChunkEdge,
|
||||
ChunkEntityEdge,
|
||||
ChunkDialogEdge,
|
||||
StatementChunkEdge,
|
||||
StatementEntityEdge,
|
||||
EntityEntityEdge,
|
||||
# Nodes
|
||||
Node,
|
||||
DialogueNode,
|
||||
StatementNode,
|
||||
ChunkNode,
|
||||
ExtractedEntityNode,
|
||||
MemorySummaryNode,
|
||||
)
|
||||
|
||||
# Message and dialogue models
|
||||
from app.core.memory.models.message_models import (
|
||||
ConversationMessage,
|
||||
TemporalValidityRange,
|
||||
Statement,
|
||||
ConversationContext,
|
||||
Chunk,
|
||||
DialogData,
|
||||
)
|
||||
|
||||
# Triplet and entity models
|
||||
from app.core.memory.models.triplet_models import (
|
||||
Entity,
|
||||
Triplet,
|
||||
TripletExtractionResponse,
|
||||
)
|
||||
|
||||
# Variable configuration models
|
||||
from app.core.memory.models.variate_config import (
|
||||
StatementExtractionConfig,
|
||||
ForgettingEngineConfig,
|
||||
TripletExtractionConfig,
|
||||
TemporalExtractionConfig,
|
||||
DedupConfig,
|
||||
ExtractionPipelineConfig,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Base response
|
||||
"RobustLLMResponse",
|
||||
# Configuration
|
||||
"LLMConfig",
|
||||
"ChunkerConfig",
|
||||
"PruningConfig",
|
||||
"TemporalSearchParams",
|
||||
# Deduplication
|
||||
"EntityDedupDecision",
|
||||
"EntityDisambDecision",
|
||||
# Graph edges
|
||||
"Edge",
|
||||
"ChunkEdge",
|
||||
"ChunkEntityEdge",
|
||||
"ChunkDialogEdge",
|
||||
"StatementChunkEdge",
|
||||
"StatementEntityEdge",
|
||||
"EntityEntityEdge",
|
||||
# Graph nodes
|
||||
"Node",
|
||||
"DialogueNode",
|
||||
"StatementNode",
|
||||
"ChunkNode",
|
||||
"ExtractedEntityNode",
|
||||
"MemorySummaryNode",
|
||||
# Messages and dialogues
|
||||
"ConversationMessage",
|
||||
"TemporalValidityRange",
|
||||
"Statement",
|
||||
"ConversationContext",
|
||||
"Chunk",
|
||||
"DialogData",
|
||||
# Triplets and entities
|
||||
"Entity",
|
||||
"Triplet",
|
||||
"TripletExtractionResponse",
|
||||
# Variable configuration
|
||||
"StatementExtractionConfig",
|
||||
"ForgettingEngineConfig",
|
||||
"TripletExtractionConfig",
|
||||
"TemporalExtractionConfig",
|
||||
"DedupConfig",
|
||||
"ExtractionPipelineConfig",
|
||||
]
|
||||
59
app/core/memory/models/base_response.py
Normal file
59
app/core/memory/models/base_response.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Base classes for LLM response models with common validators.
|
||||
|
||||
This module provides reusable base classes for Pydantic models that handle
|
||||
common LLM response patterns and edge cases.
|
||||
|
||||
Classes:
|
||||
RobustLLMResponse: Base class for LLM response models with robust validation
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
|
||||
|
||||
class RobustLLMResponse(BaseModel):
    """Base class for LLM response models with tolerant input handling.

    Subclasses inherit:
    - Unwrapping of list-wrapped responses (e.g., [{"field": "value"}])
    - Extra fields in the LLM output are ignored
    - Validation is re-run on attribute assignment

    Usage:
        class MyResponse(RobustLLMResponse):
            field1: str
            field2: int
    """

    model_config = ConfigDict(
        extra="ignore",  # unknown keys are dropped instead of rejected
        validate_assignment=True,  # attribute writes are validated as well
    )

    @model_validator(mode='before')
    @classmethod
    def handle_list_input(cls, data: Any) -> Any:
        """Unwrap a list-wrapped LLM payload before field validation.

        Some LLMs return [{"field": "value"}] instead of {"field": "value"}.
        When the input is a list, its first element is used as the payload.

        Args:
            data: The raw input handed to the model.

        Returns:
            The payload dict (the input itself, or the first list element).

        Raises:
            ValueError: If the list is empty, or the resulting payload is
                not a dict.
        """
        payload = data
        if isinstance(payload, list):
            if not payload:
                raise ValueError("Received empty list from LLM")
            # Only the first element is considered; the rest is discarded.
            payload = payload[0]

        if isinstance(payload, dict):
            return payload
        raise ValueError(f"Expected dict or list, got {type(payload).__name__}")
|
||||
93
app/core/memory/models/config_models.py
Normal file
93
app/core/memory/models/config_models.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Configuration models for Memory module components.
|
||||
|
||||
This module contains Pydantic models for configuring various components
|
||||
of the memory system including LLM, chunking, pruning, and search.
|
||||
|
||||
Classes:
|
||||
LLMConfig: Configuration for LLM client
|
||||
ChunkerConfig: Configuration for dialogue chunking
|
||||
PruningConfig: Configuration for semantic pruning
|
||||
TemporalSearchParams: Parameters for temporal search queries
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class LLMConfig(BaseModel):
    """Settings for the Large Language Model client.

    Attributes:
        llm_name: Name of the LLM model to use (e.g., 'gpt-4', 'claude-3'); required
        api_base: Optional base URL for the API endpoint
        max_retries: Maximum number of retries for API calls (>= 0, default: 3)
    """

    llm_name: str = Field(
        ...,
        description="The name of the LLM model to use.",
    )
    api_base: Optional[str] = Field(
        default=None,
        description="The base URL for the API endpoint.",
    )
    max_retries: Optional[int] = Field(
        default=3,
        ge=0,
        description="The maximum number of retries for API calls.",
    )
|
||||
|
||||
|
||||
class ChunkerConfig(BaseModel):
    """Settings for the dialogue chunking strategy.

    Attributes:
        chunker_strategy: Chunking strategy name (e.g., 'RecursiveChunker', 'SemanticChunker'); required
        embedding_model: Embedding model name; required
        chunk_size: Size of each chunk (>= 0, default: 2048)
        threshold: Similarity threshold (0-1, default: 0.8)
        language: Language of the text (default: 'zh')
        skip_window: Window for skip-and-merge (>= 0, default: 0)
        min_sentences: Minimum number of sentences per chunk (>= 0, default: 1)
        min_characters_per_chunk: Minimum characters per chunk (>= 0, default: 24)
    """

    chunker_strategy: str = Field(
        ...,
        description="The name of the chunker strategy to use.",
    )
    embedding_model: str = Field(
        ...,
        description="The name of the embedding model to use.",
    )
    chunk_size: Optional[int] = Field(
        default=2048,
        ge=0,
        description="The size of each chunk.",
    )
    threshold: Optional[float] = Field(
        default=0.8,
        ge=0,
        le=1,
        description="The threshold for similarity.",
    )
    language: Optional[str] = Field(
        default="zh",
        description="The language of the text.",
    )
    skip_window: Optional[int] = Field(
        default=0,
        ge=0,
        description="The window for skip-and-merge.",
    )
    min_sentences: Optional[int] = Field(
        default=1,
        ge=0,
        description="The minimum number of sentences in each chunk.",
    )
    min_characters_per_chunk: Optional[int] = Field(
        default=24,
        ge=0,
        description="The minimum number of characters in each chunk.",
    )
|
||||
|
||||
|
||||
class PruningConfig(BaseModel):
    """Settings for semantic pruning of dialogue content.

    Attributes:
        pruning_switch: Turns semantic pruning on when True (default: False)
        pruning_scene: Pruning scene, documented as one of 'education',
            'online_service', 'outbound' (default: 'education'); the value
            is a plain string and is not restricted to those names here
        pruning_threshold: Pruning ratio, constrained to 0-0.9 (default: 0.5)
    """

    pruning_switch: bool = Field(
        default=False,
        description="Enable semantic pruning when True.",
    )
    pruning_scene: str = Field(
        default="education",
        description="Scene for pruning: one of 'education', 'online_service', 'outbound'.",
    )
    pruning_threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=0.9,
        description="Pruning ratio within 0-0.9 (max 0.9 to avoid termination).",
    )
|
||||
|
||||
|
||||
class TemporalSearchParams(BaseModel):
    """Filter parameters for temporal search queries in the knowledge graph.

    All date fields are stored as plain optional strings; no format is
    enforced by this model (the intended format is 'YYYY-MM-DD').

    Attributes:
        group_id: Group ID filter (default: 'test')
        apply_id: Application ID filter
        user_id: User ID filter
        start_date: Start date for the search
        end_date: End date for the search
        valid_date: Valid date for the search
        invalid_date: Invalid date for the search
        limit: Maximum number of results to return (default: 3)
    """

    group_id: Optional[str] = Field(
        default="test",
        description="The group ID to filter the search.",
    )
    apply_id: Optional[str] = Field(
        default=None,
        description="The apply ID to filter the search.",
    )
    user_id: Optional[str] = Field(
        default=None,
        description="The user ID to filter the search.",
    )
    start_date: Optional[str] = Field(
        default=None,
        description="The start date for the search.",
    )
    end_date: Optional[str] = Field(
        default=None,
        description="The end date for the search.",
    )
    valid_date: Optional[str] = Field(
        default=None,
        description="The valid date for the search.",
    )
    invalid_date: Optional[str] = Field(
        default=None,
        description="The invalid date for the search.",
    )
    limit: int = Field(
        default=3,
        description="The maximum number of results to return.",
    )
|
||||
|
||||
|
||||
52
app/core/memory/models/dedup_models.py
Normal file
52
app/core/memory/models/dedup_models.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Models for entity deduplication and disambiguation decisions.
|
||||
|
||||
This module contains Pydantic models for structured LLM responses
|
||||
during entity deduplication and disambiguation processes.
|
||||
|
||||
Classes:
|
||||
EntityDedupDecision: Decision model for entity deduplication
|
||||
EntityDisambDecision: Decision model for entity disambiguation
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class EntityDedupDecision(BaseModel):
    """Structured LLM decision for entity deduplication.

    Captures whether two entities refer to the same real-world entity and
    should therefore be merged.

    Attributes:
        same_entity: Whether the two entities refer to the same entity
        confidence: Confidence of the decision (0.0 to 1.0)
        canonical_idx: Index of the canonical entity among the pair
            (0 or 1; -1 if not applicable) — the range is documented only,
            not validated by this model
        reason: Short rationale (1-3 sentences)
    """

    same_entity: bool = Field(
        ...,
        description="Two entities refer to the same entity",
    )
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Confidence of the decision",
    )
    canonical_idx: int = Field(
        ...,
        description="Index of canonical entity among the pair: 0 or 1; -1 if not applicable",
    )
    reason: str = Field(
        ...,
        description="Short rationale, 1-3 sentences",
    )
|
||||
|
||||
|
||||
class EntityDisambDecision(BaseModel):
    """Structured disambiguation decision for same-name, different-type entities.

    Captures whether two entities that share a name but differ in type
    should be merged or kept separate.

    Attributes:
        should_merge: Whether to merge the pair despite the type difference
        confidence: Confidence of the decision (0.0 to 1.0)
        canonical_idx: Index of the canonical entity among the pair
            (0 or 1; -1 if not applicable) — the range is documented only,
            not validated by this model
        block_pair: When True, block this pair from fuzzy or heuristic merges
            (default: False)
        suggested_type: Optional unified entity type to use when merging
        reason: Short rationale (1-3 sentences)
    """

    should_merge: bool = Field(
        ...,
        description="Merge the pair despite type difference",
    )
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Confidence of the decision",
    )
    canonical_idx: int = Field(
        ...,
        description="Index of canonical entity among the pair: 0 or 1; -1 if not applicable",
    )
    block_pair: bool = Field(
        default=False,
        description="Block this pair from fuzzy or heuristic merges",
    )
    suggested_type: Optional[str] = Field(
        default=None,
        description="Unified entity type when merging",
    )
    reason: str = Field(
        ...,
        description="Short rationale, 1-3 sentences",
    )
|
||||
304
app/core/memory/models/graph_models.py
Normal file
304
app/core/memory/models/graph_models.py
Normal file
@@ -0,0 +1,304 @@
|
||||
"""Graph models for Neo4j knowledge graph nodes and edges.
|
||||
|
||||
This module contains Pydantic models representing nodes and edges
|
||||
in the Neo4j knowledge graph, including dialogues, statements,
|
||||
chunks, entities, and their relationships.
|
||||
|
||||
Classes:
|
||||
Edge: Base class for all graph edges
|
||||
ChunkEdge: Edge connecting chunks
|
||||
ChunkEntityEdge: Edge connecting chunks to entities
|
||||
ChunkDialogEdge: Edge connecting chunks to dialogues
|
||||
StatementChunkEdge: Edge connecting statements to chunks
|
||||
StatementEntityEdge: Edge connecting statements to entities
|
||||
EntityEntityEdge: Edge connecting related entities
|
||||
Node: Base class for all graph nodes
|
||||
DialogueNode: Node representing a dialogue
|
||||
StatementNode: Node representing a statement
|
||||
ChunkNode: Node representing a conversation chunk
|
||||
ExtractedEntityNode: Node representing an extracted entity
|
||||
MemorySummaryNode: Node representing a memory summary
|
||||
"""
|
||||
|
||||
from uuid import uuid4
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
import re
|
||||
|
||||
from app.core.memory.utils.data.ontology import TemporalInfo
|
||||
|
||||
|
||||
def parse_historical_datetime(v):
    """Parse a date value of any supported year, including historical dates.

    Python's ``datetime`` supports years 1 through 9999. This function parses
    ISO 8601 strings by hand so that years written with 1 to 4 digits
    (e.g. year 755) are accepted.

    Accepted string shapes (matched as a prefix of the input, so trailing
    text after the recognised part is ignored):
    ``YYYY-MM-DD`` or ``YYYY-MM-DDTHH:MM:SS[.ffffff][Z|±HH:MM]``.

    Args:
        v: The date value. May be ``None``, a ``datetime`` or a string;
            any other type is returned unchanged.

    Returns:
        - ``None`` / ``datetime`` inputs are returned as-is.
        - For strings: a ``datetime`` (timezone-aware when the string carries
          ``Z`` or a ``±HH:MM`` offset, naive otherwise), or ``None`` when
          the string cannot be parsed or holds an invalid date (month 13,
          day 32, ...).
        - Any other input type is returned unchanged.
    """
    if v is None or isinstance(v, datetime):
        return v

    if isinstance(v, str):
        # Imported locally so the function does not depend on a module-level
        # ``timedelta`` import.
        from datetime import timedelta

        # ISO 8601: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS[.ffffff][Z|±HH:MM],
        # with a 1-4 digit year. Group 8 captures the timezone designator.
        pattern = (
            r'^(\d{1,4})-(\d{2})-(\d{2})'
            r'(?:T(\d{2}):(\d{2}):(\d{2})(?:\.(\d+))?(Z|[+-]\d{2}:\d{2})?)?'
        )
        match = re.match(pattern, v)

        if match:
            try:
                year = int(match.group(1))
                month = int(match.group(2))
                day = int(match.group(3))
                hour = int(match.group(4)) if match.group(4) else 0
                minute = int(match.group(5)) if match.group(5) else 0
                second = int(match.group(6)) if match.group(6) else 0
                microsecond = 0

                # Fractional seconds: pad or truncate to exactly 6 digits.
                if match.group(7):
                    us_str = match.group(7).ljust(6, '0')[:6]
                    microsecond = int(us_str)

                # Timezone: honour the actual designator. An explicit offset
                # must keep its own offset; labelling it as UTC would shift
                # the instant by that offset.
                tzinfo = None
                designator = match.group(8)
                if designator == 'Z':
                    tzinfo = timezone.utc
                elif designator:
                    sign = -1 if designator[0] == '-' else 1
                    offset = timedelta(
                        hours=int(designator[1:3]),
                        minutes=int(designator[4:6]),
                    )
                    tzinfo = timezone(sign * offset)

                return datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tzinfo)

            except (ValueError, OverflowError):
                # Invalid calendar value (e.g. month 13, day 32) or an
                # out-of-range UTC offset.
                return None

        # Pattern did not match: fall back to the standard ISO parser.
        try:
            return datetime.fromisoformat(v.replace('Z', '+00:00'))
        except ValueError:
            return None

    return v
|
||||
|
||||
|
||||
class Edge(BaseModel):
    """Common base for every edge in the knowledge graph.

    Attributes:
        id: Unique identifier for the edge (random UUID hex when omitted)
        source: ID of the source node
        target: ID of the target node
        group_id: Group ID of the edge
        user_id: User ID of the edge
        apply_id: Application ID of the edge
        run_id: Identifier of the pipeline run (random UUID hex when omitted)
        created_at: Creation timestamp (system perspective)
        expired_at: Optional expiry timestamp (system perspective)
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the edge.",
    )
    source: str = Field(
        ...,
        description="The ID of the source node.",
    )
    target: str = Field(
        ...,
        description="The ID of the target node.",
    )
    group_id: str = Field(
        ...,
        description="The group ID of the edge.",
    )
    user_id: str = Field(
        ...,
        description="The user ID of the edge.",
    )
    apply_id: str = Field(
        ...,
        description="The apply ID of the edge.",
    )
    run_id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for this pipeline run.",
    )
    created_at: datetime = Field(
        ...,
        description="The valid time of the edge from system perspective.",
    )
    expired_at: Optional[datetime] = Field(
        default=None,
        description="The expired time of the edge from system perspective.",
    )
|
||||
|
||||
|
||||
class ChunkEdge(Edge):
    """Edge linking two chunks in sequence; adds no fields beyond ``Edge``."""
|
||||
|
||||
|
||||
class ChunkEntityEdge(Edge):
    """Edge linking a chunk to an entity mentioned in it; adds no fields beyond ``Edge``."""
|
||||
|
||||
|
||||
class ChunkDialogEdge(Edge):
    """Edge linking a chunk to its parent dialog.

    Attributes:
        sequence_number: Position of this chunk within the dialog (required)
    """

    sequence_number: int = Field(
        ...,
        description="Order of this chunk within the dialog",
    )
|
||||
|
||||
|
||||
class StatementChunkEdge(Edge):
    """Edge linking a statement to its parent chunk; adds no fields beyond ``Edge``."""
|
||||
|
||||
|
||||
class StatementEntityEdge(Edge):
    """Edge linking a statement to an entity extracted from it.

    Attributes:
        connect_strength: Connection strength label ('Strong' or 'Weak');
            stored as a free-form string, not validated here
    """

    connect_strength: str = Field(
        ...,
        description="Strong VS Weak about this statement-entity edge",
    )
|
||||
|
||||
|
||||
class EntityEntityEdge(Edge):
    """Edge linking two related entities (derived from triplet relationships).

    Attributes:
        relation_type: Relation type as defined in the ontology
        relation_value: Optional value of the relation
        statement: Statement text of the edge
        source_statement_id: ID of the statement the relationship was extracted from
        valid_at: Optional start of temporal validity
        invalid_at: Optional end of temporal validity
    """

    relation_type: str = Field(
        ...,
        description="Relation type as defined in ontology",
    )
    relation_value: Optional[str] = Field(
        default=None,
        description="Value of the relation",
    )
    statement: str = Field(
        ...,
        description='The statement of the edge.',
    )
    source_statement_id: str = Field(
        ...,
        description="Statement where this relationship was extracted",
    )
    valid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity start",
    )
    invalid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity end",
    )

    @field_validator('valid_at', 'invalid_at', mode='before')
    @classmethod
    def validate_datetime(cls, v):
        """Pre-process the raw value with the shared historical-date parser."""
        parsed = parse_historical_datetime(v)
        return parsed
|
||||
|
||||
|
||||
class Node(BaseModel):
    """Base class for all graph nodes in the knowledge graph.

    Attributes:
        id: Unique identifier for the node (required)
        name: Name of the node
        group_id: Group ID of the node
        user_id: User ID of the node
        apply_id: Application ID of the node
        run_id: Identifier of the pipeline run (random UUID hex when omitted)
        created_at: Creation timestamp (system perspective)
        expired_at: Optional expiry timestamp (system perspective)
    """

    id: str = Field(..., description="The unique identifier for the node.")
    name: str = Field(..., description="The name of the node.")
    group_id: str = Field(..., description="The group ID of the node.")
    # The two descriptions below previously said "of the edge" (copied from
    # Edge); they describe node fields and surface in the generated schema.
    user_id: str = Field(..., description="The user ID of the node.")
    apply_id: str = Field(..., description="The apply ID of the node.")
    run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
    created_at: datetime = Field(..., description="The valid time of the node from system perspective.")
    expired_at: Optional[datetime] = Field(None, description="The expired time of the node from system perspective.")
|
||||
|
||||
|
||||
class DialogueNode(Node):
    """Graph node for a dialogue.

    Attributes:
        ref_id: Reference identifier of the dialog
        content: Dialogue content
        dialog_embedding: Optional embedding vector of the dialogue
        config_id: Optional configuration ID used to process this dialogue
            (integer or string)
    """

    ref_id: str = Field(
        ...,
        description="Reference identifier of the dialog",
    )
    content: str = Field(
        ...,
        description="Dialogue content",
    )
    dialog_embedding: Optional[List[float]] = Field(
        default=None,
        description="Dialog embedding vector",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this dialogue (integer or string)",
    )
|
||||
|
||||
|
||||
class StatementNode(Node):
    """Graph node for a statement extracted from dialogue.

    Attributes:
        chunk_id: ID of the parent chunk
        stmt_type: Type of the statement
        temporal_info: Temporal information
        statement: The statement text content
        connect_strength: Strength classification ('Strong' or 'Weak');
            stored as a free-form string
        valid_at: Optional start of temporal validity
        invalid_at: Optional end of temporal validity
        statement_embedding: Optional embedding vector of the statement
        chunk_embedding: Optional embedding vector of the parent chunk
        config_id: Optional configuration ID used to process this statement
            (integer or string)
    """

    chunk_id: str = Field(
        ...,
        description="ID of the parent chunk",
    )
    stmt_type: str = Field(
        ...,
        description="Type of the statement",
    )
    temporal_info: TemporalInfo = Field(
        ...,
        description="Temporal information",
    )
    statement: str = Field(
        ...,
        description="The statement text content",
    )
    connect_strength: str = Field(
        ...,
        description="Strong VS Weak classification of this statement",
    )
    valid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity start",
    )
    invalid_at: Optional[datetime] = Field(
        default=None,
        description="Temporal validity end",
    )
    statement_embedding: Optional[List[float]] = Field(
        default=None,
        description="Statement embedding vector",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="Chunk embedding vector",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this statement (integer or string)",
    )

    @field_validator('valid_at', 'invalid_at', mode='before')
    @classmethod
    def validate_datetime(cls, v):
        """Pre-process the raw value with the shared historical-date parser."""
        parsed = parse_historical_datetime(v)
        return parsed
|
||||
|
||||
|
||||
class ChunkNode(Node):
    """Graph node for a chunk of conversation.

    Attributes:
        dialog_id: ID of the parent dialog
        content: Text content of the chunk
        chunk_embedding: Optional embedding vector of the chunk
        sequence_number: Position of this chunk within the dialog
        metadata: Additional chunk metadata (empty dict when omitted)
    """

    dialog_id: str = Field(
        ...,
        description="ID of the parent dialog",
    )
    content: str = Field(
        ...,
        description="The text content of the chunk",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="Chunk embedding vector",
    )
    sequence_number: int = Field(
        ...,
        description="Order of this chunk within the dialog",
    )
    metadata: dict = Field(
        default_factory=dict,
        description="Additional chunk metadata",
    )
|
||||
|
||||
|
||||
class ExtractedEntityNode(Node):
    """Graph node for an extracted entity.

    Attributes:
        entity_idx: Numeric identifier for the entity
        statement_id: ID of the statement this entity was extracted from
        entity_type: Type of the entity
        description: Entity description
        aliases: Entity aliases (empty list when omitted; None is accepted)
        name_embedding: Name embedding vector (empty list when omitted;
            None is accepted)
        fact_summary: Summary of the fact about this entity
        connect_strength: Strength classification ('Strong' or 'Weak');
            stored as a free-form string
        config_id: Optional configuration ID used to process this entity
            (integer or string)
    """

    entity_idx: int = Field(
        ...,
        description="Unique identifier for the entity",
    )
    statement_id: str = Field(
        ...,
        description="Statement this entity was extracted from",
    )
    entity_type: str = Field(
        ...,
        description="Type of the entity",
    )
    description: str = Field(
        ...,
        description="Entity description",
    )
    aliases: Optional[List[str]] = Field(
        default_factory=list,
        description="Entity aliases",
    )
    name_embedding: Optional[List[float]] = Field(
        default_factory=list,
        description="Name embedding vector",
    )
    fact_summary: str = Field(
        ...,
        description="Summary of the fact about this entity",
    )
    connect_strength: str = Field(
        ...,
        description="Strong VS Weak about this entity",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this entity (integer or string)",
    )
|
||||
|
||||
|
||||
class MemorySummaryNode(Node):
    """Graph node for a memory summary with an optional vector embedding.

    Attributes:
        summary_id: Unique identifier for the summary (random UUID hex when omitted)
        dialog_id: ID of the parent dialog
        chunk_ids: Chunk IDs used in the summary (empty list when omitted)
        content: Summary text content
        summary_embedding: Optional embedding vector for the summary
        metadata: Additional metadata for the summary (empty dict when omitted)
        config_id: Optional configuration ID used to process this summary
            (integer or string)
    """

    summary_id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the summary",
    )
    dialog_id: str = Field(
        ...,
        description="ID of the parent dialog",
    )
    chunk_ids: List[str] = Field(
        default_factory=list,
        description="List of chunk IDs used in the summary",
    )
    content: str = Field(
        ...,
        description="Summary text content",
    )
    summary_embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding vector for the summary",
    )
    metadata: dict = Field(
        default_factory=dict,
        description="Additional metadata for the summary",
    )
    config_id: Optional[int | str] = Field(
        default=None,
        description="Configuration ID used to process this summary (integer or string)",
    )
|
||||
247
app/core/memory/models/message_models.py
Normal file
247
app/core/memory/models/message_models.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Models for dialogue messages, conversations, and statements.
|
||||
|
||||
This module contains Pydantic models for representing dialogue data,
|
||||
including messages, conversation context, chunks, and statements.
|
||||
|
||||
Classes:
|
||||
ConversationMessage: Single message in a conversation
|
||||
TemporalValidityRange: Temporal validity range for statements
|
||||
Statement: Statement extracted from dialogue with metadata
|
||||
ConversationContext: Full conversation history
|
||||
Chunk: Chunk of conversation text
|
||||
DialogData: Complete dialogue data structure
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
|
||||
from app.core.memory.utils.data.ontology import StatementType, TemporalInfo, RelevenceInfo
|
||||
from app.core.memory.models.triplet_models import TripletExtractionResponse, Triplet
|
||||
|
||||
|
||||
class ConversationMessage(BaseModel):
    """A single message in a conversation.

    Attributes:
        role: Role of the speaker (e.g., '用户' for user, 'AI' for assistant)
        msg: Text content of the message
    """

    role: str = Field(
        ...,
        description="The role of the speaker (e.g., '用户', 'AI').",
    )
    msg: str = Field(
        ...,
        description="The text content of the message.",
    )
|
||||
|
||||
|
||||
class TemporalValidityRange(BaseModel):
    """Temporal validity range of a statement.

    Both bounds are optional plain strings; the 'YYYY-MM-DD' format is the
    documented convention and is not enforced by this model.

    Attributes:
        valid_at: Start date of validity (None when not specified)
        invalid_at: End date of validity (None when not specified)
    """

    valid_at: Optional[str] = Field(
        default=None,
        description="The start date of the statement's validity, in 'YYYY-MM-DD' format or 'None'.",
    )
    invalid_at: Optional[str] = Field(
        default=None,
        description="The end date of the statement's validity, in 'YYYY-MM-DD' format or 'None'.",
    )
|
||||
|
||||
|
||||
class Statement(BaseModel):
    """A statement extracted from dialogue, together with its metadata.

    Attributes:
        id: Unique identifier for the statement (random UUID hex when omitted)
        chunk_id: ID of the parent chunk this statement belongs to
        group_id: Optional ID of the group this statement belongs to
        statement: Text content of the statement
        statement_embedding: Optional embedding vector of the statement
        stmt_type: Type of the statement
        temporal_info: Temporal information of the statement
        relevence_info: Relevance information (default: RelevenceInfo.RELEVANT)
        connect_strength: Optional strength label ('Strong' or 'Weak')
        temporal_validity: Optional temporal validity range
        triplet_extraction_info: Optional triplet extraction results
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the statement.",
    )
    chunk_id: str = Field(
        ...,
        description="ID of the parent chunk this statement belongs to.",
    )
    group_id: Optional[str] = Field(
        default=None,
        description="ID of the group this statement belongs to.",
    )
    statement: str = Field(
        ...,
        description="The text content of the statement.",
    )
    statement_embedding: Optional[List[float]] = Field(
        default=None,
        description="The embedding vector of the statement.",
    )
    stmt_type: StatementType = Field(
        ...,
        description="The type of the statement.",
    )
    temporal_info: TemporalInfo = Field(
        ...,
        description="The temporal information of the statement.",
    )
    relevence_info: RelevenceInfo = Field(
        default=RelevenceInfo.RELEVANT,
        description="The relevence information of the statement.",
    )
    connect_strength: Optional[str] = Field(
        default=None,
        description="Strong VS Weak about this entity",
    )
    temporal_validity: Optional[TemporalValidityRange] = Field(
        default=None,
        description="The temporal validity range of the statement.",
    )
    triplet_extraction_info: Optional[TripletExtractionResponse] = Field(
        default=None,
        description="The triplet extraction information of the statement.",
    )
|
||||
|
||||
|
||||
class ConversationContext(BaseModel):
    """Message history of one conversation.

    Attributes:
        msgs: The messages that make up the conversation.

    Properties:
        content: The conversation rendered as text, one "role: message"
            line per message.
    """

    msgs: List[ConversationMessage] = Field(
        default=...,
        description="A list of messages in the conversation.",
    )

    @property
    def content(self) -> str:
        """Render every message as a "role: message" line.

        Returns:
            The rendered lines joined with newline characters.
        """
        rendered_lines = (f"{entry.role}: {entry.msg}" for entry in self.msgs)
        return "\n".join(rendered_lines)
|
||||
|
||||
class Chunk(BaseModel):
    """A slice of a conversation together with what was extracted from it.

    Attributes:
        id: Unique identifier of the chunk (random UUID hex by default).
        text: The messages that belong to the chunk.
        content: The chunk rendered as a single formatted string.
        statements: Statements extracted from this chunk.
        chunk_embedding: Optional embedding vector of the chunk.
        metadata: Free-form key-value metadata.
    """

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="A unique identifier for the chunk.",
    )
    text: List[ConversationMessage] = Field(
        default_factory=list,
        description="A list of messages in the chunk.",
    )
    content: str = Field(
        default=...,
        description="The content of the chunk as a string.",
    )
    statements: List[Statement] = Field(
        default_factory=list,
        description="A list of statements in the chunk.",
    )
    chunk_embedding: Optional[List[float]] = Field(
        default=None,
        description="The embedding vector of the chunk.",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata for the chunk.",
    )

    @classmethod
    def from_messages(cls, messages: List[ConversationMessage], metadata: Optional[Dict[str, Any]] = None):
        """Build a chunk whose ``content`` is derived from ``messages``.

        Args:
            messages: Conversation messages to place in the chunk.
            metadata: Optional metadata; an empty dict is used when omitted.

        Returns:
            A new chunk whose ``content`` holds one "role: message" line per
            message, joined by newlines.
        """
        rendered = "\n".join(f"{entry.role}: {entry.msg}" for entry in messages)
        return cls(
            text=messages,
            content=rendered,
            metadata={} if metadata is None else metadata,
        )
|
||||
|
||||
|
||||
class DialogData(BaseModel):
    """Complete data structure for one dialog record.

    Attributes:
        id: Unique identifier for the dialog (random UUID hex by default).
        context: Full conversation context.
        dialog_embedding: Optional embedding vector for the entire dialog.
        ref_id: Reference ID linking to the external dialog system.
        group_id: Group ID of the dialog (required).
        user_id: User ID of the dialog (required).
        apply_id: Application ID of the dialog (required).
        run_id: Unique identifier for this pipeline run (random UUID hex by
            default).
        created_at: Timestamp when the dialog was created (defaults to
            ``datetime.now()`` at construction time).
        expired_at: Timestamp when the dialog expires (default: 9999-12-31).
        metadata: Additional metadata as key-value pairs.
        chunks: Chunks produced from the conversation.
        config_id: Optional configuration ID (integer or string) used to
            process this dialog.

    Properties:
        content: Formatted string representation of the dialog.
    """

    id: str = Field(default_factory=lambda: uuid4().hex, description="A unique identifier for the dialog.")
    # NOTE(review): the description below says "single string" although the
    # field holds a ConversationContext; the text is kept unchanged because it
    # is part of the generated schema.
    context: ConversationContext = Field(..., description="The full conversation context as a single string.")
    dialog_embedding: Optional[List[float]] = Field(None, description="The embedding vector of the dialog.")
    ref_id: str = Field(..., description="Refer to external dialog id. This is used to link to the original dialog.")
    group_id: str = Field(default=..., description="Group ID of dialogue data")
    user_id: str = Field(..., description="USER ID of dialogue data")
    apply_id: str = Field(..., description="APPLY ID of dialogue data")
    run_id: str = Field(default_factory=lambda: uuid4().hex, description="Unique identifier for this pipeline run.")
    created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the dialog was created.")
    expired_at: datetime = Field(default_factory=lambda: datetime(9999, 12, 31), description="The timestamp when the dialog expires.")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata for the dialog.")
    chunks: List[Chunk] = Field(default_factory=list, description="A list of chunks from the conversation context.")
    config_id: Optional[int | str] = Field(None, description="Configuration ID used to process this dialog (integer or string)")

    @property
    def content(self) -> str:
        """Get the content of the dialog as a formatted string.

        Returns:
            The ``content`` of the underlying conversation context.
        """
        return self.context.content

    def get_statement_chunk(self, statement_id: str) -> Optional[Chunk]:
        """Find the chunk containing a specific statement.

        Args:
            statement_id: ID of the statement to find.

        Returns:
            The first chunk holding a statement with that ID, or None if no
            chunk does.
        """
        for chunk in self.chunks:
            if any(stmt.id == statement_id for stmt in chunk.statements):
                return chunk
        return None

    def get_all_statements(self) -> List[Statement]:
        """Get all statements from all chunks.

        Returns:
            A new list with every statement, in chunk order.
        """
        return [stmt for chunk in self.chunks for stmt in chunk.statements]

    def get_statement_by_id(self, statement_id: str) -> Optional[Statement]:
        """Find a specific statement by its ID.

        Args:
            statement_id: ID of the statement to find.

        Returns:
            The first statement with the given ID, or None if not found.
        """
        # Lazy generator: stops at the first match instead of scanning all chunks.
        matches = (
            stmt
            for chunk in self.chunks
            for stmt in chunk.statements
            if stmt.id == statement_id
        )
        return next(matches, None)

    def get_triplets_for_statement(self, statement_id: str) -> List[Triplet]:
        """Get all triplets extracted from a specific statement.

        Args:
            statement_id: ID of the statement.

        Returns:
            The statement's extracted triplets, or an empty list when the
            statement is unknown or has no triplet extraction info.
        """
        statement = self.get_statement_by_id(statement_id)
        if statement is None or not statement.triplet_extraction_info:
            return []
        return statement.triplet_extraction_info.triplets

    def assign_group_id_to_statements(self) -> None:
        """Assign this dialog's group_id to all statements in all chunks.

        Only statements whose ``group_id`` is None are updated; statements
        that already carry a group ID are left untouched.
        """
        for chunk in self.chunks:
            for statement in chunk.statements:
                if statement.group_id is None:
                    statement.group_id = self.group_id
|
||||
85
app/core/memory/models/triplet_models.py
Normal file
85
app/core/memory/models/triplet_models.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Models for knowledge triplets and entities.
|
||||
|
||||
This module contains Pydantic models for representing extracted knowledge
|
||||
in the form of entities and triplets (subject-predicate-object relationships).
|
||||
|
||||
Classes:
|
||||
Entity: Represents an extracted entity
|
||||
Triplet: Represents a knowledge triplet (subject-predicate-object)
|
||||
TripletExtractionResponse: Response model containing extracted triplets and entities
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel, Field, ConfigDict
|
||||
from uuid import uuid4
|
||||
|
||||
|
||||
class Entity(BaseModel):
    """An entity extracted from dialogue.

    Attributes:
        id: Unique string identifier (random UUID hex by default).
        entity_idx: Numeric index of the entity.
        name: Name of the entity.
        name_embedding: Optional embedding vector of the entity name.
        type: Type/category of the entity (e.g., 'Person', 'Organization').
        description: Textual description of the entity.

    Config:
        extra: Extra fields (e.g. from LLM output) are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the entity.",
    )
    entity_idx: int = Field(
        default=...,
        description="Unique identifier for the entity",
    )
    name: str = Field(default=..., description="Name of the entity")
    name_embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding vector for the entity name",
    )
    type: str = Field(default=..., description="Type/category of the entity")
    description: str = Field(default=..., description="Description of the entity")
|
||||
|
||||
|
||||
class Triplet(BaseModel):
    """A knowledge triplet (subject-predicate-object).

    Each triplet records a relationship between two entities and serves as
    the basic unit of knowledge in the knowledge graph.

    Attributes:
        id: Unique string identifier (random UUID hex by default).
        statement_id: Optional ID of the parent statement (set
            programmatically).
        subject_name: Name of the subject entity.
        subject_id: Numeric ID of the subject entity.
        predicate: Relationship between subject and object.
        object_name: Name of the object entity.
        object_id: Numeric ID of the object entity.
        value: Optional additional value or context for the relationship.

    Config:
        extra: Extra fields (e.g. from LLM output) are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    id: str = Field(
        default_factory=lambda: uuid4().hex,
        description="Unique identifier for the triplet.",
    )
    statement_id: Optional[str] = Field(
        default=None,
        description="ID of the parent statement this triplet was extracted from.",
    )
    subject_name: str = Field(default=..., description="Name of the subject entity")
    subject_id: int = Field(default=..., description="ID of the subject entity")
    predicate: str = Field(
        default=...,
        description="Relationship/predicate between subject and object",
    )
    object_name: str = Field(default=..., description="Name of the object entity")
    object_id: int = Field(default=..., description="ID of the object entity")
    value: Optional[str] = Field(default=None, description="Additional value or context")
|
||||
|
||||
|
||||
class TripletExtractionResponse(BaseModel):
    """Structured LLM output for triplet extraction.

    Holds the knowledge triplets and the entities that the LLM extracted
    from statements.

    Attributes:
        triplets: Extracted knowledge triplets (empty list by default).
        entities: Extracted entities (empty list by default).

    Config:
        extra: Extra fields (e.g. from LLM output) are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    triplets: List[Triplet] = Field(
        default_factory=list,
        description="List of extracted triplets",
    )
    entities: List[Entity] = Field(
        default_factory=list,
        description="List of extracted entities",
    )
|
||||
151
app/core/memory/models/variate_config.py
Normal file
151
app/core/memory/models/variate_config.py
Normal file
@@ -0,0 +1,151 @@
|
||||
"""Variable configuration models for extraction pipeline components.
|
||||
|
||||
This module contains Pydantic models for configuring various aspects
|
||||
of the extraction pipeline, including statement extraction, triplet extraction,
|
||||
temporal extraction, deduplication, and forgetting mechanisms.
|
||||
|
||||
Classes:
|
||||
StatementExtractionConfig: Configuration for statement extraction
|
||||
ForgettingEngineConfig: Configuration for forgetting engine
|
||||
TripletExtractionConfig: Configuration for triplet extraction
|
||||
TemporalExtractionConfig: Configuration for temporal extraction
|
||||
DedupConfig: Configuration for entity deduplication
|
||||
ExtractionPipelineConfig: Combined configuration for entire pipeline
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class StatementExtractionConfig(BaseModel):
    """Settings that control statement extraction.

    Attributes:
        statement_granularity: Optional granularity level, constrained to
            1-3 (default: None):
            - 1: Split sentences into different statements
            - 2: Sentence-level statements
            - 3: Combine sentences, shorten long statements
        temperature: LLM temperature for statement extraction, constrained
            to 0-2 (default: 0.1).
        include_dialogue_context: Whether to include the full dialogue
            context in extraction (default: True).
        max_dialogue_context_chars: Maximum number of characters taken from
            the dialogue context, at least 100 (default: 2000).
    """

    statement_granularity: Optional[int] = Field(
        default=None,
        ge=1,
        le=3,
        description="Granularity of statements to extract, level 1 to 3",
    )
    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for statement extraction",
    )
    include_dialogue_context: bool = Field(
        default=True,
        description="Whether to include full dialogue context in extraction",
    )
    max_dialogue_context_chars: Optional[int] = Field(
        default=2000,
        ge=100,
        description="Maximum number of characters to include from dialogue context",
    )
|
||||
|
||||
|
||||
class ForgettingEngineConfig(BaseModel):
    """Settings for the forgetting engine.

    The forgetting engine implements a memory decay mechanism driven by time
    and memory-strength parameters.

    Attributes:
        offset: Minimum retention level, constrained to 0-1; prevents
            complete forgetting (default: 0.1).
        lambda_time: Lambda parameter for the time effect; must be greater
            than 0 (default: 0.1).
        lambda_mem: Lambda parameter for the memory strength effect; must be
            greater than 0 (default: 1.0).
    """

    offset: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum retention level (prevents complete forgetting).",
    )
    lambda_time: float = Field(
        default=0.1,
        gt=0.0,
        description="Lambda parameter controlling time effect.",
    )
    lambda_mem: float = Field(
        default=1.0,
        gt=0.0,
        description="Lambda parameter controlling memory strength effect.",
    )
|
||||
|
||||
|
||||
class TripletExtractionConfig(BaseModel):
    """Settings that control triplet extraction.

    Attributes:
        temperature: LLM temperature for triplet extraction, constrained to
            0-2 (default: 0.1).
        enable_entity_normalization: Whether to normalize entity names
            (default: True).
        confidence_threshold: Minimum confidence for extracted triplets,
            constrained to 0-1 (default: 0.7).
    """

    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for triplet extraction",
    )
    enable_entity_normalization: bool = Field(
        default=True,
        description="Whether to normalize entity names",
    )
    confidence_threshold: Optional[float] = Field(
        default=0.7,
        ge=0,
        le=1,
        description="Minimum confidence threshold for extracted triplets",
    )
|
||||
|
||||
|
||||
class TemporalExtractionConfig(BaseModel):
    """Settings that control temporal extraction.

    Attributes:
        temperature: LLM temperature for temporal extraction, constrained to
            0-2 (default: 0.1).
    """

    temperature: Optional[float] = Field(
        default=0.1,
        ge=0,
        le=2,
        description="LLM temperature for temporal extraction",
    )
|
||||
|
||||
|
||||
class DedupConfig(BaseModel):
    """Settings for entity deduplication.

    Controls the multi-stage deduplication process: fuzzy matching,
    LLM-based deduplication, and disambiguation.

    Attributes:
        enable_llm_dedup_blockwise: Enable blockwise LLM-driven
            deduplication (default: False).
        enable_llm_disambiguation: Enable LLM disambiguation for same-name
            different-type entities (default: False).
        enable_llm_fallback_only_on_borderline: Only trigger the LLM when
            borderline pairs exist (default: True).
        fuzzy_name_threshold_strict: Strict name-similarity threshold
            (0-1, default: 0.90).
        fuzzy_type_threshold_strict: Strict type-similarity threshold
            (0-1, default: 0.75).
        fuzzy_overall_threshold: Overall similarity threshold to merge
            (0-1, default: 0.82).
        fuzzy_unknown_type_name_threshold: Name threshold when an entity
            type is UNKNOWN (0-1, default: 0.92).
        fuzzy_unknown_type_type_threshold: Type threshold when an entity
            type is UNKNOWN (0-1, default: 0.50).
        name_weight: Weight of name similarity in the overall score
            (0-1, default: 0.50).
        desc_weight: Weight of description similarity in the overall score
            (0-1, default: 0.30).
        type_weight: Weight of type similarity in the overall score
            (0-1, default: 0.20).
        context_bonus: Bonus when entities co-occur in the same statements
            (0-0.2, default: 0.03).
        llm_fallback_floor: Lower bound of the borderline score range
            (0-1, default: 0.76).
        llm_fallback_ceiling: Upper bound of the borderline score range
            (0-1, default: 0.82).
        llm_block_size: Entities per block for LLM dedup
            (1-500, default: 50).
        llm_block_concurrency: Concurrent blocks processed by the LLM
            (1-64, default: 4).
        llm_pair_concurrency: Concurrent pairwise decisions per block
            (1-64, default: 4).
        llm_max_rounds: Maximum LLM iterative dedup rounds
            (1-10, default: 3).
    """

    # --- Switches for the LLM-driven stages ---
    enable_llm_dedup_blockwise: bool = Field(
        default=False,
        description="Toggle blockwise LLM-driven deduplication",
    )
    enable_llm_disambiguation: bool = Field(
        default=False,
        description="Toggle LLM-driven disambiguation for same-name different-type entities",
    )
    enable_llm_fallback_only_on_borderline: bool = Field(
        default=True,
        description="Trigger LLM dedup only when borderline pairs are detected in fuzzy stage",
    )

    # --- Fuzzy matching thresholds ---
    fuzzy_name_threshold_strict: float = Field(
        default=0.90,
        ge=0,
        le=1,
        description="Strict threshold for name similarity",
    )
    fuzzy_type_threshold_strict: float = Field(
        default=0.75,
        ge=0,
        le=1,
        description="Strict threshold for type similarity",
    )
    fuzzy_overall_threshold: float = Field(
        default=0.82,
        ge=0,
        le=1,
        description="Overall similarity threshold to merge",
    )

    # --- Thresholds applied when an entity type is UNKNOWN ---
    fuzzy_unknown_type_name_threshold: float = Field(
        default=0.92,
        ge=0,
        le=1,
        description="Name threshold when any entity type is UNKNOWN",
    )
    fuzzy_unknown_type_type_threshold: float = Field(
        default=0.50,
        ge=0,
        le=1,
        description="Type threshold when any entity type is UNKNOWN",
    )

    # --- Components of the weighted overall similarity score ---
    name_weight: float = Field(
        default=0.50,
        ge=0,
        le=1,
        description="Weight of name similarity in overall score",
    )
    desc_weight: float = Field(
        default=0.30,
        ge=0,
        le=1,
        description="Weight of description similarity in overall score",
    )
    type_weight: float = Field(
        default=0.20,
        ge=0,
        le=1,
        description="Weight of type similarity in overall score",
    )
    context_bonus: float = Field(
        default=0.03,
        ge=0,
        le=0.2,
        description="Bonus added to score when entities co-occur in same statements",
    )

    # --- Score range treated as borderline for the LLM fallback ---
    llm_fallback_floor: float = Field(
        default=0.76,
        ge=0,
        le=1,
        description="Lower bound of overall score to consider as borderline for LLM fallback",
    )
    llm_fallback_ceiling: float = Field(
        default=0.82,
        ge=0,
        le=1,
        description="Upper bound (below merge threshold) of overall score for LLM fallback",
    )

    # --- Parameters of the iterative LLM dedup ---
    llm_block_size: int = Field(
        default=50,
        ge=1,
        le=500,
        description="Entities per block for LLM dedup",
    )
    llm_block_concurrency: int = Field(
        default=4,
        ge=1,
        le=64,
        description="Concurrent blocks processed by LLM",
    )
    llm_pair_concurrency: int = Field(
        default=4,
        ge=1,
        le=64,
        description="Concurrent pairwise decisions per block",
    )
    llm_max_rounds: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Maximum LLM iterative dedup rounds",
    )
|
||||
|
||||
|
||||
class ExtractionPipelineConfig(BaseModel):
    """Aggregate configuration for the whole extraction pipeline.

    Bundles the per-stage configuration models; each attribute defaults to a
    freshly constructed instance of its config class.

    Attributes:
        statement_extraction: Statement extraction settings.
        triplet_extraction: Triplet extraction settings.
        temporal_extraction: Temporal extraction settings.
        deduplication: Entity deduplication settings.
        forgetting_engine: Forgetting engine settings.
    """

    statement_extraction: StatementExtractionConfig = Field(
        default_factory=StatementExtractionConfig,
    )
    triplet_extraction: TripletExtractionConfig = Field(
        default_factory=TripletExtractionConfig,
    )
    temporal_extraction: TemporalExtractionConfig = Field(
        default_factory=TemporalExtractionConfig,
    )
    deduplication: DedupConfig = Field(
        default_factory=DedupConfig,
    )
    forgetting_engine: ForgettingEngineConfig = Field(
        default_factory=ForgettingEngineConfig,
    )
|
||||
Reference in New Issue
Block a user