feat(ontology): add learning/education domain predicates

Add new predicates for the learning/education domain to support educational content extraction. Also add a field validator that filters empty statements out of the extraction response, to handle malformed LLM outputs.
2025-12-28 18:05:04 +08:00
parent 78744e7151
commit 4d187b9c19
2 changed files with 54 additions and 2 deletions
--- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py
+++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py
@@ -15,7 +15,7 @@ from app.core.memory.utils.data.ontology import (
    TemporalInfo,
 )
 from app.core.memory.utils.prompt.prompt_utils import render_statement_extraction_prompt
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator

 logger = logging.getLogger(__name__)

@@ -29,6 +29,33 @@ class ExtractedStatement(BaseModel):
 # 统一使用 StatementExtractionResponse 作为 LLM 的结构化返回（仅语句）
 class StatementExtractionResponse(BaseModel):
    statements: List[ExtractedStatement] = Field(default_factory=list, description="List of extracted statements")
+
+    @field_validator('statements', mode='before')
+    @classmethod
+    def filter_empty_statements(cls, v):
+        """Drop list items that cannot become a statement before validation.
+
+        Handles malformed LLM responses containing empty dicts (e.g. after
+        truncation or parsing issues, especially with providers like Bedrock that
+        don't support with_structured_output). Non-list input is returned as is;
+        model instances and dicts with a non-empty 'statement' are kept.
+        """
+        if not isinstance(v, list):
+            return v
+
+        valid_statements = []
+        for i, stmt in enumerate(v):
+            # Keep already-built models and dicts with a non-empty 'statement';
+            # every other item is dropped and logged rather than silently lost.
+            if isinstance(stmt, BaseModel) or (isinstance(stmt, dict) and stmt.get('statement')):
+                valid_statements.append(stmt)
+            else:
+                logger.debug(f"Filtering out invalid statement at index {i}: {stmt}")
+
+        filtered_count = len(v) - len(valid_statements)
+        if filtered_count > 0:
+            logger.warning(f"Filtered out {filtered_count} empty/invalid statements from LLM response")
+        return valid_statements

 class StatementExtractor:
    """Class for extracting statements from dialog chunks using LLM (relations separated)"""
--- a/api/app/core/memory/utils/data/ontology.py
+++ b/api/app/core/memory/utils/data/ontology.py
@@ -3,9 +3,12 @@ from enum import StrEnum

 # Use jinja template.render
 PREDICATE_DEFINITIONS = {
+    # Core Relationships
    "IS_A": "Denotes a class-or-type relationship between two entities (e.g., 'Model Y IS_A electric-SUV'). Includes 'is' and 'was'.",
    "HAS_A": "Denotes a part-whole relationship between two entities (e.g., 'Model Y HAS_A electric-engine'). Includes 'has' and 'had'.",
    "LOCATED_IN": "Specifies geographic or organisational containment or proximity (e.g., headquarters LOCATED_IN Berlin).",
+    
+    # Business/Corporate
    "HOLDS_ROLE": "Connects a person to a formal office or title within an organisation (CEO, Chair, Director, etc.).",
    "PRODUCES": "Indicates that an entity manufactures, builds, or creates a product, service, or infrastructure (includes scale-ups and component inclusion).",
    "SELLS": "Marks a commercial seller-to-customer relationship for a product or service (markets, distributes, sells).",
@@ -23,10 +26,19 @@ PREDICATE_DEFINITIONS = {
    "PART_OF": "Expresses hierarchical membership or subset relationships (division, subsidiary, managed by, belongs to).",
    "DISCONTINUED": "Indicates official end-of-life, shutdown, or termination of a product, service, or relationship.",
    "SECURED": "Marks the successful acquisition of funding, contracts, assets, or rights by an entity.",
+    
+    # Learning/Education Domain (for educational/learning contexts)
+    "STUDIES": "Indicates a learning or study relationship between a person and educational content, subject, or material (e.g., '李阳 STUDIES 历史', 'student STUDIES mathematics').",
+    "COMMUNICATES_WITH": "Denotes direct communication, conversation, or interaction between two entities (e.g., '李阳 COMMUNICATES_WITH 张明'). Includes verbal and written communication.",
+    "RECORDS_IN": "Indicates recording, writing, or documenting information in a medium (e.g., '李阳 RECORDS_IN 课本', 'user RECORDS_IN notebook').",
+    "EVALUATES": "Expresses evaluation, assessment, or judgment of an entity (e.g., '老师 EVALUATES 作业', '李阳 EVALUATES 分封制').",
+    "REFERENCES": "Denotes a reference, comparison, or analogy relationship (e.g., '游戏 REFERENCES 历史', 'book REFERENCES theory').",
+    "CREATES": "Indicates creation, production, or generation of content, art, or artifacts (e.g., '李阳 CREATES 简笔画', 'artist CREATES painting').",
+    
+    # General
    "MENTIONS": "Denotes a reference or mention of an entity in a text or document.",

    # 移除了过于宽泛的谓语集合
-    # "MENTIONS": "Denotes a reference or mention of an entity in a text or document." ,
    # "FEELS" : "Denotes a subjective opinion or feeling about an entity (e.g., 'I feel like X').Includes 'THINKS'.",
    # "HELPS" :"Express a action that make it easier or possible for (someone) to do something by offering one's services or resources. Includes 'assist', 'aid' and 'support' " ,
    # "IS_DOING" : "Denotes a subjective action or activity about an entity (e.g., 'I am doing X').Includes 'DOES'.",
@@ -158,9 +170,12 @@ LABEL_DEFINITIONS: dict[str, dict[str, dict[str, str]]] = {
 class Predicate(StrEnum):
    """Enumeration of normalised predicates."""

+    # Core Relationships
    IS_A = "IS_A"
    HAS_A = "HAS_A"
    LOCATED_IN = "LOCATED_IN"
+    
+    # Business/Corporate
    HOLDS_ROLE = "HOLDS_ROLE"
    PRODUCES = "PRODUCES"
    SELLS = "SELLS"
@@ -178,6 +193,16 @@ class Predicate(StrEnum):
    PART_OF = "PART_OF"
    DISCONTINUED = "DISCONTINUED"
    SECURED = "SECURED"
+    
+    # Learning/Education Domain
+    STUDIES = "STUDIES"
+    COMMUNICATES_WITH = "COMMUNICATES_WITH"
+    RECORDS_IN = "RECORDS_IN"
+    EVALUATES = "EVALUATES"
+    REFERENCES = "REFERENCES"
+    CREATES = "CREATES"
+    
+    # General
    MENTIONS = "MENTIONS"