diff --git a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py index 17f76b17..8d37f5d2 100644 --- a/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py +++ b/api/app/core/memory/storage_services/extraction_engine/knowledge_extraction/statement_extraction.py @@ -15,7 +15,7 @@ from app.core.memory.utils.data.ontology import ( TemporalInfo, ) from app.core.memory.utils.prompt.prompt_utils import render_statement_extraction_prompt -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator logger = logging.getLogger(__name__) @@ -29,6 +29,33 @@ class ExtractedStatement(BaseModel): # 统一使用 StatementExtractionResponse 作为 LLM 的结构化返回(仅语句) class StatementExtractionResponse(BaseModel): statements: List[ExtractedStatement] = Field(default_factory=list, description="List of extracted statements") + + @field_validator('statements', mode='before') + @classmethod + def filter_empty_statements(cls, v): + """Filter out empty or invalid statement dicts before validation. + + This handles cases where the LLM returns malformed responses with empty dicts, + which can happen due to response truncation or parsing issues (especially with + providers like Bedrock that don't support with_structured_output). + """ + if isinstance(v, list): + # Filter out empty dicts or dicts missing the required 'statement' field + valid_statements = [] + filtered_count = 0 + for i, stmt in enumerate(v): + if isinstance(stmt, dict) and stmt.get('statement'): + valid_statements.append(stmt) + elif isinstance(stmt, dict): + # Log which statement was filtered + filtered_count += 1 + logger.debug(f"Filtering out invalid statement at index {i}: {stmt}") + + if filtered_count > 0: + logger.warning(f"Filtered out {filtered_count} empty/invalid statements from LLM response") + + return valid_statements + return v class StatementExtractor: """Class for extracting statements from dialog chunks using LLM (relations separated)""" diff --git a/api/app/core/memory/utils/data/ontology.py b/api/app/core/memory/utils/data/ontology.py index 19bddaa7..c25b5409 100644 --- a/api/app/core/memory/utils/data/ontology.py +++ b/api/app/core/memory/utils/data/ontology.py @@ -3,9 +3,12 @@ from enum import StrEnum # Use jinja template.render PREDICATE_DEFINITIONS = { + # Core Relationships "IS_A": "Denotes a class-or-type relationship between two entities (e.g., 'Model Y IS_A electric-SUV'). Includes 'is' and 'was'.", "HAS_A": "Denotes a part-whole relationship between two entities (e.g., 'Model Y HAS_A electric-engine'). Includes 'has' and 'had'.", "LOCATED_IN": "Specifies geographic or organisational containment or proximity (e.g., headquarters LOCATED_IN Berlin).", + + # Business/Corporate "HOLDS_ROLE": "Connects a person to a formal office or title within an organisation (CEO, Chair, Director, etc.).", "PRODUCES": "Indicates that an entity manufactures, builds, or creates a product, service, or infrastructure (includes scale-ups and component inclusion).", "SELLS": "Marks a commercial seller-to-customer relationship for a product or service (markets, distributes, sells).", @@ -23,10 +26,19 @@ PREDICATE_DEFINITIONS = { "PART_OF": "Expresses hierarchical membership or subset relationships (division, subsidiary, managed by, belongs to).", "DISCONTINUED": "Indicates official end-of-life, shutdown, or termination of a product, service, or relationship.", "SECURED": "Marks the successful acquisition of funding, contracts, assets, or rights by an entity.", + + # Learning/Education Domain (NEW - for educational/learning contexts) + "STUDIES": "Indicates a learning or study relationship between a person and educational content, subject, or material (e.g., '李阳 STUDIES 历史', 'student STUDIES mathematics').", + "COMMUNICATES_WITH": "Denotes direct communication, conversation, or interaction between two entities (e.g., '李阳 COMMUNICATES_WITH 张明'). Includes verbal and written communication.", + "RECORDS_IN": "Indicates recording, writing, or documenting information in a medium (e.g., '李阳 RECORDS_IN 课本', 'user RECORDS_IN notebook').", + "EVALUATES": "Expresses evaluation, assessment, or judgment of an entity (e.g., '老师 EVALUATES 作业', '李阳 EVALUATES 分封制').", + "REFERENCES": "Denotes a reference, comparison, or analogy relationship (e.g., '游戏 REFERENCES 历史', 'book REFERENCES theory').", + "CREATES": "Indicates creation, production, or generation of content, art, or artifacts (e.g., '李阳 CREATES 简笔画', 'artist CREATES painting').", + + # General "MENTIONS": "Denotes a reference or mention of an entity in a text or document.", # 移除了过于宽泛的谓语集合 - # "MENTIONS": "Denotes a reference or mention of an entity in a text or document." , # "FEELS" : "Denotes a subjective opinion or feeling about an entity (e.g., 'I feel like X').Includes 'THINKS'.", # "HELPS" :"Express a action that make it easier or possible for (someone) to do something by offering one's services or resources. Includes 'assist', 'aid' and 'support' " , # "IS_DOING" : "Denotes a subjective action or activity about an entity (e.g., 'I am doing X').Includes 'DOES'.", @@ -158,9 +170,12 @@ LABEL_DEFINITIONS: dict[str, dict[str, dict[str, str]]] = { class Predicate(StrEnum): """Enumeration of normalised predicates.""" + # Core Relationships IS_A = "IS_A" HAS_A = "HAS_A" LOCATED_IN = "LOCATED_IN" + + # Business/Corporate HOLDS_ROLE = "HOLDS_ROLE" PRODUCES = "PRODUCES" SELLS = "SELLS" @@ -178,6 +193,16 @@ class Predicate(StrEnum): PART_OF = "PART_OF" DISCONTINUED = "DISCONTINUED" SECURED = "SECURED" + + # Learning/Education Domain + STUDIES = "STUDIES" + COMMUNICATES_WITH = "COMMUNICATES_WITH" + RECORDS_IN = "RECORDS_IN" + EVALUATES = "EVALUATES" + REFERENCES = "REFERENCES" + CREATES = "CREATES" + + # General MENTIONS = "MENTIONS"