diff --git a/api/app/controllers/knowledge_controller.py b/api/app/controllers/knowledge_controller.py index bca69050..deed5723 100644 --- a/api/app/controllers/knowledge_controller.py +++ b/api/app/controllers/knowledge_controller.py @@ -1,26 +1,28 @@ -from typing import Optional import datetime import json +from typing import Optional import uuid + from fastapi import APIRouter, Depends, HTTPException, status, Query from sqlalchemy import or_ from sqlalchemy.orm import Session +from app.celery_app import celery_app +from app.core.logging_config import get_api_logger +from app.core.rag.common import settings +from app.core.rag.llm.chat_model import Base +from app.core.rag.nlp import rag_tokenizer, search +from app.core.rag.prompts.generator import graph_entity_types +from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory +from app.core.response_utils import success from app.db import get_db from app.dependencies import get_current_user from app.models.user_model import User from app.models import knowledge_model, document_model, file_model from app.schemas import knowledge_schema from app.schemas.response_schema import ApiResponse -from app.core.response_utils import success from app.services import knowledge_service, document_service -from app.core.rag.llm.chat_model import Base -from app.core.rag.prompts.generator import graph_entity_types -from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory -from app.core.logging_config import get_api_logger -from app.core.rag.nlp import rag_tokenizer, search -from app.core.rag.common import settings -from app.celery_app import celery_app +from app.services.model_service import ModelConfigService # Obtain a dedicated API logger api_logger = get_api_logger() @@ -47,6 +49,45 @@ def get_parser_types(): return success(msg="Successfully obtained the knowledge parser type", data=list(knowledge_model.ParserType)) +@router.get("/knowledge_graph_entity_types", response_model=ApiResponse) +async def get_knowledge_graph_entity_types( + llm_id: uuid.UUID, + scenario: str, + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user) +): + """ + get knowledge graph entity types based on llm_id + """ + api_logger.info(f"Obtain details of the knowledge graph: llm_id={llm_id}, username: {current_user.username}") + + try: + # 1. Check whether the model exists + api_logger.debug(f"Check whether the model exists: {llm_id}") + config = ModelConfigService.get_model_by_id(db=db, model_id=llm_id) + + if not config: + api_logger.warning( + f"The model does not exist or you do not have permission to access it: llm_id={llm_id}") + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="The model does not exist or you do not have permission to access it" + ) + # 2. Prepare to configure chat_mdl information + chat_model = Base( + key=config.api_keys[0].api_key, + model_name=config.api_keys[0].model_name, + base_url=config.api_keys[0].api_base + ) + response = graph_entity_types(chat_model, scenario) + return success(data=response, msg="Successfully obtained knowledge graph entity types") + except HTTPException: + raise + except Exception as e: + api_logger.error(f"get knowledge graph entity types failed: llm_id={llm_id} - {str(e)}") + raise + + @router.get("/knowledges", response_model=ApiResponse) async def get_knowledges( parent_id: Optional[uuid.UUID] = Query(None, description="parent folder id"), @@ -379,7 +420,7 @@ async def delete_knowledge_graph( current_user: User = Depends(get_current_user) ): """ - Soft-delete knowledge graph + delete knowledge graph """ api_logger.info(f"Request to delete knowledge graph: knowledge_id={knowledge_id}, username: {current_user.username}") @@ -442,42 +483,3 @@ async def rebuild_knowledge_graph( except Exception as e: api_logger.error(f"Failed to rebuild knowledge graph: knowledge_id={knowledge_id} - {str(e)}") raise - - -@router.get("/{knowledge_id}/knowledge_graph_entity_types", response_model=ApiResponse) -async def get_knowledge_graph_entity_types( - knowledge_id: uuid.UUID, - scenario: str, - db: Session = Depends(get_db), - current_user: User = Depends(get_current_user) -): - """ - get knowledge graph entity types based on knowledge_id - """ - api_logger.info(f"Obtain details of the knowledge graph: knowledge_id={knowledge_id}, username: {current_user.username}") - - try: - # 1. Check whether the knowledge base exists - api_logger.debug(f"Check whether the knowledge base exists: {knowledge_id}") - db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=knowledge_id, current_user=current_user) - - if not db_knowledge: - api_logger.warning( - f"The knowledge base does not exist or you do not have permission to access it: knowledge_id={knowledge_id}") - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="The knowledge base does not exist or you do not have permission to access it" - ) - # 2. Prepare to configure chat_mdl information - chat_model = Base( - key=db_knowledge.llm.api_keys[0].api_key, - model_name=db_knowledge.llm.api_keys[0].model_name, - base_url=db_knowledge.llm.api_keys[0].api_base - ) - response = graph_entity_types(chat_model, scenario) - return success(data=response, msg="Successfully obtained knowledge graph entity types") - except HTTPException: - raise - except Exception as e: - api_logger.error(f"get knowledge graph entity types failed: knowledge_id={knowledge_id} - {str(e)}") - raise diff --git a/api/app/core/memory/storage_services/reflection_engine/example/example.json b/api/app/core/memory/storage_services/reflection_engine/example/example.json index 18a2b185..09429829 100644 --- a/api/app/core/memory/storage_services/reflection_engine/example/example.json +++ b/api/app/core/memory/storage_services/reflection_engine/example/example.json @@ -2,52 +2,39 @@ "memory_verify": { "source_data": [ { - "statement_name": "用户是2023年春天去北京工作的。", - "statement_id": "62beac695b1346f4871740a45db88782" + "statement_name": "我是 2023 年春天去北京工作的,后来基本一直都在北京上班,也没怎么换过城市。不过后来公司调整,2024 年上半年我被调到上海待了差不多半年,那段时间每天都是在上海办公室打卡。当时入职资料用的还是我之前的身份信息,身份证号是 11010119950308123X,银行卡是 6222023847595898,这些一直没变。对了,其实我 从 2023 年开始就一直在北京生活,从来没有长期离开过北京,上海那段更多算是远程配合。" }, { - "statement_name": "用户后来基本一直都在北京上班。", - "statement_id": "4cba5ac08b674d7fb1e2ae634d2b8f0b" + "statement_name": "用户后来基本一直都在北京上班。" }, { - "statement_name": "用户从2023年开始就一直在北京生活。", - "statement_id": "e612a44da4db483993c350df7c97a1a1" + "statement_name": "用户从2023年开始就一直在北京生活。" }, { - "statement_name": "用户从来没有长期离开过北京。", - "statement_id": "b3c787a2e33c49f7981accabbbb4538a" + "statement_name": "用户从来没有长期离开过北京。" }, { - "statement_name": "由于公司调整,用户在2024年上半年被调到上海待了差不多半年。", - "statement_id": "64cde4230cb24a4da726e7db9e7aa616" + "statement_name": "由于公司调整,用户在2024年上半年被调到上海待了差不多半年。" }, { - "statement_name": "用户在被调到上海期间每天都是在上海办公室打卡。", - "statement_id": "8b1b12e23b844b8088dfeb67da6ad669" + "statement_name": "用户在被调到上海期间每天都是在上海办公室打卡。" }, { - "statement_name": "用户在入职时使用的身份信息是之前的,身份证号为11010119950308123X。", - "statement_id": "030afd362e9b4110b139e68e5d3e7143" + "statement_name": "用户在入职时使用的身份信息是之前的,身份证号为11010119950308123X。" }, { - "statement_name": "用户的银行卡号是6222023847595898。", - "statement_id": "6c7567cd1f3c478bb42d1b65383e6f2f" + "statement_name": "用户的银行卡号是6222023847595898。" }, { - "statement_name": "用户的身份信息和银行卡信息一直没变。", - "statement_id": "b3ca618e1e204b83bebd70e75cf2073f" + "statement_name": "用户的身份信息和银行卡信息一直没变。" }, { - "statement_name": "用户认为在上海的那段时间更多算是远程配合。", - "statement_id": "150af89d2c154e6eb41ff1a91e37f962" + "statement_name": "用户认为在上海的那段时间更多算是远程配合。" } ], "databasets": [ { "entity1_name": "Person", - "description": "表示人类个体的通用类型", - "statement_id": "62beac695b1346f4871740a45db88782", - "entity2_name": "用户", "entity2": { "description": "叙述者,讲述个人工作与生活经历的个体", "name": "用户" @@ -55,9 +42,6 @@ }, { "entity1_name": "用户", - "description": "叙述者,讲述个人工作与生活经历的个体", - "statement_id": "62beac695b1346f4871740a45db88782", - "entity2_name": "身份信息", "entity2": { "description": "用于个人身份识别的数据", "name": "身份信息" @@ -65,9 +49,6 @@ }, { "entity1_name": "用户", - "description": "叙述者,讲述个人工作与生活经历的个体", - "statement_id": "62beac695b1346f4871740a45db88782", - "entity2_name": "6222023847595898", "entity2": { "description": "用户的银行卡号码", "name": "6222023847595898" @@ -76,33 +57,24 @@ { "entity1_name": "用户", "description": "叙述者,讲述个人工作与生活经历的个体", - "statement_id": "62beac695b1346f4871740a45db88782", - "entity2_name": "上海办公室", "entity2": { "entity_idx": 1, "aliases": ["上海办"], - "description": "位于上海的工作办公场所", "name": "上海办公室" } }, { "entity1_name": "用户", "description": "叙述者,讲述个人工作与生活经历的个体", - "statement_id": "62beac695b1346f4871740a45db88782", - "entity2_name": "北京", "entity2": { "aliases": ["京", "京城", "北平"], - "description": "中国的首都城市,用户主要工作和生活所在地", "name": "北京" } }, { "entity1_name": "11010119950308123X", "description": "具体的身份证号码值", - "statement_id": "030afd362e9b4110b139e68e5d3e7143", - "entity2_name": "身份证号", "entity2": { - "description": "中华人民共和国公民的身份号码", "name": "身份证号" } } diff --git a/api/app/core/memory/storage_services/reflection_engine/self_reflexion.py b/api/app/core/memory/storage_services/reflection_engine/self_reflexion.py index 97f51fb9..e9fb8855 100644 --- a/api/app/core/memory/storage_services/reflection_engine/self_reflexion.py +++ b/api/app/core/memory/storage_services/reflection_engine/self_reflexion.py @@ -387,7 +387,7 @@ class ReflectionEngine: result_data['memory_verifies'] = memory_verifies result_data['quality_assessments'] = quality_assessments conflicts_found='' - + REMOVE_KEYS = {"created_at", "expired_at","relationship","predicate","statement_id","id","statement_id","relationship_statement_id"} # Clearn conflict_data,And memory_verify和quality_assessment cleaned_conflict_data = [] for item in conflict_data: @@ -396,7 +396,23 @@ class ReflectionEngine: 'conflict': item['conflict'] } cleaned_conflict_data.append(cleaned_item) - + cleaned_conflict_data_=[] + for item in conflict_data: + cleaned_data = [] + for row in item.get("data", []): + # 删除 created_at / expired_at + cleaned_row = { + k: v + for k, v in row.items() + if k not in REMOVE_KEYS + } + cleaned_data.append(cleaned_row) + cleaned_item = { + "data": cleaned_data, + "conflict": item.get("conflict"), + } + cleaned_conflict_data_.append(cleaned_item) + print(cleaned_conflict_data_) # 3. 解决冲突 solved_data = await self._resolve_conflicts(cleaned_conflict_data, source_data) if not solved_data: diff --git a/api/app/core/memory/utils/prompt/prompts/reflexion.jinja2 b/api/app/core/memory/utils/prompt/prompts/reflexion.jinja2 index 99476c82..ed3aad32 100644 --- a/api/app/core/memory/utils/prompt/prompts/reflexion.jinja2 +++ b/api/app/core/memory/utils/prompt/prompts/reflexion.jinja2 @@ -9,9 +9,7 @@ ## 任务目标 作为数据冲突解决专家,分析冲突原因,按类型分组处理,为每种冲突生成独立解决方案。 - **数据关系**: statement_databasets中的statement_id对应data中的记录,statement_created_at为用户输入时间。 - **处理模式**: - memory_verify=false: 仅处理数据冲突 - memory_verify=true: 处理数据冲突 + 隐私脱敏 @@ -111,7 +109,8 @@ - 隐私保护优先: 所有输出记录必须完成隐私脱敏 - 脱敏变更记录: 隐私脱敏变更也必须在change字段中记录{% endif %} - 不可修改数据: 数据被判定为正确时不可修改,无数据可输出时为空 -- 输出的结果reflexion字段中的reason字段和solution不允许含有(expired_at设为2024-01-01T00:00:00Z、memory_verify=true)等原数据字段以及涉及需要修改的字段以及内容 +- 输出的结果reflexion字段中的reason字段和solution不允许含有(expired_at设为2024-01-01T00:00:00Z、memory_verify=true、memory_verify=false)等原数据字段以及涉及需要修改的字段以及内容, + ,如果是FACT,只记录事实冲突相关的数据;如果是TIME,只记录时间冲突相关的数据;如果是HYBRID,则记录所有冲突相关的数据 **变更记录格式**: ```json @@ -158,8 +157,9 @@ "conflict": true }, "reflexion": { - "reason": "该冲突类型的原因分析", - "solution": "该冲突类型的解决方案" + "reason": "该冲突类型的原因分析,如果是FACT就是存在事实冲突,分析该冲突原因,如果是TIME就是存在时间冲突,分析该冲突原因,如果是HYBRID,可以输出存在时间与事实的混合冲突再添加上原因分析, + 不可以随意分配冲突类型以及原因,不允许输出字段比如(statement、description、entity1_name、entity2_name、name、memory_verify、expired_at、conflict)等类似这种", + "solution": "该冲突类型的解决方案(不允许输出字段比如(statement、description、entity1_name、entity2_name、name、memory_verify、expired_at、conflict)等类似这种)" }, "resolved": { "original_memory_id": "被设为失效的记忆id", @@ -182,4 +182,5 @@ - **resolved.change**: 包含详细变更信息 - 无需修改的冲突类型resolved为null - 与baseline不匹配的冲突类型不包含在results中 -模式参考: {{ json_schema }} \ No newline at end of file +模式参考: {{ json_schema }} + diff --git a/api/app/core/rag/app/naive.py b/api/app/core/rag/app/naive.py index 6d6b933a..23f0c4ba 100644 --- a/api/app/core/rag/app/naive.py +++ b/api/app/core/rag/app/naive.py @@ -672,6 +672,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, excel_parser = ExcelParser() if parser_config.get("html4excel"): sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] + parser_config["chunk_token_num"] = 0 else: sections = [(_, "") for _ in excel_parser(binary) if _] parser_config["chunk_token_num"] = 12800 diff --git a/api/app/core/rag/deepdoc/parser/excel_parser.py b/api/app/core/rag/deepdoc/parser/excel_parser.py index f7601ee3..d66a21a8 100644 --- a/api/app/core/rag/deepdoc/parser/excel_parser.py +++ b/api/app/core/rag/deepdoc/parser/excel_parser.py @@ -5,6 +5,7 @@ from io import BytesIO import pandas as pd from openpyxl import Workbook, load_workbook +from PIL import Image from app.core.rag.nlp import find_codec @@ -28,7 +29,7 @@ class RAGExcelParser: try: file_like_object.seek(0) - df = pd.read_csv(file_like_object) + df = pd.read_csv(file_like_object, on_bad_lines='skip') return RAGExcelParser._dataframe_to_workbook(df) except Exception as e_csv: @@ -42,14 +43,11 @@ class RAGExcelParser: file_like_object.seek(0) try: dfs = pd.read_excel(file_like_object, sheet_name=None) - if isinstance(dfs, dict): - dfs = next(iter(dfs.values())) return RAGExcelParser._dataframe_to_workbook(dfs) except Exception as ex: logging.info(f"pandas with default engine load error: {ex}, try calamine instead") file_like_object.seek(0) df = pd.read_excel(file_like_object, engine="calamine") - print(df) return RAGExcelParser._dataframe_to_workbook(df) except Exception as e_pandas: raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") @@ -68,7 +66,6 @@ class RAGExcelParser: # if contains multiple sheets use _dataframes_to_workbook if isinstance(df, dict) and len(df) > 1: return RAGExcelParser._dataframes_to_workbook(df) - df = RAGExcelParser._clean_dataframe(df) wb = Workbook() ws = wb.active @@ -80,15 +77,14 @@ class RAGExcelParser: for row_num, row in enumerate(df.values, 2): for col_num, value in enumerate(row, 1): ws.cell(row=row_num, column=col_num, value=value) - return wb - + @staticmethod def _dataframes_to_workbook(dfs: dict): wb = Workbook() default_sheet = wb.active wb.remove(default_sheet) - + for sheet_name, df in dfs.items(): df = RAGExcelParser._clean_dataframe(df) ws = wb.create_sheet(title=sheet_name) @@ -99,6 +95,52 @@ class RAGExcelParser: ws.cell(row=row_num, column=col_num, value=value) return wb + @staticmethod + def _extract_images_from_worksheet(ws, sheetname=None): + """ + Extract images from a worksheet and enrich them with vision-based descriptions. + + Returns: List[dict] + """ + images = getattr(ws, "_images", []) + if not images: + return [] + + raw_items = [] + + for img in images: + try: + img_bytes = img._data() + pil_img = Image.open(BytesIO(img_bytes)).convert("RGB") + + anchor = img.anchor + if hasattr(anchor, "_from") and hasattr(anchor, "_to"): + r1, c1 = anchor._from.row + 1, anchor._from.col + 1 + r2, c2 = anchor._to.row + 1, anchor._to.col + 1 + if r1 == r2 and c1 == c2: + span = "single_cell" + else: + span = "multi_cell" + else: + r1, c1 = anchor._from.row + 1, anchor._from.col + 1 + r2, c2 = r1, c1 + span = "single_cell" + + item = { + "sheet": sheetname or ws.title, + "image": pil_img, + "image_description": "", + "row_from": r1, + "col_from": c1, + "row_to": r2, + "col_to": c2, + "span_type": span, + } + raw_items.append(item) + except Exception: + continue + return raw_items + def html(self, fnm, chunk_rows=256): from html import escape @@ -131,7 +173,7 @@ class RAGExcelParser: tb = "" tb += f"