Merge branch 'refs/heads/develop' into feature/20251219_xjn
This commit is contained in:
@@ -1,26 +1,28 @@
|
|||||||
from typing import Optional
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
from typing import Optional
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, status, Query
|
from fastapi import APIRouter, Depends, HTTPException, status, Query
|
||||||
from sqlalchemy import or_
|
from sqlalchemy import or_
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.celery_app import celery_app
|
||||||
|
from app.core.logging_config import get_api_logger
|
||||||
|
from app.core.rag.common import settings
|
||||||
|
from app.core.rag.llm.chat_model import Base
|
||||||
|
from app.core.rag.nlp import rag_tokenizer, search
|
||||||
|
from app.core.rag.prompts.generator import graph_entity_types
|
||||||
|
from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
||||||
|
from app.core.response_utils import success
|
||||||
from app.db import get_db
|
from app.db import get_db
|
||||||
from app.dependencies import get_current_user
|
from app.dependencies import get_current_user
|
||||||
from app.models.user_model import User
|
from app.models.user_model import User
|
||||||
from app.models import knowledge_model, document_model, file_model
|
from app.models import knowledge_model, document_model, file_model
|
||||||
from app.schemas import knowledge_schema
|
from app.schemas import knowledge_schema
|
||||||
from app.schemas.response_schema import ApiResponse
|
from app.schemas.response_schema import ApiResponse
|
||||||
from app.core.response_utils import success
|
|
||||||
from app.services import knowledge_service, document_service
|
from app.services import knowledge_service, document_service
|
||||||
from app.core.rag.llm.chat_model import Base
|
from app.services.model_service import ModelConfigService
|
||||||
from app.core.rag.prompts.generator import graph_entity_types
|
|
||||||
from app.core.rag.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
|
||||||
from app.core.logging_config import get_api_logger
|
|
||||||
from app.core.rag.nlp import rag_tokenizer, search
|
|
||||||
from app.core.rag.common import settings
|
|
||||||
from app.celery_app import celery_app
|
|
||||||
|
|
||||||
# Obtain a dedicated API logger
|
# Obtain a dedicated API logger
|
||||||
api_logger = get_api_logger()
|
api_logger = get_api_logger()
|
||||||
@@ -47,6 +49,45 @@ def get_parser_types():
|
|||||||
return success(msg="Successfully obtained the knowledge parser type", data=list(knowledge_model.ParserType))
|
return success(msg="Successfully obtained the knowledge parser type", data=list(knowledge_model.ParserType))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/knowledge_graph_entity_types", response_model=ApiResponse)
|
||||||
|
async def get_knowledge_graph_entity_types(
|
||||||
|
llm_id: uuid.UUID,
|
||||||
|
scenario: str,
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
current_user: User = Depends(get_current_user)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
get knowledge graph entity types based on llm_id
|
||||||
|
"""
|
||||||
|
api_logger.info(f"Obtain details of the knowledge graph: llm_id={llm_id}, username: {current_user.username}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 1. Check whether the model exists
|
||||||
|
api_logger.debug(f"Check whether the model exists: {llm_id}")
|
||||||
|
config = ModelConfigService.get_model_by_id(db=db, model_id=llm_id)
|
||||||
|
|
||||||
|
if not config:
|
||||||
|
api_logger.warning(
|
||||||
|
f"The model does not exist or you do not have permission to access it: llm_id={llm_id}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_404_NOT_FOUND,
|
||||||
|
detail="The model does not exist or you do not have permission to access it"
|
||||||
|
)
|
||||||
|
# 2. Prepare to configure chat_mdl information
|
||||||
|
chat_model = Base(
|
||||||
|
key=config.api_keys[0].api_key,
|
||||||
|
model_name=config.api_keys[0].model_name,
|
||||||
|
base_url=config.api_keys[0].api_base
|
||||||
|
)
|
||||||
|
response = graph_entity_types(chat_model, scenario)
|
||||||
|
return success(data=response, msg="Successfully obtained knowledge graph entity types")
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
api_logger.error(f"get knowledge graph entity types failed: llm_id={llm_id} - {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
@router.get("/knowledges", response_model=ApiResponse)
|
@router.get("/knowledges", response_model=ApiResponse)
|
||||||
async def get_knowledges(
|
async def get_knowledges(
|
||||||
parent_id: Optional[uuid.UUID] = Query(None, description="parent folder id"),
|
parent_id: Optional[uuid.UUID] = Query(None, description="parent folder id"),
|
||||||
@@ -379,7 +420,7 @@ async def delete_knowledge_graph(
|
|||||||
current_user: User = Depends(get_current_user)
|
current_user: User = Depends(get_current_user)
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Soft-delete knowledge graph
|
delete knowledge graph
|
||||||
"""
|
"""
|
||||||
api_logger.info(f"Request to delete knowledge graph: knowledge_id={knowledge_id}, username: {current_user.username}")
|
api_logger.info(f"Request to delete knowledge graph: knowledge_id={knowledge_id}, username: {current_user.username}")
|
||||||
|
|
||||||
@@ -442,42 +483,3 @@ async def rebuild_knowledge_graph(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_logger.error(f"Failed to rebuild knowledge graph: knowledge_id={knowledge_id} - {str(e)}")
|
api_logger.error(f"Failed to rebuild knowledge graph: knowledge_id={knowledge_id} - {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{knowledge_id}/knowledge_graph_entity_types", response_model=ApiResponse)
|
|
||||||
async def get_knowledge_graph_entity_types(
|
|
||||||
knowledge_id: uuid.UUID,
|
|
||||||
scenario: str,
|
|
||||||
db: Session = Depends(get_db),
|
|
||||||
current_user: User = Depends(get_current_user)
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
get knowledge graph entity types based on knowledge_id
|
|
||||||
"""
|
|
||||||
api_logger.info(f"Obtain details of the knowledge graph: knowledge_id={knowledge_id}, username: {current_user.username}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# 1. Check whether the knowledge base exists
|
|
||||||
api_logger.debug(f"Check whether the knowledge base exists: {knowledge_id}")
|
|
||||||
db_knowledge = knowledge_service.get_knowledge_by_id(db, knowledge_id=knowledge_id, current_user=current_user)
|
|
||||||
|
|
||||||
if not db_knowledge:
|
|
||||||
api_logger.warning(
|
|
||||||
f"The knowledge base does not exist or you do not have permission to access it: knowledge_id={knowledge_id}")
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=status.HTTP_404_NOT_FOUND,
|
|
||||||
detail="The knowledge base does not exist or you do not have permission to access it"
|
|
||||||
)
|
|
||||||
# 2. Prepare to configure chat_mdl information
|
|
||||||
chat_model = Base(
|
|
||||||
key=db_knowledge.llm.api_keys[0].api_key,
|
|
||||||
model_name=db_knowledge.llm.api_keys[0].model_name,
|
|
||||||
base_url=db_knowledge.llm.api_keys[0].api_base
|
|
||||||
)
|
|
||||||
response = graph_entity_types(chat_model, scenario)
|
|
||||||
return success(data=response, msg="Successfully obtained knowledge graph entity types")
|
|
||||||
except HTTPException:
|
|
||||||
raise
|
|
||||||
except Exception as e:
|
|
||||||
api_logger.error(f"get knowledge graph entity types failed: knowledge_id={knowledge_id} - {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|||||||
@@ -2,52 +2,39 @@
|
|||||||
"memory_verify": {
|
"memory_verify": {
|
||||||
"source_data": [
|
"source_data": [
|
||||||
{
|
{
|
||||||
"statement_name": "用户是2023年春天去北京工作的。",
|
"statement_name": "我是 2023 年春天去北京工作的,后来基本一直都在北京上班,也没怎么换过城市。不过后来公司调整,2024 年上半年我被调到上海待了差不多半年,那段时间每天都是在上海办公室打卡。当时入职资料用的还是我之前的身份信息,身份证号是 11010119950308123X,银行卡是 6222023847595898,这些一直没变。对了,其实我 从 2023 年开始就一直在北京生活,从来没有长期离开过北京,上海那段更多算是远程配合。"
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户后来基本一直都在北京上班。",
|
"statement_name": "用户后来基本一直都在北京上班。"
|
||||||
"statement_id": "4cba5ac08b674d7fb1e2ae634d2b8f0b"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户从2023年开始就一直在北京生活。",
|
"statement_name": "用户从2023年开始就一直在北京生活。"
|
||||||
"statement_id": "e612a44da4db483993c350df7c97a1a1"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户从来没有长期离开过北京。",
|
"statement_name": "用户从来没有长期离开过北京。"
|
||||||
"statement_id": "b3c787a2e33c49f7981accabbbb4538a"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "由于公司调整,用户在2024年上半年被调到上海待了差不多半年。",
|
"statement_name": "由于公司调整,用户在2024年上半年被调到上海待了差不多半年。"
|
||||||
"statement_id": "64cde4230cb24a4da726e7db9e7aa616"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户在被调到上海期间每天都是在上海办公室打卡。",
|
"statement_name": "用户在被调到上海期间每天都是在上海办公室打卡。"
|
||||||
"statement_id": "8b1b12e23b844b8088dfeb67da6ad669"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户在入职时使用的身份信息是之前的,身份证号为11010119950308123X。",
|
"statement_name": "用户在入职时使用的身份信息是之前的,身份证号为11010119950308123X。"
|
||||||
"statement_id": "030afd362e9b4110b139e68e5d3e7143"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户的银行卡号是6222023847595898。",
|
"statement_name": "用户的银行卡号是6222023847595898。"
|
||||||
"statement_id": "6c7567cd1f3c478bb42d1b65383e6f2f"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户的身份信息和银行卡信息一直没变。",
|
"statement_name": "用户的身份信息和银行卡信息一直没变。"
|
||||||
"statement_id": "b3ca618e1e204b83bebd70e75cf2073f"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"statement_name": "用户认为在上海的那段时间更多算是远程配合。",
|
"statement_name": "用户认为在上海的那段时间更多算是远程配合。"
|
||||||
"statement_id": "150af89d2c154e6eb41ff1a91e37f962"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"databasets": [
|
"databasets": [
|
||||||
{
|
{
|
||||||
"entity1_name": "Person",
|
"entity1_name": "Person",
|
||||||
"description": "表示人类个体的通用类型",
|
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782",
|
|
||||||
"entity2_name": "用户",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"description": "叙述者,讲述个人工作与生活经历的个体",
|
"description": "叙述者,讲述个人工作与生活经历的个体",
|
||||||
"name": "用户"
|
"name": "用户"
|
||||||
@@ -55,9 +42,6 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"entity1_name": "用户",
|
"entity1_name": "用户",
|
||||||
"description": "叙述者,讲述个人工作与生活经历的个体",
|
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782",
|
|
||||||
"entity2_name": "身份信息",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"description": "用于个人身份识别的数据",
|
"description": "用于个人身份识别的数据",
|
||||||
"name": "身份信息"
|
"name": "身份信息"
|
||||||
@@ -65,9 +49,6 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"entity1_name": "用户",
|
"entity1_name": "用户",
|
||||||
"description": "叙述者,讲述个人工作与生活经历的个体",
|
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782",
|
|
||||||
"entity2_name": "6222023847595898",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"description": "用户的银行卡号码",
|
"description": "用户的银行卡号码",
|
||||||
"name": "6222023847595898"
|
"name": "6222023847595898"
|
||||||
@@ -76,33 +57,24 @@
|
|||||||
{
|
{
|
||||||
"entity1_name": "用户",
|
"entity1_name": "用户",
|
||||||
"description": "叙述者,讲述个人工作与生活经历的个体",
|
"description": "叙述者,讲述个人工作与生活经历的个体",
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782",
|
|
||||||
"entity2_name": "上海办公室",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"entity_idx": 1,
|
"entity_idx": 1,
|
||||||
"aliases": ["上海办"],
|
"aliases": ["上海办"],
|
||||||
"description": "位于上海的工作办公场所",
|
|
||||||
"name": "上海办公室"
|
"name": "上海办公室"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"entity1_name": "用户",
|
"entity1_name": "用户",
|
||||||
"description": "叙述者,讲述个人工作与生活经历的个体",
|
"description": "叙述者,讲述个人工作与生活经历的个体",
|
||||||
"statement_id": "62beac695b1346f4871740a45db88782",
|
|
||||||
"entity2_name": "北京",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"aliases": ["京", "京城", "北平"],
|
"aliases": ["京", "京城", "北平"],
|
||||||
"description": "中国的首都城市,用户主要工作和生活所在地",
|
|
||||||
"name": "北京"
|
"name": "北京"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"entity1_name": "11010119950308123X",
|
"entity1_name": "11010119950308123X",
|
||||||
"description": "具体的身份证号码值",
|
"description": "具体的身份证号码值",
|
||||||
"statement_id": "030afd362e9b4110b139e68e5d3e7143",
|
|
||||||
"entity2_name": "身份证号",
|
|
||||||
"entity2": {
|
"entity2": {
|
||||||
"description": "中华人民共和国公民的身份号码",
|
|
||||||
"name": "身份证号"
|
"name": "身份证号"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -387,7 +387,7 @@ class ReflectionEngine:
|
|||||||
result_data['memory_verifies'] = memory_verifies
|
result_data['memory_verifies'] = memory_verifies
|
||||||
result_data['quality_assessments'] = quality_assessments
|
result_data['quality_assessments'] = quality_assessments
|
||||||
conflicts_found=''
|
conflicts_found=''
|
||||||
|
REMOVE_KEYS = {"created_at", "expired_at","relationship","predicate","statement_id","id","statement_id","relationship_statement_id"}
|
||||||
# Clearn conflict_data,And memory_verify和quality_assessment
|
# Clearn conflict_data,And memory_verify和quality_assessment
|
||||||
cleaned_conflict_data = []
|
cleaned_conflict_data = []
|
||||||
for item in conflict_data:
|
for item in conflict_data:
|
||||||
@@ -396,7 +396,23 @@ class ReflectionEngine:
|
|||||||
'conflict': item['conflict']
|
'conflict': item['conflict']
|
||||||
}
|
}
|
||||||
cleaned_conflict_data.append(cleaned_item)
|
cleaned_conflict_data.append(cleaned_item)
|
||||||
|
cleaned_conflict_data_=[]
|
||||||
|
for item in conflict_data:
|
||||||
|
cleaned_data = []
|
||||||
|
for row in item.get("data", []):
|
||||||
|
# 删除 created_at / expired_at
|
||||||
|
cleaned_row = {
|
||||||
|
k: v
|
||||||
|
for k, v in row.items()
|
||||||
|
if k not in REMOVE_KEYS
|
||||||
|
}
|
||||||
|
cleaned_data.append(cleaned_row)
|
||||||
|
cleaned_item = {
|
||||||
|
"data": cleaned_data,
|
||||||
|
"conflict": item.get("conflict"),
|
||||||
|
}
|
||||||
|
cleaned_conflict_data_.append(cleaned_item)
|
||||||
|
print(cleaned_conflict_data_)
|
||||||
# 3. 解决冲突
|
# 3. 解决冲突
|
||||||
solved_data = await self._resolve_conflicts(cleaned_conflict_data, source_data)
|
solved_data = await self._resolve_conflicts(cleaned_conflict_data, source_data)
|
||||||
if not solved_data:
|
if not solved_data:
|
||||||
|
|||||||
@@ -9,9 +9,7 @@
|
|||||||
|
|
||||||
## 任务目标
|
## 任务目标
|
||||||
作为数据冲突解决专家,分析冲突原因,按类型分组处理,为每种冲突生成独立解决方案。
|
作为数据冲突解决专家,分析冲突原因,按类型分组处理,为每种冲突生成独立解决方案。
|
||||||
|
|
||||||
**数据关系**: statement_databasets中的statement_id对应data中的记录,statement_created_at为用户输入时间。
|
**数据关系**: statement_databasets中的statement_id对应data中的记录,statement_created_at为用户输入时间。
|
||||||
|
|
||||||
**处理模式**:
|
**处理模式**:
|
||||||
- memory_verify=false: 仅处理数据冲突
|
- memory_verify=false: 仅处理数据冲突
|
||||||
- memory_verify=true: 处理数据冲突 + 隐私脱敏
|
- memory_verify=true: 处理数据冲突 + 隐私脱敏
|
||||||
@@ -111,7 +109,8 @@
|
|||||||
- 隐私保护优先: 所有输出记录必须完成隐私脱敏
|
- 隐私保护优先: 所有输出记录必须完成隐私脱敏
|
||||||
- 脱敏变更记录: 隐私脱敏变更也必须在change字段中记录{% endif %}
|
- 脱敏变更记录: 隐私脱敏变更也必须在change字段中记录{% endif %}
|
||||||
- 不可修改数据: 数据被判定为正确时不可修改,无数据可输出时为空
|
- 不可修改数据: 数据被判定为正确时不可修改,无数据可输出时为空
|
||||||
- 输出的结果reflexion字段中的reason字段和solution不允许含有(expired_at设为2024-01-01T00:00:00Z、memory_verify=true)等原数据字段以及涉及需要修改的字段以及内容
|
- 输出的结果reflexion字段中的reason字段和solution不允许含有(expired_at设为2024-01-01T00:00:00Z、memory_verify=true、memory_verify=false)等原数据字段以及涉及需要修改的字段以及内容,
|
||||||
|
,如果是FACT,只记录事实冲突相关的数据;如果是TIME,只记录时间冲突相关的数据;如果是HYBRID,则记录所有冲突相关的数据
|
||||||
|
|
||||||
**变更记录格式**:
|
**变更记录格式**:
|
||||||
```json
|
```json
|
||||||
@@ -158,8 +157,9 @@
|
|||||||
"conflict": true
|
"conflict": true
|
||||||
},
|
},
|
||||||
"reflexion": {
|
"reflexion": {
|
||||||
"reason": "该冲突类型的原因分析",
|
"reason": "该冲突类型的原因分析,如果是FACT就是存在事实冲突,分析该冲突原因,如果是TIME就是存在时间冲突,分析该冲突原因,如果是HYBRID,可以输出存在时间与事实的混合冲突再添加上原因分析,
|
||||||
"solution": "该冲突类型的解决方案"
|
不可以随意分配冲突类型以及原因,不允许输出字段比如(statement、description、entity1_name、entity2_name、name、memory_verify、expired_at、conflict)等类似这种",
|
||||||
|
"solution": "该冲突类型的解决方案(不允许输出字段比如(statement、description、entity1_name、entity2_name、name、memory_verify、expired_at、conflict)等类似这种)"
|
||||||
},
|
},
|
||||||
"resolved": {
|
"resolved": {
|
||||||
"original_memory_id": "被设为失效的记忆id",
|
"original_memory_id": "被设为失效的记忆id",
|
||||||
@@ -182,4 +182,5 @@
|
|||||||
- **resolved.change**: 包含详细变更信息
|
- **resolved.change**: 包含详细变更信息
|
||||||
- 无需修改的冲突类型resolved为null
|
- 无需修改的冲突类型resolved为null
|
||||||
- 与baseline不匹配的冲突类型不包含在results中
|
- 与baseline不匹配的冲突类型不包含在results中
|
||||||
模式参考: {{ json_schema }}
|
模式参考: {{ json_schema }}
|
||||||
|
|
||||||
|
|||||||
@@ -672,6 +672,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
if parser_config.get("html4excel"):
|
if parser_config.get("html4excel"):
|
||||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||||
|
parser_config["chunk_token_num"] = 0
|
||||||
else:
|
else:
|
||||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||||
parser_config["chunk_token_num"] = 12800
|
parser_config["chunk_token_num"] = 12800
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from openpyxl import Workbook, load_workbook
|
from openpyxl import Workbook, load_workbook
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from app.core.rag.nlp import find_codec
|
from app.core.rag.nlp import find_codec
|
||||||
|
|
||||||
@@ -28,7 +29,7 @@ class RAGExcelParser:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
df = pd.read_csv(file_like_object)
|
df = pd.read_csv(file_like_object, on_bad_lines='skip')
|
||||||
return RAGExcelParser._dataframe_to_workbook(df)
|
return RAGExcelParser._dataframe_to_workbook(df)
|
||||||
|
|
||||||
except Exception as e_csv:
|
except Exception as e_csv:
|
||||||
@@ -42,14 +43,11 @@ class RAGExcelParser:
|
|||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
try:
|
try:
|
||||||
dfs = pd.read_excel(file_like_object, sheet_name=None)
|
dfs = pd.read_excel(file_like_object, sheet_name=None)
|
||||||
if isinstance(dfs, dict):
|
|
||||||
dfs = next(iter(dfs.values()))
|
|
||||||
return RAGExcelParser._dataframe_to_workbook(dfs)
|
return RAGExcelParser._dataframe_to_workbook(dfs)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
|
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
|
||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
df = pd.read_excel(file_like_object, engine="calamine")
|
df = pd.read_excel(file_like_object, engine="calamine")
|
||||||
print(df)
|
|
||||||
return RAGExcelParser._dataframe_to_workbook(df)
|
return RAGExcelParser._dataframe_to_workbook(df)
|
||||||
except Exception as e_pandas:
|
except Exception as e_pandas:
|
||||||
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
||||||
@@ -68,7 +66,6 @@ class RAGExcelParser:
|
|||||||
# if contains multiple sheets use _dataframes_to_workbook
|
# if contains multiple sheets use _dataframes_to_workbook
|
||||||
if isinstance(df, dict) and len(df) > 1:
|
if isinstance(df, dict) and len(df) > 1:
|
||||||
return RAGExcelParser._dataframes_to_workbook(df)
|
return RAGExcelParser._dataframes_to_workbook(df)
|
||||||
|
|
||||||
df = RAGExcelParser._clean_dataframe(df)
|
df = RAGExcelParser._clean_dataframe(df)
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
@@ -80,15 +77,14 @@ class RAGExcelParser:
|
|||||||
for row_num, row in enumerate(df.values, 2):
|
for row_num, row in enumerate(df.values, 2):
|
||||||
for col_num, value in enumerate(row, 1):
|
for col_num, value in enumerate(row, 1):
|
||||||
ws.cell(row=row_num, column=col_num, value=value)
|
ws.cell(row=row_num, column=col_num, value=value)
|
||||||
|
|
||||||
return wb
|
return wb
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _dataframes_to_workbook(dfs: dict):
|
def _dataframes_to_workbook(dfs: dict):
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
default_sheet = wb.active
|
default_sheet = wb.active
|
||||||
wb.remove(default_sheet)
|
wb.remove(default_sheet)
|
||||||
|
|
||||||
for sheet_name, df in dfs.items():
|
for sheet_name, df in dfs.items():
|
||||||
df = RAGExcelParser._clean_dataframe(df)
|
df = RAGExcelParser._clean_dataframe(df)
|
||||||
ws = wb.create_sheet(title=sheet_name)
|
ws = wb.create_sheet(title=sheet_name)
|
||||||
@@ -99,6 +95,52 @@ class RAGExcelParser:
|
|||||||
ws.cell(row=row_num, column=col_num, value=value)
|
ws.cell(row=row_num, column=col_num, value=value)
|
||||||
return wb
|
return wb
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_images_from_worksheet(ws, sheetname=None):
|
||||||
|
"""
|
||||||
|
Extract images from a worksheet and enrich them with vision-based descriptions.
|
||||||
|
|
||||||
|
Returns: List[dict]
|
||||||
|
"""
|
||||||
|
images = getattr(ws, "_images", [])
|
||||||
|
if not images:
|
||||||
|
return []
|
||||||
|
|
||||||
|
raw_items = []
|
||||||
|
|
||||||
|
for img in images:
|
||||||
|
try:
|
||||||
|
img_bytes = img._data()
|
||||||
|
pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")
|
||||||
|
|
||||||
|
anchor = img.anchor
|
||||||
|
if hasattr(anchor, "_from") and hasattr(anchor, "_to"):
|
||||||
|
r1, c1 = anchor._from.row + 1, anchor._from.col + 1
|
||||||
|
r2, c2 = anchor._to.row + 1, anchor._to.col + 1
|
||||||
|
if r1 == r2 and c1 == c2:
|
||||||
|
span = "single_cell"
|
||||||
|
else:
|
||||||
|
span = "multi_cell"
|
||||||
|
else:
|
||||||
|
r1, c1 = anchor._from.row + 1, anchor._from.col + 1
|
||||||
|
r2, c2 = r1, c1
|
||||||
|
span = "single_cell"
|
||||||
|
|
||||||
|
item = {
|
||||||
|
"sheet": sheetname or ws.title,
|
||||||
|
"image": pil_img,
|
||||||
|
"image_description": "",
|
||||||
|
"row_from": r1,
|
||||||
|
"col_from": c1,
|
||||||
|
"row_to": r2,
|
||||||
|
"col_to": c2,
|
||||||
|
"span_type": span,
|
||||||
|
}
|
||||||
|
raw_items.append(item)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
return raw_items
|
||||||
|
|
||||||
def html(self, fnm, chunk_rows=256):
|
def html(self, fnm, chunk_rows=256):
|
||||||
from html import escape
|
from html import escape
|
||||||
|
|
||||||
@@ -131,7 +173,7 @@ class RAGExcelParser:
|
|||||||
tb = ""
|
tb = ""
|
||||||
tb += f"<table><caption>{sheetname}</caption>"
|
tb += f"<table><caption>{sheetname}</caption>"
|
||||||
tb += tb_rows_0
|
tb += tb_rows_0
|
||||||
for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
|
for r in list(rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
|
||||||
tb += "<tr>"
|
tb += "<tr>"
|
||||||
for i, c in enumerate(r):
|
for i, c in enumerate(r):
|
||||||
if c.value is None:
|
if c.value is None:
|
||||||
@@ -154,7 +196,7 @@ class RAGExcelParser:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
|
logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
|
||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
df = pd.read_csv(file_like_object)
|
df = pd.read_csv(file_like_object, on_bad_lines='skip')
|
||||||
df = df.replace(r"^\s*$", "", regex=True)
|
df = df.replace(r"^\s*$", "", regex=True)
|
||||||
return df.to_markdown(index=False)
|
return df.to_markdown(index=False)
|
||||||
|
|
||||||
@@ -172,19 +214,35 @@ class RAGExcelParser:
|
|||||||
continue
|
continue
|
||||||
if not rows:
|
if not rows:
|
||||||
continue
|
continue
|
||||||
|
# 获取表头
|
||||||
ti = list(rows[0])
|
ti = list(rows[0])
|
||||||
for r in list(rows[1:]):
|
header_fields = []
|
||||||
fields = []
|
for cell in ti:
|
||||||
for i, c in enumerate(r):
|
if cell.value: # 只添加有值的表头
|
||||||
if not c.value:
|
header_fields.append(str(cell.value))
|
||||||
continue
|
|
||||||
t = str(ti[i].value) if i < len(ti) else ""
|
# 如果有数据行,处理数据行;否则只处理表头
|
||||||
t += (":" if t else "") + str(c.value)
|
data_rows = rows[1:]
|
||||||
fields.append(t)
|
if data_rows:
|
||||||
line = "; ".join(fields)
|
for r in data_rows:
|
||||||
if sheetname.lower().find("sheet") < 0:
|
fields = []
|
||||||
line += " ——" + sheetname
|
for i, c in enumerate(r):
|
||||||
res.append(line)
|
if not c.value:
|
||||||
|
continue
|
||||||
|
t = str(ti[i].value) if i < len(ti) else ""
|
||||||
|
t += (":" if t else "") + str(c.value)
|
||||||
|
fields.append(t)
|
||||||
|
line = "; ".join(fields)
|
||||||
|
if sheetname.lower().find("sheet") < 0:
|
||||||
|
line += " ——" + sheetname
|
||||||
|
res.append(line)
|
||||||
|
else:
|
||||||
|
# 只有表头的情况
|
||||||
|
if header_fields:
|
||||||
|
line = "; ".join(header_fields)
|
||||||
|
if sheetname.lower().find("sheet") < 0:
|
||||||
|
line += " ——" + sheetname
|
||||||
|
res.append(line)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -192,14 +250,14 @@ class RAGExcelParser:
|
|||||||
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
||||||
wb = RAGExcelParser._load_excel_to_workbook(BytesIO(binary))
|
wb = RAGExcelParser._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
|
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
try:
|
try:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
total += len(list(ws.rows))
|
total += len(list(ws.rows))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
return total
|
return total
|
||||||
|
|
||||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ class EntityResolution(Extractor):
|
|||||||
ans_list = []
|
ans_list = []
|
||||||
records = [r.strip() for r in results.split(record_delimiter)]
|
records = [r.strip() for r in results.split(record_delimiter)]
|
||||||
for record in records:
|
for record in records:
|
||||||
pattern_int = f"{re.escape(entity_index_delimiter)}(\d+){re.escape(entity_index_delimiter)}"
|
pattern_int = fr"{re.escape(entity_index_delimiter)}(\d+){re.escape(entity_index_delimiter)}"
|
||||||
match_int = re.search(pattern_int, record)
|
match_int = re.search(pattern_int, record)
|
||||||
res_int = int(str(match_int.group(1) if match_int else '0'))
|
res_int = int(str(match_int.group(1) if match_int else '0'))
|
||||||
if res_int > records_length:
|
if res_int > records_length:
|
||||||
|
|||||||
@@ -197,6 +197,10 @@ def validate_embedding_model(
|
|||||||
embedding_id, "embedding", db, tenant_id, required=True,
|
embedding_id, "embedding", db, tenant_id, required=True,
|
||||||
config_id=config_id, workspace_id=workspace_id
|
config_id=config_id, workspace_id=workspace_id
|
||||||
)
|
)
|
||||||
|
print(100*'-')
|
||||||
|
print(embedding_uuid)
|
||||||
|
print(_)
|
||||||
|
print(100*'-')
|
||||||
|
|
||||||
if embedding_uuid is None:
|
if embedding_uuid is None:
|
||||||
raise InvalidConfigError(
|
raise InvalidConfigError(
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class EndNode(BaseNode):
|
|||||||
引用的节点 ID 列表
|
引用的节点 ID 列表
|
||||||
"""
|
"""
|
||||||
# 匹配 {{node_id.xxx}} 格式
|
# 匹配 {{node_id.xxx}} 格式
|
||||||
pattern = r'\{\{([a-zA-Z0-9_]+)\.[a-zA-Z0-9_]+\}\}'
|
pattern = r'\{\{([a-zA-Z0-9_-]+)\.[a-zA-Z0-9_]+\}\}'
|
||||||
matches = re.findall(pattern, template)
|
matches = re.findall(pattern, template)
|
||||||
return list(set(matches)) # 去重
|
return list(set(matches)) # 去重
|
||||||
|
|
||||||
|
|||||||
@@ -51,8 +51,8 @@ class DataConfig(Base):
|
|||||||
# 自我反思配置
|
# 自我反思配置
|
||||||
enable_self_reflexion = Column(Boolean, default=False, comment="是否启用自我反思")
|
enable_self_reflexion = Column(Boolean, default=False, comment="是否启用自我反思")
|
||||||
iteration_period = Column(String, default="3", comment="反思迭代周期")
|
iteration_period = Column(String, default="3", comment="反思迭代周期")
|
||||||
reflexion_range = Column(String, default="retrieval", comment="反思范围:部分/全部")
|
reflexion_range = Column(String, default="partial", comment="反思范围:部分/全部")
|
||||||
baseline = Column(String, default="time", comment="基线:时间/事实/时间和事实")
|
baseline = Column(String, default="TIME", comment="基线:时间/事实/时间和事实")
|
||||||
reflection_model_id = Column(String, nullable=True, comment="反思模型ID")
|
reflection_model_id = Column(String, nullable=True, comment="反思模型ID")
|
||||||
memory_verify = Column(Boolean, default=True, comment="记忆验证")
|
memory_verify = Column(Boolean, default=True, comment="记忆验证")
|
||||||
quality_assessment = Column(Boolean, default=True, comment="质量评估")
|
quality_assessment = Column(Boolean, default=True, comment="质量评估")
|
||||||
|
|||||||
@@ -361,14 +361,14 @@ class MasterAgentRouter:
|
|||||||
"model_name": api_key_config.model_name
|
"model_name": api_key_config.model_name
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
temperature = 0.3 # 决策任务使用较低温度
|
# temperature = 0.3 # 决策任务使用较低温度
|
||||||
max_tokens = 1000
|
# max_tokens = 1000
|
||||||
if self.model_parameters:
|
# if self.model_parameters:
|
||||||
temperature = self.model_parameters.temperature
|
# temperature = self.model_parameters["temperature"]
|
||||||
max_tokens = self.model_parameters.max_tokens
|
# max_tokens = self.model_parameters["max_tokens"]
|
||||||
|
|
||||||
extra_params = {"temperature": temperature,
|
extra_params = {"temperature": self.model_parameters.get("temperature", 0.3),
|
||||||
"max_tokens":max_tokens
|
"max_tokens":self.model_parameters.get("max_tokens", 1000)
|
||||||
}
|
}
|
||||||
# 创建 RedBearModelConfig
|
# 创建 RedBearModelConfig
|
||||||
model_config = RedBearModelConfig(
|
model_config = RedBearModelConfig(
|
||||||
|
|||||||
@@ -290,12 +290,22 @@ class MultiAgentService:
|
|||||||
else:
|
else:
|
||||||
execution_config_data = convert_uuids_to_str(data.execution_config.model_dump())
|
execution_config_data = convert_uuids_to_str(data.execution_config.model_dump())
|
||||||
|
|
||||||
|
# 处理 model_parameters(可能是 None、字典或 Pydantic 模型)
|
||||||
|
if data.model_parameters is None:
|
||||||
|
model_parameters_data = None
|
||||||
|
elif isinstance(data.model_parameters, dict):
|
||||||
|
# 过滤掉值为 None 的字段
|
||||||
|
model_parameters_data = {k: v for k, v in data.model_parameters.items() if v is not None}
|
||||||
|
else:
|
||||||
|
# 过滤掉值为 None 的字段
|
||||||
|
model_parameters_data = {k: v for k, v in data.model_parameters.model_dump().items() if v is not None}
|
||||||
|
|
||||||
config = MultiAgentConfig(
|
config = MultiAgentConfig(
|
||||||
app_id=app_id,
|
app_id=app_id,
|
||||||
master_agent_id=data.master_agent_id,
|
master_agent_id=data.master_agent_id,
|
||||||
master_agent_name=data.master_agent_name,
|
master_agent_name=data.master_agent_name,
|
||||||
default_model_config_id=data.default_model_config_id,
|
default_model_config_id=data.default_model_config_id,
|
||||||
model_parameters=data.model_parameters,
|
model_parameters=model_parameters_data,
|
||||||
orchestration_mode=data.orchestration_mode,
|
orchestration_mode=data.orchestration_mode,
|
||||||
sub_agents=sub_agents_data,
|
sub_agents=sub_agents_data,
|
||||||
# routing_rules=routing_rules_data,
|
# routing_rules=routing_rules_data,
|
||||||
|
|||||||
@@ -41,8 +41,6 @@ nodes:
|
|||||||
- 使用友好、礼貌的语气
|
- 使用友好、礼貌的语气
|
||||||
- 适当使用格式化(如列表、段落)提高可读性
|
- 适当使用格式化(如列表、段落)提高可读性
|
||||||
|
|
||||||
- role: user
|
|
||||||
content: "{{sys.message}}"
|
|
||||||
|
|
||||||
model_id: null
|
model_id: null
|
||||||
temperature: 0.7
|
temperature: 0.7
|
||||||
|
|||||||
Reference in New Issue
Block a user