Merge branch 'feature/20251219_lxc' into develop

This commit is contained in:
lixiangcheng1
2025-12-30 19:32:06 +08:00

View File

@@ -42,35 +42,31 @@ class RAGExcelParser:
file_like_object.seek(0) file_like_object.seek(0)
try: try:
dfs = pd.read_excel(file_like_object, sheet_name=None) dfs = pd.read_excel(file_like_object, sheet_name=None)
if isinstance(dfs, dict):
dfs = next(iter(dfs.values()))
return RAGExcelParser._dataframe_to_workbook(dfs) return RAGExcelParser._dataframe_to_workbook(dfs)
except Exception as ex: except Exception as ex:
logging.info(f"pandas with default engine load error: {ex}, try calamine instead") logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
file_like_object.seek(0) file_like_object.seek(0)
df = pd.read_excel(file_like_object, engine="calamine") df = pd.read_excel(file_like_object, engine="calamine")
print(df)
return RAGExcelParser._dataframe_to_workbook(df) return RAGExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas: except Exception as e_pandas:
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@staticmethod
def _clean_dataframe(df):
    """Replace every match of ILLEGAL_CHARACTERS_RE in string cells with a space.

    Handles a single DataFrame or a dict of DataFrames (multiple sheets),
    so the caller does not need to know which of the two shapes
    ``pd.read_excel`` produced.

    Args:
        df: A ``pd.DataFrame``, or a ``dict`` whose values are DataFrames
            (or further dicts of DataFrames).

    Returns:
        A cleaned DataFrame when given a DataFrame; when given a dict, a
        new dict with the same keys and cleaned values.

    Raises:
        ValueError: If ``df`` (or any dict value) is neither a DataFrame
            nor a dict.
    """

    def clean_string(s):
        # Non-string cell values (numbers, dates, NaN, ...) pass through untouched.
        if isinstance(s, str):
            return ILLEGAL_CHARACTERS_RE.sub(" ", s)
        return s

    def clean(obj):
        if isinstance(obj, pd.DataFrame):
            # Column-wise Series.map applies clean_string to every cell.
            return obj.apply(lambda col: col.map(clean_string))
        if isinstance(obj, dict):
            # A dict with one sheet (or none) is not routed to
            # _dataframes_to_workbook by the ``len(df) > 1`` guard, so it
            # must be accepted here instead of failing on ``dict.apply``.
            return {sheet: clean(sheet_df) for sheet, sheet_df in obj.items()}
        raise ValueError(f"Unsupported type for cleaning: {type(obj)}")

    return clean(df)
@staticmethod @staticmethod
def _dataframe_to_workbook(df): def _dataframe_to_workbook(df):
# if contains multiple sheets use _dataframes_to_workbook # if contains multiple sheets use _dataframes_to_workbook
# if isinstance(df, dict) and len(df) > 1: if isinstance(df, dict) and len(df) > 1:
if isinstance(df, dict):
return RAGExcelParser._dataframes_to_workbook(df) return RAGExcelParser._dataframes_to_workbook(df)
df = RAGExcelParser._clean_dataframe(df) df = RAGExcelParser._clean_dataframe(df)