Merge pull request #639 from SuanmoSuanyangTechnology/fix/features_028
fix(app features)
This commit is contained in:
@@ -149,15 +149,15 @@ class FileUploadConfig(BaseModel):
|
|||||||
)
|
)
|
||||||
# 通用文件:PDF/DOCX/XLSX/TXT/CSV/JSON,最大 100MB
|
# Generic documents: PDF/DOCX/DOC/XLSX/XLS/TXT/CSV/JSON/MD, default size limit 50 MB
|
||||||
document_enabled: bool = Field(default=False)
|
document_enabled: bool = Field(default=False)
|
||||||
document_max_size_mb: int = Field(default=100)
|
document_max_size_mb: int = Field(default=50)
|
||||||
document_allowed_extensions: List[str] = Field(
|
document_allowed_extensions: List[str] = Field(
|
||||||
default=["pdf", "docx", "doc", "xlsx", "xls", "txt", "csv", "json", "md"]
|
default=["pdf", "docx", "doc", "xlsx", "xls", "txt", "csv", "json", "md"]
|
||||||
)
|
)
|
||||||
# 视频文件:MP4/MOV/AVI/WebM,最大 500MB
|
# Video files: only MP4 is allowed by default, default size limit 50 MB
|
||||||
video_enabled: bool = Field(default=False)
|
video_enabled: bool = Field(default=False)
|
||||||
video_max_size_mb: int = Field(default=500)
|
video_max_size_mb: int = Field(default=50)
|
||||||
video_allowed_extensions: List[str] = Field(
|
video_allowed_extensions: List[str] = Field(
|
||||||
default=["mp4", "mov"]
|
default=["mp4"]
|
||||||
)
|
)
|
||||||
# 最大文件数量
|
# 最大文件数量
|
||||||
max_file_count: int = Field(default=5, ge=1)
|
max_file_count: int = Field(default=5, ge=1)
|
||||||
|
|||||||
@@ -622,30 +622,71 @@ class MultimodalService:
|
|||||||
|
|
||||||
@staticmethod
async def _extract_word_text(file_content: bytes) -> str:
    """Extract plain text from a Word document (.docx or legacy .doc).

    The format is sniffed from the leading bytes rather than from a file
    name: content starting with ``PK`` is treated as a ZIP-based .docx and
    loaded through ``Document``; everything else is treated as a legacy
    OLE2 .doc and read through ``olefile``.

    Args:
        file_content: Raw bytes of the uploaded file.

    Returns:
        The extracted text. This method never raises: on failure it logs
        the error and returns a bracketed placeholder message instead.
    """
    import re

    # .docx is a ZIP container, so it starts with the "PK" signature.
    if file_content[:2] == b'PK':
        try:
            doc = Document(io.BytesIO(file_content))
            return '\n'.join(p.text for p in doc.paragraphs)
        except Exception as e:
            logger.error(f"提取 docx 文本失败: {e}")
            return f"[docx 提取失败: {str(e)}]"

    # Legacy .doc (OLE2 compound file).
    try:
        # Imported lazily, as in the original, so a missing package is
        # reported through the placeholder below instead of at import time.
        import olefile

        ole = olefile.OleFileIO(io.BytesIO(file_content))
        try:
            if not ole.exists('WordDocument'):
                return "[doc 提取失败: 未找到 WordDocument 流]"
            stream = ole.openstream('WordDocument').read()
        finally:
            # Release the OLE handle on every path, including the early
            # return above and any read error.
            ole.close()

        # Best-effort extraction: the whole stream is decoded as UTF-16-LE.
        # errors='ignore' means decode() cannot raise, so no fallback
        # encoding is needed.
        # NOTE(review): the stream also holds binary structures, so the
        # result may contain noise characters; a faithful extraction would
        # have to parse the Word binary format - confirm this is acceptable.
        text = stream.decode('utf-16-le', errors='ignore')
        # Drop control characters but keep tab, newline and carriage return.
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
        return re.sub(r' +', ' ', text).strip()
    except Exception as e:
        logger.error(f"提取 doc 文本失败: {e}")
        return f"[doc 提取失败: {str(e)}]"
|
||||||
|
|
||||||
@staticmethod
async def _extract_xlsx_text(file_content: bytes) -> str:
    """Extract cell text from an Excel workbook (.xlsx or legacy .xls).

    The format is sniffed from the leading bytes: content starting with
    ``PK`` is treated as a ZIP-based .xlsx and read with ``openpyxl``;
    everything else is treated as a legacy .xls and read with ``xlrd``.

    Each sheet is emitted as a ``[Sheet: <name>]`` header line followed by
    one line per row, with cells separated by tabs.

    Args:
        file_content: Raw bytes of the uploaded file.

    Returns:
        The extracted text. This method never raises: on failure it logs
        the error and returns a bracketed placeholder message instead.
    """
    # .xlsx is a ZIP container, so it starts with the "PK" signature.
    if file_content[:2] == b'PK':
        try:
            wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True, data_only=True)
            try:
                parts = []
                for sheet in wb.worksheets:
                    parts.append(f"[Sheet: {sheet.title}]")
                    for row in sheet.iter_rows(values_only=True):
                        parts.append('\t'.join('' if v is None else str(v) for v in row))
                return '\n'.join(parts)
            finally:
                # Read-only workbooks keep the underlying archive open
                # until they are closed explicitly.
                wb.close()
        except Exception as e:
            logger.error(f"提取 xlsx 文本失败: {e}")
            return f"[xlsx 提取失败: {str(e)}]"

    # Legacy .xls (OLE2/BIFF).
    try:
        # Imported lazily, as in the original, so a missing package is
        # reported through the placeholder below instead of at import time.
        import xlrd

        def _cell_text(value) -> str:
            # xlrd returns every numeric cell as a float; render whole
            # numbers without the trailing ".0" so the output matches the
            # xlsx branch.
            if isinstance(value, float) and value.is_integer():
                return str(int(value))
            return str(value)

        wb = xlrd.open_workbook(file_contents=file_content)
        parts = []
        for sheet in wb.sheets():
            parts.append(f"[Sheet: {sheet.name}]")
            for row_idx in range(sheet.nrows):
                parts.append('\t'.join(_cell_text(v) for v in sheet.row_values(row_idx)))
        return '\n'.join(parts)
    except Exception as e:
        logger.error(f"提取 xls 文本失败: {e}")
        return f"[xls 提取失败: {str(e)}]"
|
||||||
|
|
||||||
async def _extract_csv_text(self, file_content: bytes) -> str:
|
async def _extract_csv_text(self, file_content: bytes) -> str:
|
||||||
"""提取 CSV 文本"""
|
"""提取 CSV 文本"""
|
||||||
|
|||||||
Reference in New Issue
Block a user