fix(app):

1.The end users are still bound to the app.
2. Multi-modal file support includes xlsx, csv, and json.
3. The file routing protocol is consistent with the page routing.
This commit is contained in:
Timebomb2018
2026-03-18 10:46:55 +08:00
parent dfcc85a466
commit 7b99a32a1e
6 changed files with 87 additions and 3 deletions

View File

@@ -14,9 +14,13 @@ import uuid
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
import csv
import json
import PyPDF2
import httpx
import magic
import openpyxl
from docx import Document
from sqlalchemy.orm import Session
@@ -39,6 +43,13 @@ DOC_MIME = [
'application/msword',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
]
XLSX_MIME = [
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.ms-excel',
'application/zip'
]
CSV_MIME = ['text/csv', 'application/csv']
JSON_MIME = ['application/json']
class MultimodalFormatStrategy(ABC):
@@ -577,6 +588,12 @@ class MultimodalService:
return await self._extract_pdf_text(file_content)
elif file_mime_type in DOC_MIME:
return await self._extract_word_text(file_content)
elif file_mime_type in XLSX_MIME:
return await self._extract_xlsx_text(file_content)
elif file_mime_type in CSV_MIME:
return await self._extract_csv_text(file_content)
elif file_mime_type in JSON_MIME:
return await self._extract_json_text(file_content)
else:
return f"[Unsupported file type: {file_mime_type}]"
except Exception as e:
@@ -602,7 +619,6 @@ class MultimodalService:
async def _extract_word_text(file_content: bytes) -> str:
"""提取 Word 文档文本"""
try:
# 使用 BytesIO 读取 Word 文档
word_file = io.BytesIO(file_content)
doc = Document(word_file)
text_parts = [paragraph.text for paragraph in doc.paragraphs]
@@ -611,6 +627,42 @@ class MultimodalService:
logger.error(f"提取 Word 文本失败: {e}")
return f"[Word 提取失败: {str(e)}]"
@staticmethod
async def _extract_xlsx_text(file_content: bytes) -> str:
"""提取 Excel 文本"""
try:
wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True, data_only=True)
parts = []
for sheet in wb.worksheets:
parts.append(f"[Sheet: {sheet.title}]")
for row in sheet.iter_rows(values_only=True):
parts.append('\t'.join('' if v is None else str(v) for v in row))
return '\n'.join(parts)
except Exception as e:
logger.error(f"提取 Excel 文本失败: {e}")
return f"[Excel 提取失败: {str(e)}]"
@staticmethod
async def _extract_csv_text(file_content: bytes) -> str:
"""提取 CSV 文本"""
try:
text = file_content.decode('utf-8-sig')
reader = csv.reader(io.StringIO(text))
return '\n'.join('\t'.join(row) for row in reader)
except Exception as e:
logger.error(f"提取 CSV 文本失败: {e}")
return f"[CSV 提取失败: {str(e)}]"
@staticmethod
async def _extract_json_text(file_content: bytes) -> str:
"""提取 JSON 文本"""
try:
data = json.loads(file_content.decode('utf-8'))
return json.dumps(data, ensure_ascii=False, indent=2)
except Exception as e:
logger.error(f"提取 JSON 文本失败: {e}")
return f"[JSON 提取失败: {str(e)}]"
def get_multimodal_service(db: Session) -> MultimodalService:
"""获取多模态服务实例(依赖注入)"""