From a51e34852c3201da1d8c134e58adface43606507 Mon Sep 17 00:00:00 2001
From: Timebomb2018 <18868801967@163.com>
Date: Thu, 19 Mar 2026 21:41:45 +0800
Subject: [PATCH] fix(app features): Support for xls and doc files

---
 api/app/schemas/app_schema.py          |  6 +--
 api/app/services/multimodal_service.py | 71 ++++++++++++++++++++------
 2 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/api/app/schemas/app_schema.py b/api/app/schemas/app_schema.py
index 8ca43a76..1582d862 100644
--- a/api/app/schemas/app_schema.py
+++ b/api/app/schemas/app_schema.py
@@ -149,15 +149,15 @@ class FileUploadConfig(BaseModel):
     )
     # 通用文件：PDF/DOCX/XLSX/TXT/CSV/JSON，最大 100MB
     document_enabled: bool = Field(default=False)
-    document_max_size_mb: int = Field(default=100)
+    document_max_size_mb: int = Field(default=50)
     document_allowed_extensions: List[str] = Field(
         default=["pdf", "docx", "doc", "xlsx", "xls", "txt", "csv", "json", "md"]
     )
     # 视频文件：MP4/MOV/AVI/WebM，最大 500MB
     video_enabled: bool = Field(default=False)
-    video_max_size_mb: int = Field(default=500)
+    video_max_size_mb: int = Field(default=50)
     video_allowed_extensions: List[str] = Field(
-        default=["mp4", "mov"]
+        default=["mp4"]
     )
     # 最大文件数量
     max_file_count: int = Field(default=5, ge=1)
diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py
index e7fb3de3..d8cca67e 100644
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -622,30 +622,71 @@ class MultimodalService:
 
     @staticmethod
     async def _extract_word_text(file_content: bytes) -> str:
-        """提取 Word 文档文本"""
+        """提取 Word 文档文本（支持 .docx 和旧版 .doc）"""
+        # 先尝试 docx（ZIP 格式）
+        if file_content[:2] == b'PK':
+            try:
+                word_file = io.BytesIO(file_content)
+                doc = Document(word_file)
+                return '\n'.join(p.text for p in doc.paragraphs)
+            except Exception as e:
+                logger.error(f"提取 docx 文本失败: {e}")
+                return f"[docx 提取失败: {str(e)}]"
+
+        # 旧版 .doc（OLE2 格式）
         try:
-            word_file = io.BytesIO(file_content)
-            doc = Document(word_file)
-            text_parts = [paragraph.text for paragraph in doc.paragraphs]
-            return '\n'.join(text_parts)
+            import olefile
+            ole = olefile.OleFileIO(io.BytesIO(file_content))
+            if not ole.exists('WordDocument'):
+                return "[doc 提取失败: 未找到 WordDocument 流]"
+            # 读取 WordDocument 流，提取可见 ASCII/Unicode 文本
+            stream = ole.openstream('WordDocument').read()
+            # Word Binary Format: 文本在流中以 UTF-16-LE 编码存储
+            # 简单提取：过滤出可打印字符段
+            try:
+                text = stream.decode('utf-16-le', errors='ignore')
+            except Exception:
+                text = stream.decode('latin-1', errors='ignore')
+            # 过滤控制字符，保留可打印内容
+            import re
+            text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+            text = re.sub(r' +', ' ', text).strip()
+            ole.close()
+            return text
         except Exception as e:
-            logger.error(f"提取 Word 文本失败: {e}")
-            return f"[Word 提取失败: {str(e)}]"
+            logger.error(f"提取 doc 文本失败: {e}")
+            return f"[doc 提取失败: {str(e)}]"
 
     @staticmethod
     async def _extract_xlsx_text(file_content: bytes) -> str:
-        """提取 Excel 文本"""
+        """提取 Excel 文本（支持 .xlsx 和旧版 .xls）"""
+        # xlsx（ZIP 格式）
+        if file_content[:2] == b'PK':
+            try:
+                wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True, data_only=True)
+                parts = []
+                for sheet in wb.worksheets:
+                    parts.append(f"[Sheet: {sheet.title}]")
+                    for row in sheet.iter_rows(values_only=True):
+                        parts.append('\t'.join('' if v is None else str(v) for v in row))
+                return '\n'.join(parts)
+            except Exception as e:
+                logger.error(f"提取 xlsx 文本失败: {e}")
+                return f"[xlsx 提取失败: {str(e)}]"
+
+        # xls（OLE2/BIFF 格式）
         try:
-            wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True, data_only=True)
+            import xlrd
+            wb = xlrd.open_workbook(file_contents=file_content)
             parts = []
-            for sheet in wb.worksheets:
-                parts.append(f"[Sheet: {sheet.title}]")
-                for row in sheet.iter_rows(values_only=True):
-                    parts.append('\t'.join('' if v is None else str(v) for v in row))
+            for sheet in wb.sheets():
+                parts.append(f"[Sheet: {sheet.name}]")
+                for row_idx in range(sheet.nrows):
+                    parts.append('\t'.join(str(sheet.cell_value(row_idx, col)) for col in range(sheet.ncols)))
             return '\n'.join(parts)
         except Exception as e:
-            logger.error(f"提取 Excel 文本失败: {e}")
-            return f"[Excel 提取失败: {str(e)}]"
+            logger.error(f"提取 xls 文本失败: {e}")
+            return f"[xls 提取失败: {str(e)}]"
 
     async def _extract_csv_text(self, file_content: bytes) -> str:
         """提取 CSV 文本"""