From f485398768af135ba32860ff60444b54775d6471 Mon Sep 17 00:00:00 2001 From: Timebomb2018 <18868801967@163.com> Date: Fri, 27 Mar 2026 19:13:51 +0800 Subject: [PATCH] fix(workflow): Parsing of DOC files --- api/app/models/conversation_model.py | 6 -- api/app/services/conversation_service.py | 8 +-- api/app/services/multimodal_service.py | 85 ++++++++++++++++++------ 3 files changed, 70 insertions(+), 29 deletions(-) diff --git a/api/app/models/conversation_model.py b/api/app/models/conversation_model.py index 4ae9034d..4011247f 100644 --- a/api/app/models/conversation_model.py +++ b/api/app/models/conversation_model.py @@ -57,12 +57,6 @@ class Conversation(Base): workspace = relationship("Workspace") messages = relationship("Message", back_populates="conversation", cascade="all, delete-orphan") - @property - def is_first_user_message(self): - """判断当前是否是用户的第一条消息(无视开场白)""" - user_message_count = sum(1 for msg in self.messages if msg.role == "user") - return user_message_count == 1 - class ConversationDetail(Base): __tablename__ = "conversation_details" diff --git a/api/app/services/conversation_service.py b/api/app/services/conversation_service.py index ecf316d9..bd7f7496 100644 --- a/api/app/services/conversation_service.py +++ b/api/app/services/conversation_service.py @@ -214,14 +214,14 @@ class ConversationService: conversation.message_count += 1 - self.db.commit() - self.db.refresh(message) - - if conversation.is_first_user_message and role == "user": + if conversation.message_count <= 2 and role == "user": conversation.title = ( content[:50] + ("..." if len(content) > 50 else "") ) + self.db.commit() + self.db.refresh(message) + logger.info( "Message added successfully", extra={ diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py index 4cf3d89d..f854e987 100644 --- a/api/app/services/multimodal_service.py +++ b/api/app/services/multimodal_service.py @@ -12,6 +12,9 @@ import base64 import csv import io import json +import re +import olefile +import struct import zipfile from abc import ABC, abstractmethod from typing import List, Dict, Any, Optional @@ -602,31 +605,75 @@ class MultimodalService: try: word_file = io.BytesIO(file_content) doc = Document(word_file) - return '\n'.join(p.text for p in doc.paragraphs) + text_lines = [] + for p in doc.paragraphs: + text = p.text.strip() + if text: + text_lines.append(text) + + for table in doc.tables: + for row in table.rows: + for cell in row.cells: + text = cell.text.strip() + if text: + text_lines.append(text) + + full_text = "\n".join(text_lines) + return full_text.strip() or "[docx 文件无文本内容]" except Exception as e: - logger.error(f"提取 docx 文本失败: {e}") + logger.error(f"提取 docx 文本失败: {str(e)}", exc_info=True) return f"[docx 提取失败: {str(e)}]" - # 旧版 .doc(OLE2 格式) + # 旧版 .doc(OLE2/CFB 格式),按 Word Binary Format 规范解析 piece table try: - import olefile ole = olefile.OleFileIO(io.BytesIO(file_content)) - if not ole.exists('WordDocument'): - return "[doc 提取失败: 未找到 WordDocument 流]" - # 读取 WordDocument 流,提取可见 ASCII/Unicode 文本 - stream = ole.openstream('WordDocument').read() - # Word Binary Format: 文本在流中以 UTF-16-LE 编码存储 - # 简单提取:过滤出可打印字符段 - try: - text = stream.decode('utf-16-le', errors='ignore') - except Exception: - text = stream.decode('latin-1', errors='ignore') - # 过滤控制字符,保留可打印内容 - import re - text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) - text = re.sub(r' +', ' ', text).strip() + word_stream = ole.openstream('WordDocument').read() + + # FIB offset 0xA bit9 决定使用 0Table 还是 1Table + fib_flags = struct.unpack_from('