From 99c501f18813e1b9831df4843f18f80e36d85727 Mon Sep 17 00:00:00 2001 From: yujiangping Date: Thu, 25 Dec 2025 17:39:01 +0800 Subject: [PATCH] feat(knowledgeBase): add media file validation and PDF enhancement method selection - Add i18n translations for file size and duration validation errors in English and Chinese - Implement media file validation with 256MB size limit and 150-second duration limit - Add support for audio and video file formats (mp3, mp4, mov, wav) in dataset creation - Add checkMediaDuration helper function to validate media file duration using HTML5 media API - Add PDF enhancement method selection dropdown with options (DeepDoc, MinerU, TextLN) - Change default PDF enhancement setting from disabled to enabled - Update file type array to include media formats - Add error messaging for file size and duration validation failures - Improve UI spacing for file parsing settings section --- web/src/i18n/en.ts | 5 ++ web/src/i18n/zh.ts | 5 ++ .../[knowledgeBaseId]/CreateDataset.tsx | 86 ++++++++++++++++--- .../[knowledgeBaseId]/Private.tsx | 33 +++---- 4 files changed, 103 insertions(+), 26 deletions(-) diff --git a/web/src/i18n/en.ts b/web/src/i18n/en.ts index e05be03f..2452465a 100644 --- a/web/src/i18n/en.ts +++ b/web/src/i18n/en.ts @@ -615,6 +615,11 @@ export const en = { qaMode: 'QA Mode', fileParsingSettings: 'File Parsing Settings', pdfEnhancementAnalysis: 'PDF Enhancement Analysis', + fileSizeExceeds: 'File size exceeds the limit', + sizeLimitError: 'The file size exceeds the limit. The maximum supported size is 256MB. The current file size is', + fileDurationExceeds: 'File duration exceeds the limit', + fileDurationLimitError: 'The duration of the media file exceeds the limit. The maximum supported duration is 150 seconds. Current duration', + unableReadFile:'Unable to read the information of the media file. Please check the file format.', createForm:{ name: 'Name', embedding_id: 'Embedding', diff --git a/web/src/i18n/zh.ts b/web/src/i18n/zh.ts index c1b6b448..6f577177 100644 --- a/web/src/i18n/zh.ts +++ b/web/src/i18n/zh.ts @@ -239,6 +239,11 @@ export const zh = { qaMode: '问答模式', fileParsingSettings: '文件解析设置', pdfEnhancementAnalysis: 'PDF增强解析', + fileSizeExceeds: '文件大小超过限制', + sizeLimitError: '文件大小超过限制,最大支持256MB,当前文件大小', + fileDurationExceeds:'文件时长超过限制', + fileDurationLimitError: '媒体文件时长超过限制,最大支持150秒,当前时长', + unableReadFile:'无法读取媒体文件信息,请检查文件格式', createForm: { name: '名称', embedding_id: '嵌入模型', diff --git a/web/src/views/KnowledgeBase/[knowledgeBaseId]/CreateDataset.tsx b/web/src/views/KnowledgeBase/[knowledgeBaseId]/CreateDataset.tsx index 150d32b8..34a91850 100644 --- a/web/src/views/KnowledgeBase/[knowledgeBaseId]/CreateDataset.tsx +++ b/web/src/views/KnowledgeBase/[knowledgeBaseId]/CreateDataset.tsx @@ -1,5 +1,5 @@ import { useMemo,useRef, useState, useEffect } from 'react'; -import { Button, Flex, Radio, Steps, Modal, Input, Spin, message, Checkbox} from 'antd'; +import { Button, Flex, Radio, Steps, Modal, Input, Spin, message, Checkbox, Select} from 'antd'; import { useTranslation } from 'react-i18next'; import { useLocation, useNavigate, useParams } from 'react-router-dom'; import Table, { type TableRef } from '@/components/Table' @@ -81,9 +81,10 @@ const CreateDataset = () => { const [blockSize, setBlockSize] = useState(130); const [processingMethod, setProcessingMethod] = useState('directBlock'); const [parameterSettings, setParameterSettings] = useState('defaultSettings'); - const [pdfEnhancementEnabled, setPdfEnhancementEnabled] = useState(false); + const [pdfEnhancementEnabled, setPdfEnhancementEnabled] = useState(true); + const [pdfEnhancementMethod, setPdfEnhancementMethod] = useState('deepdoc'); const [messageApi, contextHolder] = message.useMessage(); - const fileType = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'csv', 'md', 'htm', 'html', 'json', 'ppt', 'pptx', 'txt','png','jpg'] + const fileType = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'csv', 'md', 'htm', 'html', 'json', 'ppt', 'pptx', 'txt','png','jpg','mp3','mp4','mov','wav'] const steps = useMemo( () => [ { title: t('knowledgeBase.selectFile') }, @@ -119,7 +120,7 @@ const CreateDataset = () => { const params = { progress: 0, parser_config: { - layout_recognize:'DeepDOC', + layout_recognize: pdfEnhancementMethod || 'DeepDOC', delimiter: delimiter, chunk_token_num: blockSize, auto_questions: processingMethod === 'directBlock' ? 0 : 1, @@ -244,11 +245,61 @@ const CreateDataset = () => { ), }, ]; - // 上传文件 - const handleUpload = (options: UploadRequestOption) => { - const { file, onSuccess, onError, onProgress, filename = 'file' } = options; - const formData = new FormData(); + // 检查媒体文件时长的辅助函数 + const checkMediaDuration = (file: File): Promise => { + return new Promise((resolve, reject) => { + const url = URL.createObjectURL(file); + const media = document.createElement(file.type.startsWith('video/') ? 'video' : 'audio'); + + media.onloadedmetadata = () => { + URL.revokeObjectURL(url); + resolve(media.duration); + }; + + media.onerror = () => { + URL.revokeObjectURL(url); + reject(new Error('无法读取媒体文件')); + }; + + media.src = url; + }); + }; + // 上传文件 + const handleUpload = async (options: UploadRequestOption) => { + const { file, onSuccess, onError, onProgress, filename = 'file' } = options; + + // 获取文件扩展名 + const fileExtension = (file as File).name.split('.').pop()?.toLowerCase(); + const mediaExtensions = ['mp3', 'mp4', 'mov', 'wav']; + + // 如果是媒体文件,进行大小和时长检查 + if (fileExtension && mediaExtensions.includes(fileExtension)) { + const fileSizeInMB = (file as File).size / (1024 * 1024); + + // 检查文件大小(256MB限制) + if (fileSizeInMB > 256) { + messageApi.error(`${t('knowledgeBase.sizeLimitError')}:${fileSizeInMB.toFixed(2)}MB`); + onError?.(new Error(`${t('knowledgeBase.fileSizeExceeds')}`)); + return; + } + + try { + // 检查媒体时长(150秒限制) + const duration = await checkMediaDuration(file as File); + if (duration > 150) { + messageApi.error(`${t('knowledgeBase.fileDurationLimitError')}:${Math.round(duration)}秒`); + onError?.(new Error(`${t('knowledgeBase.fileDurationExceeds')}`)); + return; + } + } catch (error) { + messageApi.error(`${t('knowledgeBase.unableReadFile')}`); + onError?.(error as Error); + return; + } + } + + const formData = new FormData(); formData.append(filename, file as File); if (knowledgeBaseId) { formData.append('kb_id', knowledgeBaseId); @@ -469,7 +520,7 @@ const CreateDataset = () => { ))} )} -
+
{t('knowledgeBase.fileParsingSettings')}
@@ -477,7 +528,7 @@ const CreateDataset = () => { className={`rb:flex rb:items-center rb:w-full rb:border rb:rounded-lg rb:p-4 rb:cursor-pointer ${ pdfEnhancementEnabled ? 'rb:border-blue-500' : 'rb:border-gray-300' }`} - onClick={() => setPdfEnhancementEnabled(!pdfEnhancementEnabled)} + // onClick={() => setPdfEnhancementEnabled(!pdfEnhancementEnabled)} > { {t('knowledgeBase.pdfEnhancementAnalysis')} + {pdfEnhancementEnabled && ( +
+