fix(http-request,embedding,naive): tighten form-data validation, reduce truncation length to 8000, and disable chunking for Excel

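The form-data validation now checks not only that `data` is a list but also that every item in it is an HttpFormData instance. The truncation length for OpenAI embedding inputs is reduced from 8191 to 8000 tokens, leaving a safety margin for differences between the cl100k_base tokenizer used by `truncate` and the tokenizer of the actual embedding model, so that inputs do not exceed the model's limit. The same 8000-token truncation is also added to the LocalAI and Xinference embedders (including Xinference query encoding), which previously sent text untruncated. For Excel files, the fallback parsing branch now sets chunk_token_num to 0 instead of 12800, matching the other Excel branch and disabling chunking, which is the intended behavior for structured file ingestion.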
This commit is contained in:
Timebomb2018
2026-04-14 16:14:01 +08:00
parent 0965008210
commit e3265e4ba3
3 changed files with 10 additions and 5 deletions

View File

@@ -675,7 +675,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config["chunk_token_num"] = 0
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
parser_config["chunk_token_num"] = 0
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")

View File

@@ -50,7 +50,9 @@ class OpenAIEmbed(Base):
def encode(self, texts: list):
# OpenAI requires batch size <=16
batch_size = 16
texts = [truncate(t, 8191) for t in texts]
# Use 8000 instead of 8191 to leave safety margin for tokenizer differences
# between cl100k_base (used by truncate) and the actual embedding model
texts = [truncate(t, 8000) for t in texts]
ress = []
total_tokens = 0
for i in range(0, len(texts), batch_size):
@@ -63,7 +65,7 @@ class OpenAIEmbed(Base):
return np.array(ress), total_tokens
def encode_queries(self, text):
res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
return np.array(res.data[0].embedding), self.total_token_count(res)
@@ -79,6 +81,7 @@ class LocalAIEmbed(Base):
def encode(self, texts: list):
batch_size = 16
texts = [truncate(t, 8000) for t in texts]
ress = []
for i in range(0, len(texts), batch_size):
res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name)
@@ -173,6 +176,7 @@ class XinferenceEmbed(Base):
def encode(self, texts: list):
batch_size = 16
texts = [truncate(t, 8000) for t in texts]
ress = []
total_tokens = 0
for i in range(0, len(texts), batch_size):
@@ -188,7 +192,7 @@ class XinferenceEmbed(Base):
def encode_queries(self, text):
res = None
try:
res = self.client.embeddings.create(input=[text], model=self.model_name)
res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name)
return np.array(res.data[0].embedding), self.total_token_count(res)
except Exception as _e:
log_exception(_e, res)

View File

@@ -72,7 +72,8 @@ class HttpContentTypeConfig(BaseModel):
@classmethod
def validate_data(cls, v, info):
content_type = info.data.get("content_type")
if content_type == HttpContentType.FROM_DATA and not isinstance(v, list):
if content_type == HttpContentType.FROM_DATA and (
not isinstance(v, list) or not all(isinstance(item, HttpFormData) for item in v)):
raise ValueError("When content_type is 'form-data', data must be a list of HttpFormData")
elif content_type in [HttpContentType.JSON] and not isinstance(v, str):
raise ValueError("When content_type is JSON, data must be of type str")