fix(http-request,embedding,naive): tighten form-data validation, reduce truncation length to 8000, and disable chunking for Excel

The form-data validation now ensures all items in the list are of type HttpFormData. Truncation length for embedding inputs is reduced from 8191 to 8000 to accommodate tokenizer differences and avoid overflow. Excel parsing now disables chunking by setting chunk_token_num to 0, aligning with intended behavior for structured file ingestion.
2026-04-14 16:14:01 +08:00
parent 0965008210
commit e3265e4ba3
3 changed files with 10 additions and 5 deletions
--- a/api/app/core/rag/app/naive.py
+++ b/api/app/core/rag/app/naive.py
@@ -675,7 +675,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            parser_config["chunk_token_num"] = 0
        else:
            sections = [(_, "") for _ in excel_parser(binary) if _]
-        parser_config["chunk_token_num"] = 12800
+        parser_config["chunk_token_num"] = 0

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
--- a/api/app/core/rag/llm/embedding_model.py
+++ b/api/app/core/rag/llm/embedding_model.py
@@ -50,7 +50,9 @@ class OpenAIEmbed(Base):
    def encode(self, texts: list):
        # OpenAI requires batch size <=16
        batch_size = 16
-        texts = [truncate(t, 8191) for t in texts]
+        # Use 8000 instead of 8191 to leave safety margin for tokenizer differences
+        # between cl100k_base (used by truncate) and the actual embedding model
+        texts = [truncate(t, 8000) for t in texts]
        ress = []
        total_tokens = 0
        for i in range(0, len(texts), batch_size):
@@ -63,7 +65,7 @@ class OpenAIEmbed(Base):
        return np.array(ress), total_tokens

    def encode_queries(self, text):
-        res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
+        res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
        return np.array(res.data[0].embedding), self.total_token_count(res)


@@ -79,6 +81,7 @@ class LocalAIEmbed(Base):

    def encode(self, texts: list):
        batch_size = 16
+        texts = [truncate(t, 8000) for t in texts]
        ress = []
        for i in range(0, len(texts), batch_size):
            res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name)
@@ -173,6 +176,7 @@ class XinferenceEmbed(Base):

    def encode(self, texts: list):
        batch_size = 16
+        texts = [truncate(t, 8000) for t in texts]
        ress = []
        total_tokens = 0
        for i in range(0, len(texts), batch_size):
@@ -188,7 +192,7 @@ class XinferenceEmbed(Base):
    def encode_queries(self, text):
        res = None
        try:
-            res = self.client.embeddings.create(input=[text], model=self.model_name)
+            res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name)
            return np.array(res.data[0].embedding), self.total_token_count(res)
        except Exception as _e:
            log_exception(_e, res)