diff --git a/api/app/core/rag/app/naive.py b/api/app/core/rag/app/naive.py index 72272347..93b96843 100644 --- a/api/app/core/rag/app/naive.py +++ b/api/app/core/rag/app/naive.py @@ -675,7 +675,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, parser_config["chunk_token_num"] = 0 else: sections = [(_, "") for _ in excel_parser(binary) if _] - parser_config["chunk_token_num"] = 12800 + parser_config["chunk_token_num"] = 0 elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/api/app/core/rag/llm/embedding_model.py b/api/app/core/rag/llm/embedding_model.py index 22e35a15..59210054 100644 --- a/api/app/core/rag/llm/embedding_model.py +++ b/api/app/core/rag/llm/embedding_model.py @@ -50,7 +50,9 @@ class OpenAIEmbed(Base): def encode(self, texts: list): # OpenAI requires batch size <=16 batch_size = 16 - texts = [truncate(t, 8191) for t in texts] + # Use 8000 instead of 8191 to leave safety margin for tokenizer differences + # between cl100k_base (used by truncate) and the actual embedding model + texts = [truncate(t, 8000) for t in texts] ress = [] total_tokens = 0 for i in range(0, len(texts), batch_size): @@ -63,7 +65,7 @@ class OpenAIEmbed(Base): return np.array(ress), total_tokens def encode_queries(self, text): - res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True}) + res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True}) return np.array(res.data[0].embedding), self.total_token_count(res) @@ -79,6 +81,7 @@ class LocalAIEmbed(Base): def encode(self, texts: list): batch_size = 16 + texts = [truncate(t, 8000) for t in texts] ress = [] for i in range(0, len(texts), batch_size): res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name) @@ -173,6 +176,7 @@ class XinferenceEmbed(Base): def encode(self, texts: list): batch_size = 16 + texts = [truncate(t, 8000) for t in texts] ress = [] total_tokens = 0 for i in range(0, len(texts), batch_size): @@ -188,7 +192,7 @@ class XinferenceEmbed(Base): def encode_queries(self, text): res = None try: - res = self.client.embeddings.create(input=[text], model=self.model_name) + res = self.client.embeddings.create(input=[truncate(text, 8000)], model=self.model_name) return np.array(res.data[0].embedding), self.total_token_count(res) except Exception as _e: log_exception(_e, res) diff --git a/api/app/core/workflow/nodes/http_request/config.py b/api/app/core/workflow/nodes/http_request/config.py index 3473f666..72474436 100644 --- a/api/app/core/workflow/nodes/http_request/config.py +++ b/api/app/core/workflow/nodes/http_request/config.py @@ -72,7 +72,8 @@ class HttpContentTypeConfig(BaseModel): @classmethod def validate_data(cls, v, info): content_type = info.data.get("content_type") - if content_type == HttpContentType.FROM_DATA and not isinstance(v, list): + if content_type == HttpContentType.FROM_DATA and ( + not isinstance(v, list) or not all(isinstance(item, HttpFormData) for item in v)): raise ValueError("When content_type is 'form-data', data must be a list of HttpFormData") elif content_type in [HttpContentType.JSON] and not isinstance(v, str): raise ValueError("When content_type is JSON, data must be of type str")