From f45ee9692efe44122784c5946c3f257951f7acce Mon Sep 17 00:00:00 2001
From: Chaser Huang
Date: Wed, 20 Aug 2025 15:50:15 -0400
Subject: [PATCH 1/3] Add ENV to configure api request delay for generic open ai embedding engine

---
 server/utils/EmbeddingEngines/genericOpenAi/index.js | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index e88538f4b45..5fa954b59ef 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -14,6 +14,9 @@ class GenericOpenAiEmbedder {
     });
     this.model = process.env.EMBEDDING_MODEL_PREF ?? null;
     this.embeddingMaxChunkLength = maximumChunkLength();
+    // Delay to apply after each embedding API call
+    // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
+    this.apiRequestDelay = process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -76,6 +79,10 @@ class GenericOpenAiEmbedder {
             });
         })
       );
+      // Apply delay before dispatching the next api request
+      if (this.apiRequestDelay) {
+        await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+      }
     }
 
     const { data = [], error = null } = await Promise.all(

From d19048ce875ff010afd5156d547c69e11ebee691 Mon Sep 17 00:00:00 2001
From: Chaser Huang
Date: Wed, 20 Aug 2025 16:54:18 -0400
Subject: [PATCH 2/3] yarn lint formatting

---
 server/utils/EmbeddingEngines/genericOpenAi/index.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index 5fa954b59ef..bac7afef7ba 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -16,7 +16,8 @@ class GenericOpenAiEmbedder {
     this.embeddingMaxChunkLength = maximumChunkLength();
     // Delay to apply after each embedding API call
     // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
-    this.apiRequestDelay = process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
+    this.apiRequestDelay =
+      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -81,7 +82,9 @@ class GenericOpenAiEmbedder {
       );
       // Apply delay before dispatching the next api request
       if (this.apiRequestDelay) {
-        await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+        await new Promise((resolve) =>
+          setTimeout(resolve, this.apiRequestDelay)
+        );
       }
     }
 

From 90398433a8404eb9a50361f3de35c4069ffaf8e1 Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Wed, 17 Sep 2025 20:51:39 -0700
Subject: [PATCH 3/3] refactor

---
 docker/.env.example                                  |   1 +
 server/.env.example                                  |   1 +
 .../EmbeddingEngines/genericOpenAi/index.js          | 115 +++++++++---------
 3 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/docker/.env.example b/docker/.env.example
index dca22fa0493..47556dc061b 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -190,6 +190,7 @@ GID='1000'
 # EMBEDDING_BASE_PATH='http://127.0.0.1:4000'
 # GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc'
 # GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS=500
+# GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS=1000
 
 # EMBEDDING_ENGINE='gemini'
 # GEMINI_EMBEDDING_API_KEY=

diff --git a/server/.env.example b/server/.env.example
index 0d3d1ecd0e0..0dc4e3c0d7c 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -188,6 +188,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
 # EMBEDDING_BASE_PATH='http://127.0.0.1:4000'
 # GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc'
 # GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS=500
+# GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS=1000
 
 # EMBEDDING_ENGINE='gemini'
 # GEMINI_EMBEDDING_API_KEY=

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index bac7afef7ba..a8a3ac1a584 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -14,10 +14,6 @@ class GenericOpenAiEmbedder {
     });
     this.model = process.env.EMBEDDING_MODEL_PREF ?? null;
     this.embeddingMaxChunkLength = maximumChunkLength();
-    // Delay to apply after each embedding API call
-    // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
-    this.apiRequestDelay =
-      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -32,6 +28,35 @@ class GenericOpenAiEmbedder {
     console.log(`\x1b[36m[GenericOpenAiEmbedder]\x1b[0m ${text}`, ...args);
   }
 
+  /**
+   * returns the `GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS` env variable as a number or null if the env variable is not set or is not a number.
+   * The minimum delay is 500ms.
+   *
+   * For some implementation this is necessary to avoid 429 errors due to rate limiting or
+   * hardware limitations where a single-threaded process is not able to handle the requests fast enough.
+   * @returns {number}
+   */
+  get apiRequestDelay() {
+    if (!("GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS" in process.env)) return null;
+    if (isNaN(Number(process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS)))
+      return null;
+    const delayTimeout = Number(
+      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS
+    );
+    if (delayTimeout < 500) return 500; // minimum delay of 500ms
+    return delayTimeout;
+  }
+
+  /**
+   * runs the delay if it is set and valid.
+   * @returns {Promise}
+   */
+  async runDelay() {
+    if (!this.apiRequestDelay) return;
+    this.log(`Delaying new batch request for ${this.apiRequestDelay}ms`);
+    await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+  }
+
   /**
    * returns the `GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS` env variable as a number
    * or 500 if the env variable is not set or is not a number.
@@ -56,68 +81,38 @@ class GenericOpenAiEmbedder {
 
   async embedChunks(textChunks = []) {
     // Because there is a hard POST limit on how many chunks can be sent at once to OpenAI (~8mb)
-    // we concurrently execute each max batch of text chunks possible.
+    // we sequentially execute each max batch of text chunks possible.
     // Refer to constructor maxConcurrentChunks for more info.
-    const embeddingRequests = [];
+    const allResults = [];
     for (const chunk of toChunks(textChunks, this.maxConcurrentChunks)) {
-      embeddingRequests.push(
-        new Promise((resolve) => {
-          this.openai.embeddings
-            .create({
-              model: this.model,
-              input: chunk,
-            })
-            .then((result) => {
-              resolve({ data: result?.data, error: null });
-            })
-            .catch((e) => {
-              e.type =
-                e?.response?.data?.error?.code ||
-                e?.response?.status ||
-                "failed_to_embed";
-              e.message = e?.response?.data?.error?.message || e.message;
-              resolve({ data: [], error: e });
-            });
-        })
-      );
-      // Apply delay before dispatching the next api request
-      if (this.apiRequestDelay) {
-        await new Promise((resolve) =>
-          setTimeout(resolve, this.apiRequestDelay)
-        );
-      }
-    }
+      const { data = [], error = null } = await new Promise((resolve) => {
+        this.openai.embeddings
+          .create({
+            model: this.model,
+            input: chunk,
+          })
+          .then((result) => resolve({ data: result?.data, error: null }))
+          .catch((e) => {
+            e.type =
+              e?.response?.data?.error?.code ||
+              e?.response?.status ||
+              "failed_to_embed";
+            e.message = e?.response?.data?.error?.message || e.message;
+            resolve({ data: [], error: e });
+          });
+      });
 
-    const { data = [], error = null } = await Promise.all(
-      embeddingRequests
-    ).then((results) => {
       // If any errors were returned from OpenAI abort the entire sequence because the embeddings
       // will be incomplete.
-      const errors = results
-        .filter((res) => !!res.error)
-        .map((res) => res.error)
-        .flat();
-      if (errors.length > 0) {
-        let uniqueErrors = new Set();
-        errors.map((error) =>
-          uniqueErrors.add(`[${error.type}]: ${error.message}`)
-        );
-
-        return {
-          data: [],
-          error: Array.from(uniqueErrors).join(", "),
-        };
-      }
-      return {
-        data: results.map((res) => res?.data || []).flat(),
-        error: null,
-      };
-    });
+      if (error)
+        throw new Error(`GenericOpenAI Failed to embed: ${error.message}`);
+      allResults.push(...(data || []));
+      if (this.apiRequestDelay) await this.runDelay();
+    }
 
-    if (!!error) throw new Error(`GenericOpenAI Failed to embed: ${error}`);
-    return data.length > 0 &&
-      data.every((embd) => embd.hasOwnProperty("embedding"))
-      ? data.map((embd) => embd.embedding)
+    return allResults.length > 0 &&
+      allResults.every((embd) => embd.hasOwnProperty("embedding"))
+      ? allResults.map((embd) => embd.embedding)
      : null;
   }
 }