From f45ee9692efe44122784c5946c3f257951f7acce Mon Sep 17 00:00:00 2001
From: Chaser Huang
Date: Wed, 20 Aug 2025 15:50:15 -0400
Subject: [PATCH 1/3] Add ENV to configure api request delay for generic open ai embedding engine

---
 server/utils/EmbeddingEngines/genericOpenAi/index.js | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index e88538f4b45..5fa954b59ef 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -14,6 +14,9 @@ class GenericOpenAiEmbedder {
     });
     this.model = process.env.EMBEDDING_MODEL_PREF ?? null;
     this.embeddingMaxChunkLength = maximumChunkLength();
+    // Delay to apply after each embedding API call
+    // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
+    this.apiRequestDelay = process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -76,6 +79,10 @@ class GenericOpenAiEmbedder {
             });
         })
       );
+      // Apply delay before dispatching the next api request
+      if (this.apiRequestDelay) {
+        await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+      }
     }
 
     const { data = [], error = null } = await Promise.all(

From d19048ce875ff010afd5156d547c69e11ebee691 Mon Sep 17 00:00:00 2001
From: Chaser Huang
Date: Wed, 20 Aug 2025 16:54:18 -0400
Subject: [PATCH 2/3] yarn lint formatting

---
 server/utils/EmbeddingEngines/genericOpenAi/index.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index 5fa954b59ef..bac7afef7ba 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -16,7 +16,8 @@ class GenericOpenAiEmbedder {
     this.embeddingMaxChunkLength = maximumChunkLength();
     // Delay to apply after each embedding API call
     // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
-    this.apiRequestDelay = process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
+    this.apiRequestDelay =
+      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -81,7 +82,9 @@ class GenericOpenAiEmbedder {
       );
       // Apply delay before dispatching the next api request
       if (this.apiRequestDelay) {
-        await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+        await new Promise((resolve) =>
+          setTimeout(resolve, this.apiRequestDelay)
+        );
       }
     }
 

From 90398433a8404eb9a50361f3de35c4069ffaf8e1 Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Wed, 17 Sep 2025 20:51:39 -0700
Subject: [PATCH 3/3] refactor

---
 docker/.env.example                                  |   1 +
 server/.env.example                                  |   1 +
 .../EmbeddingEngines/genericOpenAi/index.js          | 115 +++++++++---------
 3 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/docker/.env.example b/docker/.env.example
index dca22fa0493..47556dc061b 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -190,6 +190,7 @@ GID='1000'
 # EMBEDDING_BASE_PATH='http://127.0.0.1:4000'
 # GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc'
 # GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS=500
+# GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS=1000
 
 # EMBEDDING_ENGINE='gemini'
 # GEMINI_EMBEDDING_API_KEY=

diff --git a/server/.env.example b/server/.env.example
index 0d3d1ecd0e0..0dc4e3c0d7c 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -188,6 +188,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
 # EMBEDDING_BASE_PATH='http://127.0.0.1:4000'
 # GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc'
 # GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS=500
+# GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS=1000
 
 # EMBEDDING_ENGINE='gemini'
 # GEMINI_EMBEDDING_API_KEY=

diff --git a/server/utils/EmbeddingEngines/genericOpenAi/index.js b/server/utils/EmbeddingEngines/genericOpenAi/index.js
index bac7afef7ba..a8a3ac1a584 100644
--- a/server/utils/EmbeddingEngines/genericOpenAi/index.js
+++ b/server/utils/EmbeddingEngines/genericOpenAi/index.js
@@ -14,10 +14,6 @@ class GenericOpenAiEmbedder {
     });
     this.model = process.env.EMBEDDING_MODEL_PREF ?? null;
     this.embeddingMaxChunkLength = maximumChunkLength();
-    // Delay to apply after each embedding API call
-    // For some implementation (e.g. llama.cpp) this is necessary to avoid 429 errors
-    this.apiRequestDelay =
-      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS ?? null;
 
     // this.maxConcurrentChunks is delegated to the getter below.
     // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
@@ -32,6 +28,35 @@ class GenericOpenAiEmbedder {
     console.log(`\x1b[36m[GenericOpenAiEmbedder]\x1b[0m ${text}`, ...args);
   }
 
+  /**
+   * returns the `GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS` env variable as a number or null if the env variable is not set or is not a number.
+   * The minimum delay is 500ms.
+   *
+   * For some implementation this is necessary to avoid 429 errors due to rate limiting or
+   * hardware limitations where a single-threaded process is not able to handle the requests fast enough.
+   * @returns {number}
+   */
+  get apiRequestDelay() {
+    if (!("GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS" in process.env)) return null;
+    if (isNaN(Number(process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS)))
+      return null;
+    const delayTimeout = Number(
+      process.env.GENERIC_OPEN_AI_EMBEDDING_API_DELAY_MS
+    );
+    if (delayTimeout < 500) return 500; // minimum delay of 500ms
+    return delayTimeout;
+  }
+
+  /**
+   * runs the delay if it is set and valid.
+   * @returns {Promise}
+   */
+  async runDelay() {
+    if (!this.apiRequestDelay) return;
+    this.log(`Delaying new batch request for ${this.apiRequestDelay}ms`);
+    await new Promise((resolve) => setTimeout(resolve, this.apiRequestDelay));
+  }
+
   /**
    * returns the `GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS` env variable as a number
    * or 500 if the env variable is not set or is not a number.
@@ -56,68 +81,38 @@ class GenericOpenAiEmbedder {
 
   async embedChunks(textChunks = []) {
     // Because there is a hard POST limit on how many chunks can be sent at once to OpenAI (~8mb)
-    // we concurrently execute each max batch of text chunks possible.
+    // we sequentially execute each max batch of text chunks possible.
     // Refer to constructor maxConcurrentChunks for more info.
-    const embeddingRequests = [];
+    const allResults = [];
     for (const chunk of toChunks(textChunks, this.maxConcurrentChunks)) {
-      embeddingRequests.push(
-        new Promise((resolve) => {
-          this.openai.embeddings
-            .create({
-              model: this.model,
-              input: chunk,
-            })
-            .then((result) => {
-              resolve({ data: result?.data, error: null });
-            })
-            .catch((e) => {
-              e.type =
-                e?.response?.data?.error?.code ||
-                e?.response?.status ||
-                "failed_to_embed";
-              e.message = e?.response?.data?.error?.message || e.message;
-              resolve({ data: [], error: e });
-            });
-        })
-      );
-      // Apply delay before dispatching the next api request
-      if (this.apiRequestDelay) {
-        await new Promise((resolve) =>
-          setTimeout(resolve, this.apiRequestDelay)
-        );
-      }
-    }
+      const { data = [], error = null } = await new Promise((resolve) => {
+        this.openai.embeddings
+          .create({
+            model: this.model,
+            input: chunk,
+          })
+          .then((result) => resolve({ data: result?.data, error: null }))
+          .catch((e) => {
+            e.type =
+              e?.response?.data?.error?.code ||
+              e?.response?.status ||
+              "failed_to_embed";
+            e.message = e?.response?.data?.error?.message || e.message;
+            resolve({ data: [], error: e });
+          });
+      });
 
-    const { data = [], error = null } = await Promise.all(
-      embeddingRequests
-    ).then((results) => {
       // If any errors were returned from OpenAI abort the entire sequence because the embeddings
       // will be incomplete.
-      const errors = results
-        .filter((res) => !!res.error)
-        .map((res) => res.error)
-        .flat();
-      if (errors.length > 0) {
-        let uniqueErrors = new Set();
-        errors.map((error) =>
-          uniqueErrors.add(`[${error.type}]: ${error.message}`)
-        );
-
-        return {
-          data: [],
-          error: Array.from(uniqueErrors).join(", "),
-        };
-      }
-      return {
-        data: results.map((res) => res?.data || []).flat(),
-        error: null,
-      };
-    });
+      if (error)
+        throw new Error(`GenericOpenAI Failed to embed: ${error.message}`);
+      allResults.push(...(data || []));
+      if (this.apiRequestDelay) await this.runDelay();
+    }
 
-    if (!!error) throw new Error(`GenericOpenAI Failed to embed: ${error}`);
-    return data.length > 0 &&
-      data.every((embd) => embd.hasOwnProperty("embedding"))
-      ? data.map((embd) => embd.embedding)
+    return allResults.length > 0 &&
+      allResults.every((embd) => embd.hasOwnProperty("embedding"))
+      ? allResults.map((embd) => embd.embedding)
      : null;
   }
 }