From 2baaf4def50a3ee7bf96ea32b9c1ce4112c5c2dc Mon Sep 17 00:00:00 2001
From: timothycarambat <rambat1010@gmail.com>
Date: Thu, 14 Mar 2024 15:27:42 -0700
Subject: [PATCH 1/4] Support External Transcription providers

---
 collector/index.js                            |   4 +-
 collector/package.json                        |   3 +-
 .../processSingleFile/convert/asAudio.js      | 115 +++-------------
 collector/processSingleFile/index.js          |   3 +-
 .../utils/WhisperProviders/localWhisper.js    | 123 +++++++++++++++++-
 collector/yarn.lock                           |  20 +++
 docker/.env.example                           |  10 ++
 frontend/src/App.jsx                          |   9 ++
 .../src/components/SettingsSidebar/index.jsx  |  11 +-
 frontend/src/utils/paths.js                   |   3 +
 server/.env.example                           |  10 ++
 server/models/systemSettings.js               |   1 +
 server/utils/collectorApi/index.js            |  14 +-
 server/utils/helpers/updateENV.js             |  14 ++
 14 files changed, 231 insertions(+), 109 deletions(-)

diff --git a/collector/index.js b/collector/index.js
index 9ebe5f1ce16..a1142d75665 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -25,7 +25,7 @@ app.use(
 );
 
 app.post("/process", async function (request, response) {
-  const { filename } = reqBody(request);
+  const { filename, options = {} } = reqBody(request);
   try {
     const targetFilename = path
       .normalize(filename)
@@ -34,7 +34,7 @@ app.post("/process", async function (request, response) {
       success,
       reason,
       documents = [],
-    } = await processSingleFile(targetFilename);
+    } = await processSingleFile(targetFilename, options);
     response
       .status(200)
       .json({ filename: targetFilename, success, reason, documents });
diff --git a/collector/package.json b/collector/package.json
index d145ab86568..8a0441d7820 100644
--- a/collector/package.json
+++ b/collector/package.json
@@ -33,6 +33,7 @@
     "moment": "^2.29.4",
     "multer": "^1.4.5-lts.1",
     "officeparser": "^4.0.5",
+    "openai": "^3.2.1",
     "pdf-parse": "^1.1.1",
     "puppeteer": "~21.5.2",
     "slugify": "^1.6.6",
@@ -46,4 +47,4 @@
     "nodemon": "^2.0.22",
     "prettier": "^2.4.1"
   }
-}
+}
\ No newline at end of file
diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
index 15ae5cf001b..f6745049d4a 100644
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@@ -1,5 +1,3 @@
-const fs = require("fs");
-const path = require("path");
 const { v4 } = require("uuid");
 const {
   createdDate,
@@ -9,38 +7,34 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 const { LocalWhisper } = require("../../utils/WhisperProviders/localWhisper");
+const { OpenAiWhisper } = require("../../utils/WhisperProviders/OpenAiWhisper");
 
-async function asAudio({ fullFilePath = "", filename = "" }) {
-  const whisper = new LocalWhisper();
+const WHISPER_PROVIDERS = {
+  openai: OpenAiWhisper,
+  local: LocalWhisper,
+};
+
+async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+  const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
+    options?.whisperProvider
+  )
+    ? WHISPER_PROVIDERS[options?.whisperProvider]
+    : WHISPER_PROVIDERS.local;
 
   console.log(`-- Working ${filename} --`);
-  const transcriberPromise = new Promise((resolve) =>
-    whisper.client().then((client) => resolve(client))
-  );
-  const audioDataPromise = new Promise((resolve) =>
-    convertToWavAudioData(fullFilePath).then((audioData) => resolve(audioData))
-  );
-  const [audioData, transcriber] = await Promise.all([
-    audioDataPromise,
-    transcriberPromise,
-  ]);
+  const whisper = new WhisperProvider({ options });
+  const { content, error } = await whisper.processFile(fullFilePath, filename);
 
-  if (!audioData) {
-    console.error(`Failed to parse content from ${filename}.`);
+  if (!!error) {
+    console.error(`Error encountered for parsing of ${filename}.`);
     trashFile(fullFilePath);
     return {
       success: false,
-      reason: `Failed to parse content from ${filename}.`,
+      reason: error,
       documents: [],
     };
   }
 
-  console.log(`[Model Working]: Transcribing audio data to text`);
-  const { text: content } = await transcriber(audioData, {
-    chunk_length_s: 30,
-    stride_length_s: 5,
-  });
-
   if (!content.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
@@ -76,79 +70,4 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
   return { success: true, reason: null, documents: [document] };
 }
 
-async function convertToWavAudioData(sourcePath) {
-  try {
-    let buffer;
-    const wavefile = require("wavefile");
-    const ffmpeg = require("fluent-ffmpeg");
-    const outFolder = path.resolve(__dirname, `../../storage/tmp`);
-    if (!fs.existsSync(outFolder)) fs.mkdirSync(outFolder, { recursive: true });
-
-    const fileExtension = path.extname(sourcePath).toLowerCase();
-    if (fileExtension !== ".wav") {
-      console.log(
-        `[Conversion Required] ${fileExtension} file detected - converting to .wav`
-      );
-      const outputFile = path.resolve(outFolder, `${v4()}.wav`);
-      const convert = new Promise((resolve) => {
-        ffmpeg(sourcePath)
-          .toFormat("wav")
-          .on("error", (error) => {
-            console.error(`[Conversion Error] ${error.message}`);
-            resolve(false);
-          })
-          .on("progress", (progress) =>
-            console.log(
-              `[Conversion Processing]: ${progress.targetSize}KB converted`
-            )
-          )
-          .on("end", () => {
-            console.log("[Conversion Complete]: File converted to .wav!");
-            resolve(true);
-          })
-          .save(outputFile);
-      });
-      const success = await convert;
-      if (!success)
-        throw new Error(
-          "[Conversion Failed]: Could not convert file to .wav format!"
-        );
-
-      const chunks = [];
-      const stream = fs.createReadStream(outputFile);
-      for await (let chunk of stream) chunks.push(chunk);
-      buffer = Buffer.concat(chunks);
-      fs.rmSync(outputFile);
-    } else {
-      const chunks = [];
-      const stream = fs.createReadStream(sourcePath);
-      for await (let chunk of stream) chunks.push(chunk);
-      buffer = Buffer.concat(chunks);
-    }
-
-    const wavFile = new wavefile.WaveFile(buffer);
-    wavFile.toBitDepth("32f");
-    wavFile.toSampleRate(16000);
-
-    let audioData = wavFile.getSamples();
-    if (Array.isArray(audioData)) {
-      if (audioData.length > 1) {
-        const SCALING_FACTOR = Math.sqrt(2);
-
-        // Merge channels into first channel to save memory
-        for (let i = 0; i < audioData[0].length; ++i) {
-          audioData[0][i] =
-            (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
-        }
-      }
-      audioData = audioData[0];
-    }
-
-    return audioData;
-  } catch (error) {
-    console.error(`convertToWavAudioData`, error);
-    return null;
-  }
-}
-
 module.exports = asAudio;
diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js
index 569a2cde27a..5d9e6a38af9 100644
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@@ -7,7 +7,7 @@ const {
 const { trashFile, isTextType } = require("../utils/files");
 const RESERVED_FILES = ["__HOTDIR__.md"];
 
-async function processSingleFile(targetFilename) {
+async function processSingleFile(targetFilename, options = {}) {
   const fullFilePath = path.resolve(WATCH_DIRECTORY, targetFilename);
   if (RESERVED_FILES.includes(targetFilename))
     return {
@@ -54,6 +54,7 @@ async function processSingleFile(targetFilename) {
   return await FileTypeProcessor({
     fullFilePath,
     filename: targetFilename,
+    options,
   });
 }
 
diff --git a/collector/utils/WhisperProviders/localWhisper.js b/collector/utils/WhisperProviders/localWhisper.js
index 6503e2021eb..7a4e61ccda6 100644
--- a/collector/utils/WhisperProviders/localWhisper.js
+++ b/collector/utils/WhisperProviders/localWhisper.js
@@ -1,5 +1,6 @@
-const path = require("path");
 const fs = require("fs");
+const path = require("path");
+const { v4 } = require("uuid");
 
 class LocalWhisper {
   constructor() {
@@ -16,12 +17,94 @@ class LocalWhisper {
     // Make directory when it does not exist in existing installations
     if (!fs.existsSync(this.cacheDir))
       fs.mkdirSync(this.cacheDir, { recursive: true });
+
+    this.#log("Initialized.");
+  }
+
+  #log(text, ...args) {
+    console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Decode an audio file (converting to .wav first when needed) into samples.
+   * @returns {Promise<?object>} sample data, or null when anything fails.
+   */
+  async #convertToWavAudioData(sourcePath) {
+    try {
+      let buffer;
+      const wavefile = require("wavefile");
+      const ffmpeg = require("fluent-ffmpeg");
+      const outFolder = path.resolve(__dirname, `../../storage/tmp`);
+      if (!fs.existsSync(outFolder))
+        fs.mkdirSync(outFolder, { recursive: true });
+
+      const fileExtension = path.extname(sourcePath).toLowerCase();
+      if (fileExtension !== ".wav") {
+        this.#log(
+          `File conversion required! ${fileExtension} file detected - converting to .wav`
+        );
+        const outputFile = path.resolve(outFolder, `${v4()}.wav`);
+        try {
+          const success = await new Promise((resolve) => {
+            ffmpeg(sourcePath)
+              .toFormat("wav")
+              .on("error", (error) => {
+                this.#log(`Conversion Error! ${error.message}`);
+                resolve(false);
+              })
+              .on("progress", (progress) =>
+                this.#log(
+                  `Conversion Processing! ${progress.targetSize}KB converted`
+                )
+              )
+              .on("end", () => {
+                this.#log(`Conversion Complete! File converted to .wav!`);
+                resolve(true);
+              })
+              .save(outputFile);
+          });
+          if (!success)
+            throw new Error(
+              "[Conversion Failed]: Could not convert file to .wav format!"
+            );
+          buffer = await fs.promises.readFile(outputFile);
+        } finally {
+          // Remove the temp .wav even when conversion or reading fails.
+          fs.rmSync(outputFile, { force: true });
+        }
+      } else {
+        buffer = await fs.promises.readFile(sourcePath);
+      }
+
+      const wavFile = new wavefile.WaveFile(buffer);
+      wavFile.toBitDepth("32f");
+      wavFile.toSampleRate(16000);
+
+      let audioData = wavFile.getSamples();
+      if (Array.isArray(audioData)) {
+        if (audioData.length > 1) {
+          const SCALING_FACTOR = Math.sqrt(2);
+
+          // Merge channels into first channel to save memory
+          for (let i = 0; i < audioData[0].length; ++i) {
+            audioData[0][i] =
+              (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
+          }
+        }
+        audioData = audioData[0];
+      }
+
+      return audioData;
+    } catch (error) {
+      console.error(`convertToWavAudioData`, error);
+      return null;
+    }
+  }
 
   async client() {
     if (!fs.existsSync(this.modelPath)) {
-      console.log(
-        "\x1b[34m[INFO]\x1b[0m The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)\n\n"
+      this.#log(
+        `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)`
       );
     }
 
@@ -48,10 +131,42 @@ class LocalWhisper {
           : {}),
       });
     } catch (error) {
-      console.error("Failed to load the native whisper model:", error);
+      this.#log("Failed to load the native whisper model:", error);
       throw error;
     }
   }
+
+  /**
+   * Transcribe an audio file with the local whisper model.
+   * @param {string} fullFilePath - path of the audio file to transcribe.
+   * @param {string} filename - name used in log and error messages.
+   * @returns {Promise<{content: ?string, error: ?string}>} failures set `error`.
+   */
+  async processFile(fullFilePath, filename) {
+    try {
+      // Hand both promises straight to Promise.all so a client() rejection
+      // reaches the catch below instead of stranding a wrapper promise.
+      const [audioData, transcriber] = await Promise.all([
+        this.#convertToWavAudioData(fullFilePath),
+        this.client(),
+      ]);
+
+      if (!audioData) {
+        const reason = `Failed to parse content from ${filename}.`;
+        this.#log(reason);
+        return { content: null, error: reason };
+      }
+
+      this.#log(`Transcribing audio data to text...`);
+      const { text } = await transcriber(audioData, {
+        chunk_length_s: 30,
+        stride_length_s: 5,
+      });
+      return { content: text, error: null };
+    } catch (error) {
+      return { content: null, error: error.message };
+    }
+  }
 }
 
 module.exports = {
diff --git a/collector/yarn.lock b/collector/yarn.lock
index bf979c86c0d..3bb0f1ea794 100644
--- a/collector/yarn.lock
+++ b/collector/yarn.lock
@@ -372,6 +372,13 @@ asynckit@^0.4.0:
   resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79"
   integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==
 
+axios@^0.26.0:
+  version "0.26.1"
+  resolved "https://registry.yarnpkg.com/axios/-/axios-0.26.1.tgz#1ede41c51fcf51bbbd6fd43669caaa4f0495aaa9"
+  integrity sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==
+  dependencies:
+    follow-redirects "^1.14.8"
+
 b4a@^1.6.4:
   version "1.6.4"
   resolved "https://registry.yarnpkg.com/b4a/-/b4a-1.6.4.tgz#ef1c1422cae5ce6535ec191baeed7567443f36c9"
@@ -1203,6 +1210,11 @@ fluent-ffmpeg@^2.1.2:
     async ">=0.2.9"
     which "^1.1.1"
 
+follow-redirects@^1.14.8:
+  version "1.15.6"
+  resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.6.tgz#7f815c0cda4249c74ff09e95ef97c23b5fd0399b"
+  integrity sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==
+
 form-data-encoder@1.7.2:
   version "1.7.2"
   resolved "https://registry.yarnpkg.com/form-data-encoder/-/form-data-encoder-1.7.2.tgz#1f1ae3dccf58ed4690b86d87e4f57c654fbab040"
@@ -2304,6 +2316,14 @@ onnxruntime-web@1.14.0:
     onnxruntime-common "~1.14.0"
     platform "^1.3.6"
 
+openai@^3.2.1:
+  version "3.3.0"
+  resolved "https://registry.yarnpkg.com/openai/-/openai-3.3.0.tgz#a6408016ad0945738e1febf43f2fccca83a3f532"
+  integrity sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==
+  dependencies:
+    axios "^0.26.0"
+    form-data "^4.0.0"
+
 openai@^4.19.0:
   version "4.20.1"
   resolved "https://registry.yarnpkg.com/openai/-/openai-4.20.1.tgz#afa0d496d125b5a0f6cebcb4b9aeabf71e00214e"
diff --git a/docker/.env.example b/docker/.env.example
index ae4913dc44c..ed6fd3bce63 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -131,6 +131,16 @@ GID='1000'
 # ASTRA_DB_APPLICATION_TOKEN=
 # ASTRA_DB_ENDPOINT=
 
+###########################################
+######## Audio Model Selection ############
+###########################################
+# (default) use built-in whisper-small model.
+# WHISPER_PROVIDER="local"
+
+# use openai hosted whisper model.
+# WHISPER_PROVIDER="openai"
+# OPEN_AI_KEY=sk-xxxxxxxx
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # DISABLE_TELEMETRY="false"
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index 86f6eb08ae2..8a57d27bb65 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -29,6 +29,9 @@ const GeneralApiKeys = lazy(() => import("@/pages/GeneralSettings/ApiKeys"));
 const GeneralLLMPreference = lazy(
   () => import("@/pages/GeneralSettings/LLMPreference")
 );
+const GeneralTranscriptionPreference = lazy(
+  () => import("@/pages/GeneralSettings/TranscriptionPreference")
+);
 const GeneralEmbeddingPreference = lazy(
   () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
@@ -76,6 +79,12 @@ export default function App() {
                 path="/settings/llm-preference"
                 element={<AdminRoute Component={GeneralLLMPreference} />}
               />
+              <Route
+                path="/settings/transcription-preference"
+                element={
+                  <AdminRoute Component={GeneralTranscriptionPreference} />
+                }
+              />
               <Route
                 path="/settings/embedding-preference"
                 element={<AdminRoute Component={GeneralEmbeddingPreference} />}
diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx
index 84b78064a3a..a7aca7ffe21 100644
--- a/frontend/src/components/SettingsSidebar/index.jsx
+++ b/frontend/src/components/SettingsSidebar/index.jsx
@@ -19,6 +19,7 @@ import {
   Notepad,
   CodeBlock,
   Barcode,
+  ClosedCaptioning,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@@ -278,9 +279,17 @@ const SidebarOptions = ({ user = null }) => (
       flex={true}
       allowedRole={["admin"]}
     />
+    <Option
+      href={paths.settings.transcriptionPreference()}
+      btnText="Transcription Model"
+      icon={<ClosedCaptioning className="h-5 w-5 flex-shrink-0" />}
+      user={user}
+      flex={true}
+      allowedRole={["admin"]}
+    />
     <Option
       href={paths.settings.embeddingPreference()}
-      btnText="Embedding Preference"
+      btnText="Embedding Model"
       icon={<FileCode className="h-5 w-5 flex-shrink-0" />}
       user={user}
       flex={true}
diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js
index da10aa23cba..6c8745af31b 100644
--- a/frontend/src/utils/paths.js
+++ b/frontend/src/utils/paths.js
@@ -92,6 +92,9 @@ export default {
     llmPreference: () => {
       return "/settings/llm-preference";
     },
+    transcriptionPreference: () => {
+      return "/settings/transcription-preference";
+    },
     embeddingPreference: () => {
       return "/settings/embedding-preference";
     },
diff --git a/server/.env.example b/server/.env.example
index 88e60182ce0..c5681db4a07 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -128,6 +128,16 @@ VECTOR_DB="lancedb"
 # ZILLIZ_ENDPOINT="https://sample.api.gcp-us-west1.zillizcloud.com"
 # ZILLIZ_API_TOKEN=api-token-here
 
+###########################################
+######## Audio Model Selection ############
+###########################################
+# (default) use built-in whisper-small model.
+WHISPER_PROVIDER="local"
+
+# use openai hosted whisper model.
+# WHISPER_PROVIDER="openai"
+# OPEN_AI_KEY=sk-xxxxxxxx
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index b06fe123004..680ecf4f7cc 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -258,6 +258,7 @@ const SystemSettings = {
             AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
           }
         : {}),
+      WhisperProvider: process.env.WHISPER_PROVIDER || "local",
     };
   },
 
diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js
index 7e8c1149308..ed27a928924 100644
--- a/server/utils/collectorApi/index.js
+++ b/server/utils/collectorApi/index.js
@@ -5,13 +5,20 @@
 
 class CollectorApi {
   constructor() {
-    this.endpoint = "http://0.0.0.0:8888";
+    this.endpoint = `http://0.0.0.0:${process.env.COLLECTOR_PORT || 8888}`;
   }
 
   log(text, ...args) {
     console.log(`\x1b[36m[CollectorApi]\x1b[0m ${text}`, ...args);
   }
 
+  // Transcription settings forwarded to the collector in the request body.
+  #attachOptions() {
+    const whisperProvider = process.env.WHISPER_PROVIDER || "local";
+    const openAiKey = process.env.OPEN_AI_KEY || null;
+    return { whisperProvider, openAiKey };
+  }
+
   async online() {
     return await fetch(this.endpoint)
       .then((res) => res.ok)
@@ -38,7 +45,10 @@ class CollectorApi {
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ filename }),
+      body: JSON.stringify({
+        filename,
+        options: this.#attachOptions(),
+      }),
     })
       .then((res) => {
         if (!res.ok) throw new Error("Response could not be completed");
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index aa814d69046..6d91e646d4e 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -269,6 +269,13 @@ const KEY_MAPPING = {
     checks: [isNotEmpty],
   },
 
+  // Whisper (transcription) providers
+  WhisperProvider: {
+    envKey: "WHISPER_PROVIDER",
+    checks: [isNotEmpty, supportedTranscriptionProvider],
+    postUpdate: [],
+  },
+
   // System Settings
   AuthToken: {
     envKey: "AUTH_TOKEN",
@@ -351,6 +358,13 @@ function supportedLLM(input = "") {
   return validSelection ? null : `${input} is not a valid LLM provider.`;
 }
 
+// Validation check: null when `input` is a known provider, else a message.
+function supportedTranscriptionProvider(input = "") {
+  const knownProviders = ["openai", "local"];
+  if (knownProviders.includes(input)) return null;
+  return `${input} is not a valid transcription model provider.`;
+}
+
 function validGeminiModel(input = "") {
   const validModels = ["gemini-pro"];
   return validModels.includes(input)

From 3cb8aa841cf5ff8bd4f11e2be7e6d942f2809ac8 Mon Sep 17 00:00:00 2001
From: timothycarambat <rambat1010@gmail.com>
Date: Thu, 14 Mar 2024 15:27:50 -0700
Subject: [PATCH 2/4] patch files

---
 .../utils/WhisperProviders/OpenAiWhisper.js   |  44 +++++
 .../NativeTranscriptionOptions/index.jsx      |  38 ++++
 .../OpenAiOptions/index.jsx                   |  41 ++++
 .../TranscriptionPreference/index.jsx         | 180 ++++++++++++++++++
 4 files changed, 303 insertions(+)
 create mode 100644 collector/utils/WhisperProviders/OpenAiWhisper.js
 create mode 100644 frontend/src/components/TranscriptionSelection/NativeTranscriptionOptions/index.jsx
 create mode 100644 frontend/src/components/TranscriptionSelection/OpenAiOptions/index.jsx
 create mode 100644 frontend/src/pages/GeneralSettings/TranscriptionPreference/index.jsx

diff --git a/collector/utils/WhisperProviders/OpenAiWhisper.js b/collector/utils/WhisperProviders/OpenAiWhisper.js
new file mode 100644
index 00000000000..3b9d08e6a65
--- /dev/null
+++ b/collector/utils/WhisperProviders/OpenAiWhisper.js
@@ -0,0 +1,44 @@
+const fs = require("fs");
+
+// Transcription provider backed by the hosted OpenAI `whisper-1` model.
+class OpenAiWhisper {
+  constructor({ options }) {
+    const { Configuration, OpenAIApi } = require("openai");
+    if (!options.openAiKey) throw new Error("No OpenAI API key was set.");
+
+    this.openai = new OpenAIApi(
+      new Configuration({ apiKey: options.openAiKey })
+    );
+    this.model = "whisper-1";
+    this.temperature = 0;
+    this.#log("Initialized.");
+  }
+
+  #log(text, ...args) {
+    console.log(`\x1b[32m[OpenAiWhisper]\x1b[0m ${text}`, ...args);
+  }
+
+  // Sends the audio file to OpenAI and resolves to `{ content, error }`;
+  // request failures are logged and surfaced through `error`.
+  async processFile(fullFilePath) {
+    const onSuccess = (res) => {
+      if (!res.hasOwnProperty("data"))
+        return { content: "", error: "No content was able to be transcribed." };
+      return { content: res.data, error: null };
+    };
+    const onFailure = (e) => {
+      this.#log(`Could not get any response from openai whisper`, e.message);
+      return { content: "", error: e.message };
+    };
+    const audio = fs.createReadStream(fullFilePath);
+    const args = [audio, this.model, undefined, "text", this.temperature];
+    return await this.openai
+      .createTranscription(...args)
+      .then(onSuccess)
+      .catch(onFailure);
+  }
+}
+
+module.exports = {
+  OpenAiWhisper,
+};
diff --git a/frontend/src/components/TranscriptionSelection/NativeTranscriptionOptions/index.jsx b/frontend/src/components/TranscriptionSelection/NativeTranscriptionOptions/index.jsx
new file mode 100644
index 00000000000..07ee12126ae
--- /dev/null
+++ b/frontend/src/components/TranscriptionSelection/NativeTranscriptionOptions/index.jsx
@@ -0,0 +1,47 @@
+import { Gauge } from "@phosphor-icons/react";
+
+// Value and label of the single entry in the read-only model selector.
+const LOCAL_WHISPER_MODEL = "Xenova/whisper-small";
+
+/**
+ * Settings panel for the built-in whisper provider: a resource-usage notice
+ * followed by a disabled selector that displays the model name.
+ */
+export default function NativeTranscriptionOptions() {
+  return (
+    <div className="w-full flex flex-col gap-y-4">
+      <div className="flex flex-col md:flex-row md:items-center gap-x-2 text-white mb-4 bg-blue-800/30 w-fit rounded-lg px-4 py-2">
+        <div className="gap-x-2 flex items-center">
+          <Gauge size={25} />
+          <p className="text-sm">
+            Using the local whisper model on machines with limited RAM or CPU
+            can stall AnythingLLM when processing media files.
+            <br />
+            We recommend at least 2GB of RAM and upload files &lt;10MB.
+            <br />
+            <br />
+            <i>
+              The built-in model will automatically download on the first use.
+            </i>
+          </p>
+        </div>
+      </div>
+      <div className="w-full flex items-center gap-4">
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-4">
+            Model Selection
+          </label>
+          <select
+            disabled={true}
+            defaultValue={LOCAL_WHISPER_MODEL}
+            className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+          >
+            <option value={LOCAL_WHISPER_MODEL} disabled={true}>
+              {LOCAL_WHISPER_MODEL}
+            </option>
+          </select>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TranscriptionSelection/OpenAiOptions/index.jsx b/frontend/src/components/TranscriptionSelection/OpenAiOptions/index.jsx
new file mode 100644
index 00000000000..aa48363e9d3
--- /dev/null
+++ b/frontend/src/components/TranscriptionSelection/OpenAiOptions/index.jsx
@@ -0,0 +1,44 @@
+// Value and label of the single entry in the read-only model selector.
+const OPENAI_WHISPER_MODEL = "Whisper Large";
+
+/**
+ * Settings panel for the OpenAI transcription provider.
+ * @param {object} props
+ * @param {?object} props.settings - system settings; when `OpenAiKey` is
+ *   truthy the key field is pre-filled with a mask instead of the real key.
+ */
+export default function OpenAiWhisperOptions({ settings }) {
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="OpenAiKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="OpenAI API Key"
+          defaultValue={settings?.OpenAiKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Whisper Model
+        </label>
+        <select
+          disabled={true}
+          defaultValue={OPENAI_WHISPER_MODEL}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          <option value={OPENAI_WHISPER_MODEL} disabled={true}>
+            {OPENAI_WHISPER_MODEL}
+          </option>
+        </select>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/pages/GeneralSettings/TranscriptionPreference/index.jsx b/frontend/src/pages/GeneralSettings/TranscriptionPreference/index.jsx
new file mode 100644
index 00000000000..a56dc26e7c9
--- /dev/null
+++ b/frontend/src/pages/GeneralSettings/TranscriptionPreference/index.jsx
@@ -0,0 +1,180 @@
+import React, { useEffect, useState } from "react";
+import { isMobile } from "react-device-detect";
+import Sidebar from "@/components/SettingsSidebar";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+import PreLoader from "@/components/Preloader";
+
+import OpenAiLogo from "@/media/llmprovider/openai.png";
+import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
+import OpenAiWhisperOptions from "@/components/TranscriptionSelection/OpenAiOptions";
+import NativeTranscriptionOptions from "@/components/TranscriptionSelection/NativeTranscriptionOptions";
+import LLMItem from "@/components/LLMSelection/LLMItem";
+import { MagnifyingGlass } from "@phosphor-icons/react";
+
+export default function TranscriptionModelPreference() {
+  const [saving, setSaving] = useState(false);
+  const [hasChanges, setHasChanges] = useState(false);
+  const [settings, setSettings] = useState(null);
+  const [loading, setLoading] = useState(true);
+  const [searchQuery, setSearchQuery] = useState("");
+  const [filteredProviders, setFilteredProviders] = useState([]);
+  const [selectedProvider, setSelectedProvider] = useState(null);
+
+  // Persist the selected provider plus any provider-specific form fields.
+  const handleSubmit = async (e) => {
+    e.preventDefault();
+    const data = { WhisperProvider: selectedProvider };
+    const formData = new FormData(e.target);
+    for (const [key, value] of formData.entries()) data[key] = value;
+
+    // Mark the save in-flight before awaiting so the button disables itself.
+    setSaving(true);
+    const { error } = await System.updateSystem(data);
+    if (error) {
+      showToast(`Failed to save preferences: ${error}`, "error");
+    } else {
+      showToast("Transcription preferences saved successfully.", "success");
+    }
+    setSaving(false);
+    setHasChanges(!!error);
+  };
+
+  const updateProviderChoice = (selection) => {
+    setSelectedProvider(selection);
+    setHasChanges(true);
+  };
+
+  useEffect(() => {
+    async function fetchKeys() {
+      const _settings = await System.keys();
+      setSettings(_settings);
+      setSelectedProvider(_settings?.WhisperProvider || "local");
+      setLoading(false);
+    }
+    fetchKeys();
+  }, []);
+
+  useEffect(() => {
+    const filtered = PROVIDERS.filter((provider) =>
+      provider.name.toLowerCase().includes(searchQuery.toLowerCase())
+    );
+    setFilteredProviders(filtered);
+  }, [searchQuery, selectedProvider]);
+
+  const PROVIDERS = [
+    {
+      name: "OpenAI",
+      value: "openai",
+      logo: OpenAiLogo,
+      options: <OpenAiWhisperOptions settings={settings} />,
+      description:
+        "Leverage the OpenAI Whisper-large model using your API key.",
+    },
+    {
+      name: "AnythingLLM Built-In",
+      value: "local",
+      logo: AnythingLLMIcon,
+      options: <NativeTranscriptionOptions settings={settings} />,
+      description: "Run a built-in whisper model on this instance privately.",
+    },
+  ];
+
+  return (
+    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
+      <Sidebar />
+      {loading ? (
+        <div
+          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
+        >
+          <div className="w-full h-full flex justify-center items-center">
+            <PreLoader />
+          </div>
+        </div>
+      ) : (
+        <div
+          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
+        >
+          <form onSubmit={handleSubmit} className="flex w-full">
+            <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[86px] md:py-6 py-16">
+              <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
+                <div className="flex gap-x-4 items-center">
+                  <p className="text-lg leading-6 font-bold text-white">
+                    Transcription Model Preference
+                  </p>
+                  {hasChanges && (
+                    <button
+                      type="submit"
+                      disabled={saving}
+                      className="flex items-center gap-x-2 px-4 py-2 rounded-lg bg-[#2C2F36] text-white text-sm hover:bg-[#3D4147] shadow-md border border-[#3D4147]"
+                    >
+                      {saving ? "Saving..." : "Save changes"}
+                    </button>
+                  )}
+                </div>
+                <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
+                  These are the credentials and settings for your preferred
+                  transcription model provider. Its important these keys are
+                  current and correct or else media files and audio will not
+                  transcribe.
+                </p>
+              </div>
+              <div className="text-sm font-medium text-white mt-6 mb-4">
+                Transcription Providers
+              </div>
+              <div className="w-full">
+                <div className="w-full relative border-slate-300/20 shadow border-4 rounded-xl text-white">
+                  <div className="w-full p-4 absolute top-0 rounded-t-lg backdrop-blur-sm">
+                    <div className="w-full flex items-center sticky top-0">
+                      <MagnifyingGlass
+                        size={16}
+                        weight="bold"
+                        className="absolute left-4 z-30 text-white"
+                      />
+                      <input
+                        type="text"
+                        placeholder="Search audio transcription providers"
+                        className="bg-zinc-600 z-20 pl-10 h-[38px] rounded-full w-full px-4 py-1 text-sm border-2 border-slate-300/40 outline-none focus:border-white text-white"
+                        onChange={(e) => setSearchQuery(e.target.value)}
+                        autoComplete="off"
+                        onKeyDown={(e) => {
+                          if (e.key === "Enter") e.preventDefault();
+                        }}
+                      />
+                    </div>
+                  </div>
+                  <div className="px-4 pt-[70px] flex flex-col gap-y-1 max-h-[390px] overflow-y-auto no-scroll pb-4">
+                    {filteredProviders.map((provider) => {
+                      return (
+                        <LLMItem
+                          key={provider.name}
+                          name={provider.name}
+                          value={provider.value}
+                          image={provider.logo}
+                          description={provider.description}
+                          checked={selectedProvider === provider.value}
+                          onClick={() => updateProviderChoice(provider.value)}
+                        />
+                      );
+                    })}
+                  </div>
+                </div>
+                <div
+                  onChange={() => setHasChanges(true)}
+                  className="mt-4 flex flex-col gap-y-1"
+                >
+                  {selectedProvider &&
+                    PROVIDERS.find(
+                      (provider) => provider.value === selectedProvider
+                    )?.options}
+                </div>
+              </div>
+            </div>
+          </form>
+        </div>
+      )}
+    </div>
+  );
+}

From d86028febba93cca00cb1c79f264a47a7b5a601d Mon Sep 17 00:00:00 2001
From: timothycarambat <rambat1010@gmail.com>
Date: Thu, 14 Mar 2024 15:32:55 -0700
Subject: [PATCH 3/4] update docs

---
 server/storage/models/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/server/storage/models/README.md b/server/storage/models/README.md
index 965083dcedb..432f60572ad 100644
--- a/server/storage/models/README.md
+++ b/server/storage/models/README.md
@@ -14,6 +14,9 @@ AnythingLLM allows you to upload various audio and video formats as source docum
 
 Once transcribed you can embed these transcriptions into your workspace like you would any other file! 
 
+**Other external model/transcription providers are also supported.**
+- [OpenAI Whisper via API key.](https://openai.com/research/whisper)
+
 ## Text generation (LLM selection)
 > [!IMPORTANT]
 > Use of a locally running LLM model is **experimental** and may behave unexpectedly, crash, or not function at all.

From 96f2a044afe48338380b84bbfbbdfbe4d09e6285 Mon Sep 17 00:00:00 2001
From: timothycarambat <rambat1010@gmail.com>
Date: Thu, 14 Mar 2024 15:42:19 -0700
Subject: [PATCH 4/4] fix return data

---
 collector/processSingleFile/convert/asAudio.js   | 2 +-
 collector/utils/WhisperProviders/localWhisper.js | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
index f6745049d4a..170426e4068 100644
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@@ -35,7 +35,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     };
   }
 
-  if (!content.length) {
+  if (!content?.length) {
     console.error(`Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
     return {
diff --git a/collector/utils/WhisperProviders/localWhisper.js b/collector/utils/WhisperProviders/localWhisper.js
index 7a4e61ccda6..46dbe226b4d 100644
--- a/collector/utils/WhisperProviders/localWhisper.js
+++ b/collector/utils/WhisperProviders/localWhisper.js
@@ -153,7 +153,10 @@ class LocalWhisper {
 
       if (!audioData) {
         this.#log(`Failed to parse content from ${filename}.`);
-        return null;
+        return {
+          content: null,
+          error: `Failed to parse content from ${filename}.`,
+        };
       }
 
       this.#log(`Transcribing audio data to text...`);