Mintplex-Labs · timothycarambat · May 14, 2024 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -11,6 +11,7 @@
     "cooldowns",
     "Deduplicator",
     "Dockerized",
+    "elevenlabs",
     "Embeddable",
     "epub",
     "GROQ",

diff --git a/docker/.env.example b/docker/.env.example
@@ -171,6 +171,19 @@ GID='1000'
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 
+###########################################
+######## TTS/STT Model Selection ##########
+###########################################
+# TTS_PROVIDER="native"
+
+# TTS_PROVIDER="openai"
+# TTS_OPEN_AI_KEY=sk-example
+# TTS_OPEN_AI_VOICE_MODEL=nova
+
+# TTS_PROVIDER="elevenlabs"
+# TTS_ELEVEN_LABS_KEY=
+# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+
 # CLOUD DEPLOYMENT VARIABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # DISABLE_TELEMETRY="false"

diff --git a/frontend/package.json b/frontend/package.json
@@ -28,6 +28,7 @@
     "react-dropzone": "^14.2.3",
     "react-loading-skeleton": "^3.1.0",
     "react-router-dom": "^6.3.0",
+    "react-speech-recognition": "^3.10.0",
     "react-tag-input-component": "^2.0.2",
     "react-toastify": "^9.1.3",
     "react-tooltip": "^5.25.2",

diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
@@ -32,6 +32,9 @@ const GeneralLLMPreference = lazy(
 const GeneralTranscriptionPreference = lazy(
   () => import("@/pages/GeneralSettings/TranscriptionPreference")
 );
+const GeneralAudioPreference = lazy(
+  () => import("@/pages/GeneralSettings/AudioPreference")
+);
 const GeneralEmbeddingPreference = lazy(
   () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
@@ -85,6 +88,10 @@ export default function App() {
                   <AdminRoute Component={GeneralTranscriptionPreference} />
                 }
               />
+              <Route
+                path="/settings/audio-preference"
+                element={<AdminRoute Component={GeneralAudioPreference} />}
+              />
               <Route
                 path="/settings/embedding-preference"
                 element={<AdminRoute Component={GeneralEmbeddingPreference} />}

diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx
@@ -21,6 +21,7 @@ import {
   ClosedCaptioning,
   EyeSlash,
   SplitVertical,
+  Microphone,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@@ -280,6 +281,14 @@ const SidebarOptions = ({ user = null }) => (
       flex={true}
       allowedRole={["admin"]}
     />
+    <Option
+      href={paths.settings.audioPreference()}
+      btnText="Voice and Speech Support"
+      icon={<Microphone className="h-5 w-5 flex-shrink-0" />}
+      user={user}
+      flex={true}
+      allowedRole={["admin"]}
+    />
     <Option
       href={paths.settings.transcriptionPreference()}
       btnText="Transcription Model"

diff --git a/frontend/src/components/SpeechToText/BrowserNative/index.jsx b/frontend/src/components/SpeechToText/BrowserNative/index.jsx
@@ -0,0 +1,9 @@
+export default function BrowserNative() {
+  return (
+    <div className="w-full h-10 items-center flex">
+      <p className="text-sm font-base text-white text-opacity-60">
+        There is no configuration needed for this provider.
+      </p>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/BrowserNative/index.jsx b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx
@@ -0,0 +1,9 @@
+export default function BrowserNative() {
+  return (
+    <div className="w-full h-10 items-center flex">
+      <p className="text-sm font-base text-white text-opacity-60">
+        There is no configuration needed for this provider.
+      </p>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
@@ -0,0 +1,107 @@
+import { useState, useEffect } from "react";
+import System from "@/models/system";
+
+export default function ElevenLabsOptions({ settings }) {
+  const [inputValue, setInputValue] = useState(settings?.TTSElevenLabsKey);
+  const [openAIKey, setOpenAIKey] = useState(settings?.TTSElevenLabsKey);
+
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSElevenLabsKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="ElevenLabs API Key"
+          defaultValue={settings?.TTSElevenLabsKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+          onChange={(e) => setInputValue(e.target.value)}
+          onBlur={() => setOpenAIKey(inputValue)}
+        />
+      </div>
+      {!settings?.credentialsOnly && (
+        <ElevenLabsModelSelection settings={settings} apiKey={openAIKey} />
+      )}
+    </div>
+  );
+}
+
+function ElevenLabsModelSelection({ apiKey, settings }) {
+  const [groupedModels, setGroupedModels] = useState({});
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    async function findCustomModels() {
+      setLoading(true);
+      const { models } = await System.customModels(
+        "elevenlabs-tts",
+        typeof apiKey === "boolean" ? null : apiKey
+      );
+
+      if (models?.length > 0) {
+        const modelsByOrganization = models.reduce((acc, model) => {
+          acc[model.organization] = acc[model.organization] || [];
+          acc[model.organization].push(model);
+          return acc;
+        }, {});
+        setGroupedModels(modelsByOrganization);
+      }
+
+      setLoading(false);
+    }
+    findCustomModels();
+  }, [apiKey]);
+
+  if (loading) {
+    return (
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Chat Model Selection
+        </label>
+        <select
+          name="TTSElevenLabsVoiceModel"
+          disabled={true}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          <option disabled={true} selected={true}>
+            -- loading available models --
+          </option>
+        </select>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex flex-col w-60">
+      <label className="text-white text-sm font-semibold block mb-4">
+        Chat Model Selection
+      </label>
+      <select
+        name="TTSElevenLabsVoiceModel"
+        required={true}
+        className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+      >
+        {Object.keys(groupedModels)
+          .sort()
+          .map((organization) => (
+            <optgroup key={organization} label={organization}>
+              {groupedModels[organization].map((model) => (
+                <option
+                  key={model.id}
+                  value={model.id}
+                  selected={settings?.OpenAiModelPref === model.id}
+                >
+                  {model.name}
+                </option>
+              ))}
+            </optgroup>
+          ))}
+      </select>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
@@ -0,0 +1,45 @@
+function toProperCase(string) {
+  return string.replace(/\w\S*/g, function (txt) {
+    return txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase();
+  });
+}
+
+export default function OpenAiTextToSpeechOptions({ settings }) {
+  const apiKey = settings?.TTSOpenAIKey;
+
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSOpenAIKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="OpenAI API Key"
+          defaultValue={apiKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model
+        </label>
+        <select
+          name="TTSOpenAIVoiceModel"
+          defaultValue={settings?.TTSOpenAIVoiceModel ?? "alloy"}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          {["alloy", "echo", "fable", "onyx", "nova", "shimmer"].map(
+            (voice) => {
+              return <option value={voice}>{toProperCase(voice)}</option>;
+            }
+          )}
+        </select>
+      </div>
+    </div>
+  );
+}
diff --git a/.../WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx b/.../WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
@@ -0,0 +1,94 @@
+import { useEffect, useState, useRef } from "react";
+import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+import Workspace from "@/models/workspace";
+import showToast from "@/utils/toast";
+
+export default function AsyncTTSMessage({ slug, chatId }) {
+  const playerRef = useRef(null);
+  const [speaking, setSpeaking] = useState(false);
+  const [loading, setLoading] = useState(false);
+  const [audioSrc, setAudioSrc] = useState(null);
+
+  function speakMessage() {
+    if (speaking) {
+      playerRef?.current?.pause();
+      return;
+    }
+
+    try {
+      if (!audioSrc) {
+        setLoading(true);
+        Workspace.ttsMessage(slug, chatId)
+          .then((audioBlob) => {
+            if (!audioBlob)
+              throw new Error("Failed to load or play TTS message response.");
+            setAudioSrc(audioBlob);
+          })
+          .catch((e) => showToast(e.message, "error", { clear: true }))
+          .finally(() => setLoading(false));
+      } else {
+        playerRef.current.play();
+      }
+    } catch (e) {
+      console.error(e);
+      setLoading(false);
+      setSpeaking(false);
+    }
+  }
+
+  useEffect(() => {
+    function setupPlayer() {
+      if (!playerRef?.current) return;
+      playerRef.current.addEventListener("play", () => {
+        setSpeaking(true);
+      });
+
+      playerRef.current.addEventListener("pause", () => {
+        playerRef.current.currentTime = 0;
+        setSpeaking(false);
+      });
+    }
+    setupPlayer();
+  }, []);
+
+  if (!chatId) return null;
+  return (
+    <div className="mt-3 relative">
+      <button
+        onClick={speakMessage}
+        data-tooltip-id="message-to-speech"
+        data-tooltip-content={
+          speaking ? "Pause TTS speech of message" : "TTS Speak message"
+        }
+        className="border-none text-zinc-300"
+        aria-label={speaking ? "Pause speech" : "Speak message"}
+      >
+        {speaking ? (
+          <PauseCircle size={18} className="mb-1" />
+        ) : (
+          <>
+            {loading ? (
+              <CircleNotch size={18} className="mb-1 animate-spin" />
+            ) : (
+              <SpeakerHigh size={18} className="mb-1" />
+            )}
+          </>
+        )}
+        <audio
+          ref={playerRef}
+          hidden={true}
+          src={audioSrc}
+          autoPlay={true}
+          controls={false}
+        />
+      </button>
+      <Tooltip
+        id="message-to-speech"
+        place="bottom"
+        delayShow={300}
+        className="tooltip !text-xs"
+      />
+    </div>
+  );
+}
diff --git a/...nts/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/...nts/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -0,0 +1,23 @@
+import { useEffect, useState } from "react";
+import NativeTTSMessage from "./native";
+import AsyncTTSMessage from "./asyncTts";
+import System from "@/models/system";
+
+export default function TTSMessage({ slug, chatId, message }) {
+  const [provider, setProvider] = useState("native");
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    async function getSettings() {
+      const _settings = await System.keys();
+      setProvider(_settings?.TextToSpeechProvider ?? "native");
+      setLoading(false);
+    }
+    getSettings();
+  }, []);
+
+  if (loading) return null;
+  if (provider !== "native")
+    return <AsyncTTSMessage slug={slug} chatId={chatId} />;
+  return <NativeTTSMessage message={message} />;
+}