diff --git a/frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx b/frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx new file mode 100644 index 00000000000..703e3fa4165 --- /dev/null +++ b/frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx @@ -0,0 +1,104 @@ +import React, { useState } from "react"; +import { CaretDown, CaretUp } from "@phosphor-icons/react"; + +export default function JinaOptions({ settings }) { + const [showAdvancedControls, setShowAdvancedControls] = useState(false); + return ( +
+
+
+ + +
+
+ + +
+
+
+ +
+ {showAdvancedControls && ( +
+
+ + +
+
+ + +
+
+ )} +
+ ); +} diff --git a/frontend/src/media/embeddingprovider/jina.png b/frontend/src/media/embeddingprovider/jina.png new file mode 100644 index 00000000000..df6abef4f27 Binary files /dev/null and b/frontend/src/media/embeddingprovider/jina.png differ diff --git a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx index de27acb80a2..95883e907ed 100644 --- a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx +++ b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx @@ -15,6 +15,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png"; import LiteLLMLogo from "@/media/llmprovider/litellm.png"; import GenericOpenAiLogo from "@/media/llmprovider/generic-openai.png"; import MistralAiLogo from "@/media/llmprovider/mistral.jpeg"; +import JinaAiLogo from "@/media/embeddingprovider/jina.png"; import PreLoader from "@/components/Preloader"; import ChangeWarningModal from "@/components/ChangeWarning"; @@ -29,6 +30,7 @@ import CohereEmbeddingOptions from "@/components/EmbeddingSelection/CohereOption import VoyageAiOptions from "@/components/EmbeddingSelection/VoyageAiOptions"; import LiteLLMOptions from "@/components/EmbeddingSelection/LiteLLMOptions"; import GenericOpenAiEmbeddingOptions from "@/components/EmbeddingSelection/GenericOpenAiOptions"; +import JinaOptions from "@/components/EmbeddingSelection/JinaOptions"; import EmbedderItem from "@/components/EmbeddingSelection/EmbedderItem"; import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react"; @@ -127,6 +129,13 @@ const EMBEDDERS = [ ), description: "Run embedding models from any OpenAI compatible API service.", }, + { + name: "Jina AI", + value: "jina", + logo: JinaAiLogo, + options: (settings) => , + description: "Run powerful multilingual embedding models from Jina AI.", + }, ]; export default function GeneralEmbeddingPreference() { diff --git 
a/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx b/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx index bc48209da9c..2976e89ba50 100644 --- a/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx +++ b/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx @@ -38,6 +38,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png"; import PPIOLogo from "@/media/llmprovider/ppio.png"; import PGVectorLogo from "@/media/vectordbs/pgvector.png"; import DPAISLogo from "@/media/llmprovider/dpais.png"; +import JinaAiLogo from "@/media/embeddingprovider/jina.png"; import React, { useState, useEffect } from "react"; import paths from "@/utils/paths"; import { useNavigate } from "react-router-dom"; @@ -187,6 +188,14 @@ export const LLM_SELECTION_PRIVACY = { ], logo: GenericOpenAiLogo, }, + jina: { + name: "Jina AI", + description: [ + "Your document text is sent to Jina AI's servers for processing", + "Your data is handled according to Jina AI's terms of service and privacy policy", + ], + logo: JinaAiLogo, + }, cohere: { name: "Cohere", description: [ @@ -393,6 +402,14 @@ export const EMBEDDING_ENGINE_PRIVACY = { ], logo: GenericOpenAiLogo, }, + jina: { + name: "Jina AI", + description: [ + "Your document text is sent to Jina AI's servers for processing", + "Your data is handled according to Jina AI's terms of service and privacy policy", + ], + logo: JinaAiLogo, + }, gemini: { name: "Google Gemini", description: [ @@ -493,8 +510,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {

@@ -513,8 +532,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {

@@ -534,8 +555,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {

diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js index 89e397ea64e..ef952531caa 100644 --- a/server/models/systemSettings.js +++ b/server/models/systemSettings.js @@ -211,6 +211,8 @@ const SystemSettings = { EmbeddingModelMaxChunkLength: process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH, VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY, + JinaApiKey: !!process.env.JINA_API_KEY, + JinaTask: process.env.JINA_TASK, GenericOpenAiEmbeddingApiKey: !!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY, GenericOpenAiEmbeddingMaxConcurrentChunks: @@ -336,7 +338,7 @@ const SystemSettings = { const updatePromises = []; for (const key of Object.keys(updates)) { let validatedValue = updates[key]; - if (this.validations.hasOwnProperty(key)) { + if (Object.prototype.hasOwnProperty.call(this.validations, key)) { if (this.validations[key].constructor.name === "AsyncFunction") { validatedValue = await this.validations[key](updates[key]); } else { diff --git a/server/utils/EmbeddingEngines/jina/index.js b/server/utils/EmbeddingEngines/jina/index.js new file mode 100644 index 00000000000..0f95f9abbcc --- /dev/null +++ b/server/utils/EmbeddingEngines/jina/index.js @@ -0,0 +1,133 @@ +const { toChunks, maximumChunkLength } = require("../../helpers"); + +class JinaEmbedder { + constructor() { + this.basePath = "https://api.jina.ai/v1"; + this.apiKey = process.env.JINA_API_KEY ?? null; + this.model = process.env.EMBEDDING_MODEL_PREF ?? "jina-embeddings-v3"; + this.task = process.env.JINA_TASK ?? null; + this.embeddingMaxChunkLength = maximumChunkLength(); + + // this.maxConcurrentChunks is delegated to the getter below. 
+ // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength + this.log(`Initialized ${this.model}`, { + baseURL: this.basePath, + maxConcurrentChunks: this.maxConcurrentChunks, + embeddingMaxChunkLength: this.embeddingMaxChunkLength, + }); + } + + log(text, ...args) { + console.log(`\x1b[36m[JinaEmbedder]\x1b[0m ${text}`, ...args); + } + + /** + * returns the `GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS` env variable as a number + * or 500 if the env variable is not set or is not a number. + * @returns {number} + */ + get maxConcurrentChunks() { + if (!process.env.GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS) + return 500; + if ( + isNaN(Number(process.env.GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS)) + ) + return 500; + return Number(process.env.GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS); + } + + async embedTextInput(textInput) { + const result = await this.embedChunks( + Array.isArray(textInput) ? textInput : [textInput] + ); + return result?.[0] || []; + } + + async embedChunks(textChunks = []) { + // Because there is a hard POST limit on how many chunks can be sent at once to the Jina API (~8mb) + // we concurrently execute each max batch of text chunks possible. + // Refer to constructor maxConcurrentChunks for more info. + const embeddingRequests = []; + for (const chunk of toChunks(textChunks, this.maxConcurrentChunks)) { + embeddingRequests.push( + new Promise((resolve) => { + (async () => { + // We are using a fetch request here because the current openai library + // does not support the Jina API + try { + const response = await fetch(`${this.basePath}/embeddings`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify({ + model: this.model, + input: chunk, + ...(this.task ?
{ task: this.task } : {}), + }), + }); + + if (!response.ok) { + const error = await response.json(); + throw { + type: error?.error?.code || response.status, + message: error?.error?.message || response.statusText, + }; + } + + const result = await response.json(); + resolve({ data: result?.data, error: null }); + } catch (e) { + resolve({ + data: [], + error: { + type: e?.type || "failed_to_embed", + message: e?.message || "Failed to embed text", + }, + }); + } + })(); + }) + ); + } + + const { data = [], error = null } = await Promise.all( + embeddingRequests + ).then((results) => { + // If any errors were returned from Jina abort the entire sequence because the embeddings + // will be incomplete. + const errors = results + .filter((res) => !!res.error) + .map((res) => res.error) + .flat(); + if (errors.length > 0) { + let uniqueErrors = new Set(); + errors.map((error) => + uniqueErrors.add(`[${error.type}]: ${error.message}`) + ); + + return { + data: [], + error: Array.from(uniqueErrors).join(", "), + }; + } + return { + data: results.map((res) => res?.data || []).flat(), + error: null, + }; + }); + + if (!!error) throw new Error(`Jina Failed to embed: ${error}`); + return data.length > 0 && + data.every((embd) => + Object.prototype.hasOwnProperty.call(embd, "embedding") + ) + ?
data.map((embd) => embd.embedding) + : null; + } +} + +module.exports = { + JinaEmbedder, +}; diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index a069b0dd3ca..dbcbe17f0d4 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -260,6 +260,9 @@ function getEmbeddingEngineSelection() { case "gemini": const { GeminiEmbedder } = require("../EmbeddingEngines/gemini"); return new GeminiEmbedder(); + case "jina": + const { JinaEmbedder } = require("../EmbeddingEngines/jina"); + return new JinaEmbedder(); default: return new NativeEmbedder(); } diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index b69c96417f6..4157d2f988e 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -301,6 +301,16 @@ const KEY_MAPPING = { checks: [isNotEmpty], }, + // Jina Embedding Settings + JinaApiKey: { + envKey: "JINA_API_KEY", + checks: [isNotEmpty], + }, + JinaTask: { + envKey: "JINA_TASK", + checks: [], + }, + // Generic OpenAI Embedding Settings GenericOpenAiEmbeddingApiKey: { envKey: "GENERIC_OPEN_AI_EMBEDDING_API_KEY", @@ -817,6 +827,7 @@ function supportedEmbeddingModel(input = "") { "litellm", "generic-openai", "mistral", + "jina", ]; return supported.includes(input) ? null