这是indexloc提供的服务,不要输入任何密码
Skip to content

Jina embedding provider #3960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import React, { useState } from "react";
import { CaretDown, CaretUp } from "@phosphor-icons/react";

export default function JinaOptions({ settings }) {
const [showAdvancedControls, setShowAdvancedControls] = useState(false);
return (
<div className="w-full flex flex-col gap-y-7">
<div className="w-full flex items-center gap-[36px] mt-1.5 flex-wrap">
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
API Key
</label>
<input
type="password"
name="JinaApiKey"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="Jina API Key"
defaultValue={settings?.JinaApiKey ? "*".repeat(20) : ""}
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Model Preference
</label>
<select
name="EmbeddingModelPref"
required={true}
defaultValue={settings?.EmbeddingModelPref}
className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
<optgroup label="Available embedding models">
{[
"jina-embeddings-v3",
"jina-embeddings-v2-base-en",
"jina-embeddings-v2-base-zh",
"jina-embeddings-v2-base-de",
"jina-embeddings-v2-base-es",
"jina-embeddings-v2-base-code",
"jina-clip-v2",
"jina-clip-v1",
].map((model) => {
return (
<option key={model} value={model}>
{model}
</option>
);
})}
</optgroup>
</select>
</div>
</div>
<div className="flex items-center gap-x-3">
<button
type="button"
onClick={() => setShowAdvancedControls(!showAdvancedControls)}
className="flex items-center gap-x-2 text-white text-sm font-semibold"
>
Advanced Settings
{showAdvancedControls ? (
<CaretUp size={16} weight="bold" />
) : (
<CaretDown size={16} weight="bold" />
)}
</button>
</div>
{showAdvancedControls && (
<div className="flex flex-col gap-y-4">
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Task Type
</label>
<input
type="text"
name="JinaTask"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="e.g. retrieval.document"
defaultValue={settings?.JinaTask}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Max Chunk Length
</label>
<input
type="number"
name="EmbeddingModelMaxChunkLength"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="8192"
defaultValue={settings?.EmbeddingModelMaxChunkLength || 8192}
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
</div>
)}
</div>
);
}
Binary file added frontend/src/media/embeddingprovider/jina.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png";
import LiteLLMLogo from "@/media/llmprovider/litellm.png";
import GenericOpenAiLogo from "@/media/llmprovider/generic-openai.png";
import MistralAiLogo from "@/media/llmprovider/mistral.jpeg";
import JinaAiLogo from "@/media/embeddingprovider/jina.png";

import PreLoader from "@/components/Preloader";
import ChangeWarningModal from "@/components/ChangeWarning";
Expand All @@ -29,6 +30,7 @@ import CohereEmbeddingOptions from "@/components/EmbeddingSelection/CohereOption
import VoyageAiOptions from "@/components/EmbeddingSelection/VoyageAiOptions";
import LiteLLMOptions from "@/components/EmbeddingSelection/LiteLLMOptions";
import GenericOpenAiEmbeddingOptions from "@/components/EmbeddingSelection/GenericOpenAiOptions";
import JinaOptions from "@/components/EmbeddingSelection/JinaOptions";

import EmbedderItem from "@/components/EmbeddingSelection/EmbedderItem";
import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
Expand Down Expand Up @@ -127,6 +129,13 @@ const EMBEDDERS = [
),
description: "Run embedding models from any OpenAI compatible API service.",
},
{
name: "Jina AI",
value: "jina",
logo: JinaAiLogo,
options: (settings) => <JinaOptions settings={settings} />,
description: "Run powerful multilingual embedding models from Jina AI.",
},
];

export default function GeneralEmbeddingPreference() {
Expand Down
35 changes: 29 additions & 6 deletions frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png";
import PPIOLogo from "@/media/llmprovider/ppio.png";
import PGVectorLogo from "@/media/vectordbs/pgvector.png";
import DPAISLogo from "@/media/llmprovider/dpais.png";
import JinaAiLogo from "@/media/embeddingprovider/jina.png";
import React, { useState, useEffect } from "react";
import paths from "@/utils/paths";
import { useNavigate } from "react-router-dom";
Expand Down Expand Up @@ -187,6 +188,14 @@ export const LLM_SELECTION_PRIVACY = {
],
logo: GenericOpenAiLogo,
},
jina: {
name: "Jina AI",
description: [
"Your document text is sent to Jina AI's servers for processing",
"Your data is handled according to Jina AI's terms of service and privacy policy",
],
logo: JinaAiLogo,
},
cohere: {
name: "Cohere",
description: [
Expand Down Expand Up @@ -393,6 +402,14 @@ export const EMBEDDING_ENGINE_PRIVACY = {
],
logo: GenericOpenAiLogo,
},
jina: {
name: "Jina AI",
description: [
"Your document text is sent to Jina AI's servers for processing",
"Your data is handled according to Jina AI's terms of service and privacy policy",
],
logo: JinaAiLogo,
},
gemini: {
name: "Google Gemini",
description: [
Expand Down Expand Up @@ -493,8 +510,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
</p>
</div>
<ul className="flex flex-col list-disc ml-4">
{LLMSelection.description.map((desc) => (
<li className="text-theme-text-primary text-sm">{desc}</li>
{LLMSelection.description.map((desc, index) => (
<li key={index} className="text-theme-text-secondary text-sm">
{desc}
</li>
))}
</ul>
</div>
Expand All @@ -513,8 +532,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
</p>
</div>
<ul className="flex flex-col list-disc ml-4">
{EmbeddingEngine.description.map((desc) => (
<li className="text-theme-text-primary text-sm">{desc}</li>
{EmbeddingEngine.description.map((desc, index) => (
<li key={index} className="text-theme-text-secondary text-sm">
{desc}
</li>
))}
</ul>
</div>
Expand All @@ -534,8 +555,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
</p>
</div>
<ul className="flex flex-col list-disc ml-4">
{VectorDb.description.map((desc) => (
<li className="text-theme-text-primary text-sm">{desc}</li>
{VectorDb.description.map((desc, index) => (
<li key={index} className="text-theme-text-secondary text-sm">
{desc}
</li>
))}
</ul>
</div>
Expand Down
4 changes: 3 additions & 1 deletion server/models/systemSettings.js
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,8 @@ const SystemSettings = {
EmbeddingModelMaxChunkLength:
process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
JinaApiKey: !!process.env.JINA_API_KEY,
JinaTask: process.env.JINA_TASK,
GenericOpenAiEmbeddingApiKey:
!!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,
GenericOpenAiEmbeddingMaxConcurrentChunks:
Expand Down Expand Up @@ -336,7 +338,7 @@ const SystemSettings = {
const updatePromises = [];
for (const key of Object.keys(updates)) {
let validatedValue = updates[key];
if (this.validations.hasOwnProperty(key)) {
if (Object.prototype.hasOwnProperty.call(this.validations, key)) {
if (this.validations[key].constructor.name === "AsyncFunction") {
validatedValue = await this.validations[key](updates[key]);
} else {
Expand Down
133 changes: 133 additions & 0 deletions server/utils/EmbeddingEngines/jina/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
const { toChunks, maximumChunkLength } = require("../../helpers");

/**
 * Embedding provider that calls the Jina AI HTTP embeddings endpoint.
 *
 * Configuration is read from the environment when an instance is created:
 * - `JINA_API_KEY`: bearer token sent with every request.
 * - `EMBEDDING_MODEL_PREF`: model name, defaults to "jina-embeddings-v3".
 * - `JINA_TASK`: optional `task` value forwarded in the request body.
 */
class JinaEmbedder {
  constructor() {
    this.basePath = "https://api.jina.ai/v1";
    this.apiKey = process.env.JINA_API_KEY ?? null;
    this.model = process.env.EMBEDDING_MODEL_PREF ?? "jina-embeddings-v3";
    this.task = process.env.JINA_TASK ?? null;
    this.embeddingMaxChunkLength = maximumChunkLength();

    // this.maxConcurrentChunks is delegated to the getter below.
    this.log(`Initialized ${this.model}`, {
      baseURL: this.basePath,
      maxConcurrentChunks: this.maxConcurrentChunks,
      embeddingMaxChunkLength: this.embeddingMaxChunkLength,
    });
  }

  /**
   * Writes a message to stdout with a colored `[JinaEmbedder]` prefix.
   * @param {string} text
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[JinaEmbedder]\x1b[0m ${text}`, ...args);
  }

  /**
   * Maximum number of text chunks sent in a single request.
   *
   * Reads `GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS` (there is no
   * Jina-specific variable; this class reuses the generic OpenAI one) and
   * returns it as a whole number. Falls back to 500 when the variable is
   * unset, not numeric, or less than 1, since a zero or negative batch size
   * cannot produce usable batches.
   * @returns {number}
   */
  get maxConcurrentChunks() {
    const configured = Number(
      process.env.GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS
    );
    return Number.isFinite(configured) && configured >= 1
      ? Math.floor(configured)
      : 500;
  }

  /**
   * Embeds a single input and returns its vector.
   * @param {string|string[]} textInput - A string, or an array of which only
   *   the first item's embedding is returned.
   * @returns {Promise<number[]>} The first embedding, or `[]` if none came back.
   * @throws {Error} When the underlying `embedChunks` call fails.
   */
  async embedTextInput(textInput) {
    const result = await this.embedChunks(
      Array.isArray(textInput) ? textInput : [textInput]
    );
    return result?.[0] || [];
  }

  /**
   * Sends one batch of inputs to `${basePath}/embeddings`.
   *
   * Never rejects: failures are reported through the `error` field so the
   * caller can collect errors from all batches before deciding to abort.
   * @param {string[]} batch
   * @returns {Promise<{data: any[], error: null|{type: string|number, message: string}}>}
   */
  async #embedBatch(batch) {
    try {
      // A plain fetch is used instead of an SDK client.
      const response = await fetch(`${this.basePath}/embeddings`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${this.apiKey}`,
        },
        body: JSON.stringify({
          model: this.model,
          input: batch,
          ...(this.task ? { task: this.task } : {}),
        }),
      });

      if (!response.ok) {
        // Error bodies are not guaranteed to be JSON (e.g. a gateway error
        // page); a parse failure must not hide the HTTP status.
        const body = await response.json().catch(() => null);
        // NOTE(review): assumes error bodies may carry the reason in a
        // top-level `detail` field rather than `error.message` - confirm
        // against the Jina API reference.
        const detail =
          typeof body?.detail === "string"
            ? body.detail
            : body?.detail
              ? JSON.stringify(body.detail)
              : null;
        return {
          data: [],
          error: {
            type: body?.error?.code || response.status || "failed_to_embed",
            message:
              body?.error?.message ||
              detail ||
              response.statusText ||
              "Failed to embed text",
          },
        };
      }

      const result = await response.json();
      return { data: result?.data, error: null };
    } catch (e) {
      return {
        data: [],
        error: {
          type: e?.type || "failed_to_embed",
          message: e?.message || "Failed to embed text",
        },
      };
    }
  }

  /**
   * Embeds many text chunks, batching them by `maxConcurrentChunks` and
   * issuing all batch requests concurrently.
   * @param {string[]} textChunks
   * @returns {Promise<number[][]|null>} One vector per returned item in batch
   *   order, or `null` when nothing was returned or any returned item lacks
   *   an `embedding` property.
   * @throws {Error} When no API key is configured, or when any batch fails
   *   (the message lists each distinct `[type]: message`).
   */
  async embedChunks(textChunks = []) {
    const batches = toChunks(textChunks, this.maxConcurrentChunks);
    if (batches.length === 0) return null;

    // Fail fast with an actionable message instead of sending "Bearer null".
    if (!this.apiKey) throw new Error("No Jina API key was set.");

    const results = await Promise.all(
      batches.map((batch) => this.#embedBatch(batch))
    );

    // If any batch failed, abort the entire sequence because the embeddings
    // would be incomplete.
    const failures = results
      .filter((res) => !!res.error)
      .map((res) => res.error);
    if (failures.length > 0) {
      const uniqueErrors = new Set(
        failures.map(({ type, message }) => `[${type}]: ${message}`)
      );
      throw new Error(
        `Jina Failed to embed: ${Array.from(uniqueErrors).join(", ")}`
      );
    }

    const data = results.flatMap((res) => res?.data || []);
    const isComplete =
      data.length > 0 &&
      data.every((embd) =>
        Object.prototype.hasOwnProperty.call(embd, "embedding")
      );
    return isComplete ? data.map((embd) => embd.embedding) : null;
  }
}

module.exports = {
JinaEmbedder,
};
3 changes: 3 additions & 0 deletions server/utils/helpers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,9 @@ function getEmbeddingEngineSelection() {
case "gemini":
const { GeminiEmbedder } = require("../EmbeddingEngines/gemini");
return new GeminiEmbedder();
case "jina":
const { JinaEmbedder } = require("../EmbeddingEngines/jina");
return new JinaEmbedder();
default:
return new NativeEmbedder();
}
Expand Down
11 changes: 11 additions & 0 deletions server/utils/helpers/updateENV.js
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,16 @@ const KEY_MAPPING = {
checks: [isNotEmpty],
},

// Jina Embedding Settings
JinaApiKey: {
envKey: "JINA_API_KEY",
checks: [isNotEmpty],
},
JinaTask: {
envKey: "JINA_TASK",
checks: [],
},

// Generic OpenAI Embedding Settings
GenericOpenAiEmbeddingApiKey: {
envKey: "GENERIC_OPEN_AI_EMBEDDING_API_KEY",
Expand Down Expand Up @@ -817,6 +827,7 @@ function supportedEmbeddingModel(input = "") {
"litellm",
"generic-openai",
"mistral",
"jina",
];
return supported.includes(input)
? null
Expand Down