@@ -23,11 +23,18 @@ export default function OllamaEmbeddingOptions({ settings }) {
const [maxChunkLength, setMaxChunkLength] = useState(
settings?.EmbeddingModelMaxChunkLength || 8192
);
const [batchSize, setBatchSize] = useState(
settings?.OllamaEmbeddingBatchSize || 1
);

const handleMaxChunkLengthChange = (e) => {
setMaxChunkLength(Number(e.target.value));
};

const handleBatchSizeChange = (e) => {
setBatchSize(Number(e.target.value));
};

return (
<div className="w-full flex flex-col gap-y-7">
<div className="w-full flex items-start gap-[36px] mt-1.5">
@@ -74,7 +81,7 @@ export default function OllamaEmbeddingOptions({ settings }) {
}}
className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
>
{showAdvancedControls ? "Hide" : "Show"} Manual Endpoint Input
{showAdvancedControls ? "Hide" : "Show"} Advanced Settings
{showAdvancedControls ? (
<CaretUp size={14} className="ml-1" />
) : (
@@ -121,6 +128,41 @@ export default function OllamaEmbeddingOptions({ settings }) {
Enter the URL where Ollama is running.
</p>
</div>
<div className="flex flex-col w-60">
<div
data-tooltip-place="top"
data-tooltip-id="ollama-batch-size-tooltip"
className="flex gap-x-1 items-center mb-3"
>
<Info
size={16}
className="text-theme-text-secondary cursor-pointer"
/>
<label className="text-white text-sm font-semibold block">
Embedding batch size
</label>
<Tooltip id="ollama-batch-size-tooltip">
Number of text chunks to embed in parallel. Higher values
improve speed but use more memory. Default is 1.
</Tooltip>
</div>
<input
type="number"
name="OllamaEmbeddingBatchSize"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="1"
min={1}
value={batchSize}
onChange={handleBatchSizeChange}
onScroll={(e) => e.target.blur()}
required={true}
autoComplete="off"
/>
<p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
Increase this value to process multiple chunks simultaneously for
faster embedding.
</p>
</div>
</div>
</div>
</div>
1 change: 1 addition & 0 deletions server/models/systemSettings.js
@@ -221,6 +221,7 @@ const SystemSettings = {
: process.env.EMBEDDING_MODEL_PREF,
EmbeddingModelMaxChunkLength:
process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
OllamaEmbeddingBatchSize: process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1,
VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
GenericOpenAiEmbeddingApiKey:
!!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,
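A note on the fallback above: environment variables are always strings, so when OLLAMA_EMBEDDING_BATCH_SIZE is set this field surfaces as a string (for example "4"), and only the unset fallback is the number 1. Both consumers normalize it with Number() where the value is actually used, the embedder when reading the env var and the settings UI in its change handler. A small illustration with hypothetical values:

// Hypothetical values, only to illustrate the string/number distinction.
process.env.OLLAMA_EMBEDDING_BATCH_SIZE = "4";
const whenSet = process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1; // "4" (string)

delete process.env.OLLAMA_EMBEDDING_BATCH_SIZE;
const whenUnset = process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1; // 1 (number)

// Callers convert before using the value arithmetically.
console.log(Number(whenSet), whenUnset); // 4 1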
45 changes: 31 additions & 14 deletions server/utils/EmbeddingEngines/ollama/index.js
@@ -11,12 +11,13 @@ class OllamaEmbedder {
this.className = "OllamaEmbedder";
this.basePath = process.env.EMBEDDING_BASE_PATH;
this.model = process.env.EMBEDDING_MODEL_PREF;
// Limit of how many strings we can process in a single pass to stay within resource or network limits
this.maxConcurrentChunks = 1;
this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
: 1;
this.embeddingMaxChunkLength = maximumChunkLength();
this.client = new Ollama({ host: this.basePath });
this.log(
`initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
`initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
);
}

@@ -46,14 +47,14 @@ class OllamaEmbedder {

/**
* This function takes an array of text chunks and embeds them using the Ollama API.
* chunks are processed sequentially to avoid overwhelming the API with too many requests
* or running out of resources on the endpoint running the ollama instance.
* Chunks are processed in batches based on the maxConcurrentChunks setting to balance
* resource usage on the Ollama endpoint.
*
* We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
* so that the maximum context window is used and content is not truncated.
*
* We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
* on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
* on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
* constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
* @param {string[]} textChunks - An array of text chunks to embed.
* @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
@@ -64,29 +65,45 @@
`Ollama service could not be reached. Is Ollama running?`
);
this.log(
`Embedding ${textChunks.length} chunks of text with ${this.model}.`
`Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
);

let data = [];
let error = null;

for (const chunk of textChunks) {
// Process chunks in batches based on maxConcurrentChunks
const totalBatches = Math.ceil(
textChunks.length / this.maxConcurrentChunks
);
let currentBatch = 0;

for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
currentBatch++;

try {
const res = await this.client.embeddings({
// Use input param instead of prompt param to support batch processing
const res = await this.client.embed({
model: this.model,
prompt: chunk,
input: batch.length === 1 ? batch[0] : batch,
options: {
// Always set the num_ctx to the max chunk length defined by the user in the settings
// so that the maximum context window is used and content is not truncated.
num_ctx: this.embeddingMaxChunkLength,
},
});

const { embedding } = res;
if (!Array.isArray(embedding) || embedding.length === 0)
throw new Error("Ollama returned an empty embedding for chunk!");
const { embeddings } = res;
if (!Array.isArray(embeddings) || embeddings.length === 0)
throw new Error("Ollama returned empty embeddings for batch!");

data.push(embedding);
// Using prompt param in embed() would return a single embedding (number[])
// but input param returns an array of embeddings (number[][]) for batch processing.
// This is why we spread the embeddings array into the data array.
data.push(...embeddings);
this.log(
`Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
);
} catch (err) {
this.log(err.message);
error = err.message;
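Stripped of the class plumbing, the batching change above comes down to two things: slicing the chunk list into windows of maxConcurrentChunks, and switching from the older embeddings() call (one prompt, one vector) to embed(), whose input parameter accepts a single string or an array and whose response carries embeddings as an array of vectors. A minimal standalone sketch of that loop; the host, model, and batch size values are illustrative, not taken from the diff:

import { Ollama } from "ollama";

// Illustrative values; in the embedder these come from env settings.
const client = new Ollama({ host: "http://127.0.0.1:11434" });
const model = "nomic-embed-text";
const batchSize = 4;
const numCtx = 8192;

async function embedChunks(textChunks) {
  const data = [];
  for (let i = 0; i < textChunks.length; i += batchSize) {
    const batch = textChunks.slice(i, i + batchSize);
    // embed() with input returns { embeddings: number[][] }, one vector per input string,
    // whereas the older embeddings() with prompt returned a single { embedding: number[] }.
    const res = await client.embed({
      model,
      input: batch.length === 1 ? batch[0] : batch,
      options: { num_ctx: numCtx },
    });
    data.push(...res.embeddings);
  }
  return data; // number[][], in the same order as textChunks
}

// e.g. const vectors = await embedChunks(["first chunk", "second chunk", "third chunk"]);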
4 changes: 4 additions & 0 deletions server/utils/helpers/updateENV.js
@@ -294,6 +294,10 @@ const KEY_MAPPING = {
envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
checks: [nonZero],
},
OllamaEmbeddingBatchSize: {
envKey: "OLLAMA_EMBEDDING_BATCH_SIZE",
checks: [nonZero],
},

// Gemini Embedding Settings
GeminiEmbeddingApiKey: {
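The updateENV.js helpers that consume this mapping are not part of the diff, so the sketch below is only a hedged illustration of how an entry with a nonZero check is typically applied: the label coming from the settings form is looked up in KEY_MAPPING, the checks run against the submitted value, and on success the value is written to the mapped environment key. Apart from OllamaEmbeddingBatchSize, OLLAMA_EMBEDDING_BATCH_SIZE, and nonZero, every name here is hypothetical, and the real check's return shape may differ:

// Hypothetical sketch; the actual updateENV.js implementation is not shown in this diff.
const nonZero = (value) =>
  Number(value) <= 0 ? "Value must be greater than zero." : null;

const KEY_MAPPING = {
  OllamaEmbeddingBatchSize: {
    envKey: "OLLAMA_EMBEDDING_BATCH_SIZE",
    checks: [nonZero],
  },
};

function applySetting(label, value) {
  const mapping = KEY_MAPPING[label];
  if (!mapping) return { error: `Unknown setting: ${label}` };
  const error = mapping.checks.map((check) => check(value)).find(Boolean) || null;
  if (error) return { error };
  process.env[mapping.envKey] = String(value); // the real helper also persists this to .env
  return { error: null };
}

// applySetting("OllamaEmbeddingBatchSize", 4) -> { error: null }
// applySetting("OllamaEmbeddingBatchSize", 0) -> { error: "Value must be greater than zero." }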