diff --git a/docker/.env.example b/docker/.env.example
index 27fa1c013c6..50dfbd41f73 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -27,6 +27,7 @@ GID='1000'
 # LLM_PROVIDER='anthropic'
 # ANTHROPIC_API_KEY=sk-ant-xxxx
 # ANTHROPIC_MODEL_PREF='claude-2'
+# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

 # LLM_PROVIDER='lmstudio'
 # LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
diff --git a/server/.env.example b/server/.env.example
index 3dc0bd596c4..70d25eb4fb9 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
 # LLM_PROVIDER='anthropic'
 # ANTHROPIC_API_KEY=sk-ant-xxxx
 # ANTHROPIC_MODEL_PREF='claude-2'
+# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.

 # LLM_PROVIDER='lmstudio'
 # LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
diff --git a/server/utils/AiProviders/anthropic/index.js b/server/utils/AiProviders/anthropic/index.js
index 2170ba83f71..0b9f0df613b 100644
--- a/server/utils/AiProviders/anthropic/index.js
+++ b/server/utils/AiProviders/anthropic/index.js
@@ -34,6 +34,9 @@ class AnthropicLLM {

     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
+    this.cacheControl = this.#parseCacheControl(
+      process.env.ANTHROPIC_CACHE_CONTROL
+    );
     this.log(`Initialized with ${this.model}`);
   }

@@ -57,6 +60,62 @@ class AnthropicLLM {
     return true;
   }

+  /**
+   * Parses the cache control ENV variable
+   * @param {string} value - The ENV value (5m or 1h)
+   * @returns {null|object} Cache control configuration
+   */
+  #parseCacheControl(value) {
+    if (!value) return null;
+    const normalized = value.toLowerCase().trim();
+    if (normalized === "5m" || normalized === "1h") {
+      return { type: "ephemeral", ttl: normalized };
+    }
+    return null;
+  }
+
+  /**
+   * Checks if content meets minimum requirements for caching
+   * Per Anthropic docs: minimum 1024 tokens
+   *
+   * Certain models (Haiku 3.5 and Haiku 3) have a 2048-token minimum, but in
+   * testing, 1024-token content can be passed without errors; Anthropic simply
+   * ignores the cache_control until the content meets that model's 2048-token minimum.
+   * https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
+   * @param {string} content - The content to check
+   * @returns {boolean}
+   */
+  #shouldCache(content) {
+    if (!this.cacheControl || !content) return false;
+    // Rough token estimate: ~4 chars per token
+    // Minimum 1024 tokens = ~4096 characters
+    const estimatedTokens = content.length / 4;
+    return estimatedTokens >= 1024;
+  }
+
+  /**
+   * Builds system parameter with cache control if applicable
+   * @param {string} systemContent - The system prompt content
+   * @returns {string|array} System parameter for API call
+   */
+  #buildSystemWithCache(systemContent) {
+    if (!systemContent) return systemContent;
+
+    // If caching is enabled and content is large enough,
+    // apply cache control
+    if (this.#shouldCache(systemContent)) {
+      return [
+        {
+          type: "text",
+          text: systemContent,
+          cache_control: this.cacheControl,
+        },
+      ];
+    }
+
+    return systemContent;
+  }
+
   /**
    * Generates appropriate content array for a message + attachments.
    * @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
@@ -105,11 +164,12 @@ class AnthropicLLM {

   async getChatCompletion(messages = null, { temperature = 0.7 }) {
     try {
+      const systemContent = messages[0].content;
       const result = await LLMPerformanceMonitor.measureAsyncFunction(
         this.anthropic.messages.create({
           model: this.model,
           max_tokens: 4096,
-          system: messages[0].content, // Strip out the system message
+          system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
           messages: messages.slice(1), // Pop off the system message
           temperature: Number(temperature ?? this.defaultTemp),
         })
@@ -117,6 +177,7 @@ class AnthropicLLM {

       const promptTokens = result.output.usage.input_tokens;
       const completionTokens = result.output.usage.output_tokens;
+
       return {
         textResponse: result.output.content[0].text,
         metrics: {
@@ -134,11 +195,12 @@ class AnthropicLLM {
   }

   async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
+    const systemContent = messages[0].content;
     const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
       this.anthropic.messages.stream({
         model: this.model,
         max_tokens: 4096,
-        system: messages[0].content, // Strip out the system message
+        system: this.#buildSystemWithCache(systemContent), // Apply cache control if enabled
         messages: messages.slice(1), // Pop off the system message
         temperature: Number(temperature ?? this.defaultTemp),
       }),
diff --git a/server/utils/agents/aibitat/providers/anthropic.js b/server/utils/agents/aibitat/providers/anthropic.js
index 815ac88ed01..e81cd16cf8f 100644
--- a/server/utils/agents/aibitat/providers/anthropic.js
+++ b/server/utils/agents/aibitat/providers/anthropic.js
@@ -25,12 +25,72 @@ class AnthropicProvider extends Provider {
     super(client);

     this.model = model;
+    this.cacheControl = this.#parseCacheControl(
+      process.env.ANTHROPIC_CACHE_CONTROL
+    );
   }

   get supportsAgentStreaming() {
     return true;
   }

+  /**
+   * Parses the cache control ENV variable
+   * @param {string} value - The ENV value (5m or 1h)
+   * @returns {null|object} Cache control configuration
+   */
+  #parseCacheControl(value) {
+    if (!value) return null;
+    const normalized = value.toLowerCase().trim();
+    if (normalized === "5m" || normalized === "1h") {
+      return { type: "ephemeral", ttl: normalized };
+    }
+    return null;
+  }
+
+  /**
+   * Checks if content meets minimum requirements for caching
+   * Per Anthropic docs: minimum 1024 tokens
+   *
+   * Certain models (Haiku 3.5 and Haiku 3) have a 2048-token minimum, but in
+   * testing, 1024-token content can be passed without errors; Anthropic simply
+   * ignores the cache_control until the content meets that model's 2048-token minimum.
+   * https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
+   * @param {string} content - The content to check
+   * @returns {boolean}
+   */
+  #shouldCache(content) {
+    if (!this.cacheControl || !content) return false;
+    // Rough token estimate: ~4 chars per token
+    // Minimum 1024 tokens = ~4096 characters
+    const estimatedTokens = content.length / 4;
+    return estimatedTokens >= 1024;
+  }
+
+  /**
+   * Builds system parameter with cache control if applicable
+   * @param {string} systemContent - The system prompt content
+   * @returns {string|array} System parameter for API call
+   */
+  #buildSystemWithCache(systemContent) {
+    if (!systemContent) return systemContent;
+
+    // If caching is enabled and content is large enough,
+    // apply cache control
+    if (this.#shouldCache(systemContent)) {
+      return [
+        {
+          type: "text",
+          text: systemContent,
+          cache_control: this.cacheControl,
+        },
+      ];
+    }
+
+    // Otherwise, return as plain string (no caching)
+    return systemContent;
+  }
+
   #prepareMessages(messages = []) {
     // Extract system prompt and filter out any system messages from the main chat.
     let systemPrompt =
@@ -149,7 +209,7 @@ class AnthropicProvider extends Provider {
       {
         model: this.model,
         max_tokens: 4096,
-        system: systemPrompt,
+        system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
         messages: chats,
         stream: true,
         ...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +336,7 @@ class AnthropicProvider extends Provider {
       {
         model: this.model,
         max_tokens: 4096,
-        system: systemPrompt,
+        system: this.#buildSystemWithCache(systemPrompt), // Apply cache control if enabled
         messages: chats,
         stream: false,
         ...(Array.isArray(functions) && functions?.length > 0
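
For reference, here is a minimal standalone sketch of the request shape the patched providers send once `ANTHROPIC_CACHE_CONTROL="5m"` is set and the system prompt clears the ~1024-token estimate. The model name, the padded placeholder prompt, and the script scaffolding are illustrative assumptions rather than code from this patch; only the `system` array and its `cache_control` block mirror what `#buildSystemWithCache` returns.

```js
// Sketch only: reproduces the payload shape #buildSystemWithCache emits when
// caching applies. Assumes ANTHROPIC_API_KEY is set in the environment; the
// model name and prompt text below are placeholders.
const Anthropic = require("@anthropic-ai/sdk");

async function main() {
  const anthropic = new Anthropic(); // picks up ANTHROPIC_API_KEY from the env
  // Padded prompt so the ~4 chars/token estimate clears the 1024-token minimum.
  const systemPrompt = "You are a helpful assistant for AnythingLLM. ".repeat(150);

  const response = await anthropic.messages.create({
    model: "claude-3-5-sonnet-latest", // placeholder model
    max_tokens: 1024,
    // With caching enabled, the system prompt goes out as a content block that
    // carries cache_control instead of a plain string:
    system: [
      {
        type: "text",
        text: systemPrompt,
        cache_control: { type: "ephemeral", ttl: "5m" },
      },
    ],
    messages: [{ role: "user", content: "Say hello." }],
  });

  // Anthropic reports caching activity in the usage block: the first call
  // populates cache_creation_input_tokens, and repeat calls within the TTL
  // populate cache_read_input_tokens instead.
  console.log(response.usage);
}

main().catch(console.error);
```

Cache reads are billed at a reduced input-token rate and skip reprocessing of the cached prefix, which is where the cost and latency savings mentioned in the `.env` comment come from; checking the `usage` fields above is a quick way to confirm the cache is actually being hit.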