diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml index f9aca9fee9b..583eeff7a5b 100644 --- a/.github/workflows/dev-build.yaml +++ b/.github/workflows/dev-build.yaml @@ -6,7 +6,7 @@ concurrency: on: push: - branches: ['upload-ui-ux'] # put your current branch to create a build. Core team only. + branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only. paths-ignore: - '**.md' - 'cloud-deployments/*' diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index 84589197749..8f7560fb6de 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -5,6 +5,7 @@ const { const { writeToServerDocuments } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); +const RuntimeSettings = require("../../utils/runtimeSettings"); /** * Scrape a generic URL and return the content in the specified format @@ -106,10 +107,12 @@ function validatedHeaders(headers = {}) { async function getPageContent({ link, captureAs = "text", headers = {} }) { try { let pageContents = []; + const runtimeSettings = new RuntimeSettings(); const loader = new PuppeteerWebBaseLoader(link, { launchOptions: { headless: "new", ignoreHTTPSErrors: true, + args: runtimeSettings.get("browserLaunchArgs"), }, gotoOptions: { waitUntil: "networkidle2", diff --git a/collector/utils/runtimeSettings/index.js b/collector/utils/runtimeSettings/index.js index 1d15fdc44e9..da60a123432 100644 --- a/collector/utils/runtimeSettings/index.js +++ b/collector/utils/runtimeSettings/index.js @@ -27,6 +27,16 @@ class RuntimeSettings { // Value must be explicitly "true" or "false" as a string validate: (value) => String(value) === "true", }, + browserLaunchArgs: { + default: [], + validate: (value) => { + let args = []; + if (Array.isArray(value)) args = value.map((arg) => String(arg.trim())); + if 
(typeof value === "string") + args = value.split(",").map((arg) => arg.trim()); + return args; + }, + }, }; constructor() { diff --git a/docker/.env.example b/docker/.env.example index e93a6c9949f..f0fe46d1365 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -363,4 +363,9 @@ GID='1000' # Specify the target languages for when using OCR to parse images and PDFs. # This is a comma separated list of language codes as a string. Unsupported languages will be ignored. # Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. -# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol \ No newline at end of file +# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol + +# Comma-separated runtime flags for the built-in Puppeteer Chromium instance. +# This is only required if you run AnythingLLM via Docker on a Linux machine +# and do not want to use the --cap-add=SYS_ADMIN docker argument. +# ANYTHINGLLM_CHROMIUM_ARGS="--no-sandbox,--disable-setuid-sandbox" \ No newline at end of file diff --git a/server/.env.example b/server/.env.example index 24453045f52..e1f5ebfdd94 100644 --- a/server/.env.example +++ b/server/.env.example @@ -362,3 +362,8 @@ TTS_PROVIDER="native" # This is a comma separated list of language codes as a string. Unsupported languages will be ignored. # Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. 
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol,ita,spa,fra,por,rus,nld,tur,hun,pol + +# Comma-separated runtime flags for the built-in Puppeteer Chromium instance. +# This is only required if you run AnythingLLM via Docker on a Linux machine +# and do not want to use the --cap-add=SYS_ADMIN docker argument. +# ANYTHINGLLM_CHROMIUM_ARGS="--no-sandbox,--disable-setuid-sandbox" \ No newline at end of file diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index ef56f0c9255..5dcabe10a1e 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -38,6 +38,7 @@ class CollectorApi { }, runtimeSettings: { allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false", + browserLaunchArgs: process.env.ANYTHINGLLM_CHROMIUM_ARGS ?? [], }, }; } diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index d570e94a87d..9032237833e 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -1167,6 +1167,9 @@ function dumpENV() { // Allow disabling of streaming for generic openai "GENERIC_OPENAI_STREAMING_DISABLED", + + // Specify Chromium args for collector + "ANYTHINGLLM_CHROMIUM_ARGS", ]; // Simple sanitization of each value to prevent ENV injection via newline or quote escaping.