θΏ™ζ˜―indexlocζδΎ›ηš„ζœεŠ‘οΌŒδΈθ¦θΎ“ε…₯任何密码
Skip to content

Allow custom headers in upload-link endpoint #3695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ app.post(
"/process-link",
[verifyPayloadIntegrity],
async function (request, response) {
const { link } = reqBody(request);
const { link, scraperHeaders = {} } = reqBody(request);
try {
const { success, reason, documents = [] } = await processLink(link);
const {
success,
reason,
documents = [],
} = await processLink(link, scraperHeaders);
response.status(200).json({ url: link, success, reason, documents });
} catch (e) {
console.error(e);
Expand Down
84 changes: 71 additions & 13 deletions collector/processLink/convert/generic.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@ const { default: slugify } = require("slugify");

/**
* Scrape a generic URL and return the content in the specified format
* @param {string} link - The URL to scrape
* @param {('html' | 'text')} captureAs - The format to capture the page content as
* @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
* @param {Object} config - The configuration object
* @param {string} config.link - The URL to scrape
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
* @returns {Promise<Object>} - The content of the page
*/
async function scrapeGenericUrl(
async function scrapeGenericUrl({
link,
captureAs = "text",
processAsDocument = true
) {
processAsDocument = true,
scraperHeaders = {},
}) {
console.log(`-- Working URL ${link} => (${captureAs}) --`);
const content = await getPageContent(link, captureAs);
const content = await getPageContent({
link,
captureAs,
headers: scraperHeaders,
});

if (!content.length) {
console.error(`Resulting URL content was empty at ${link}.`);
Expand Down Expand Up @@ -63,13 +70,38 @@ async function scrapeGenericUrl(
return { success: true, reason: null, documents: [document] };
}

/**
 * Validate the headers object
 * - Input must be a plain object; anything else (string, array, null) yields {}
 * - Keys & Values must be strings and not empty
 * - Assemble a new object with only the valid keys and values
 * @param {{[key: string]: string}} headers - The headers object to validate
 * @returns {{[key: string]: string}} - The validated headers object
 */
function validatedHeaders(headers = {}) {
  try {
    // Reject non-object inputs: Object.keys("abc") would otherwise yield
    // ["0","1","2"] and produce bogus numeric-keyed headers.
    if (typeof headers !== "object" || headers === null || Array.isArray(headers))
      return {};
    const validHeaders = {};
    for (const [key, value] of Object.entries(headers)) {
      if (!key.trim()) continue; // skip empty/whitespace-only header names
      if (typeof value !== "string" || !value.trim()) continue; // values must be non-empty strings
      validHeaders[key] = value.trim();
    }
    return validHeaders;
  } catch (error) {
    // Best-effort: never let header validation break the scrape request.
    console.error("Error validating headers", error);
    return {};
  }
}

/**
* Get the content of a page
* @param {string} link - The URL to get the content of
* @param {('html' | 'text')} captureAs - The format to capture the page content as
* @param {Object} config - The configuration object
* @param {string} config.link - The URL to get the content of
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {{[key: string]: string}} config.headers - Custom headers to use when making the request
* @returns {Promise<string>} - The content of the page
*/
async function getPageContent(link, captureAs = "text") {
async function getPageContent({ link, captureAs = "text", headers = {} }) {
try {
let pageContents = [];
const loader = new PuppeteerWebBaseLoader(link, {
Expand All @@ -91,12 +123,37 @@ async function getPageContent(link, captureAs = "text") {
},
});

const docs = await loader.load();
// Override scrape method if headers are available
let overrideHeaders = validatedHeaders(headers);
if (Object.keys(overrideHeaders).length > 0) {
loader.scrape = async function () {
const { launch } = await PuppeteerWebBaseLoader.imports();
const browser = await launch({
headless: "new",
defaultViewport: null,
ignoreDefaultArgs: ["--disable-extensions"],
...this.options?.launchOptions,
});
const page = await browser.newPage();
await page.setExtraHTTPHeaders(overrideHeaders);

await page.goto(this.webPath, {
timeout: 180000,
waitUntil: "networkidle2",
...this.options?.gotoOptions,
});

for (const doc of docs) {
pageContents.push(doc.pageContent);
const bodyHTML = this.options?.evaluate
? await this.options.evaluate(page, browser)
: await page.evaluate(() => document.body.innerHTML);

await browser.close();
return bodyHTML;
};
}

const docs = await loader.load();
for (const doc of docs) pageContents.push(doc.pageContent);
return pageContents.join(" ");
} catch (error) {
console.error(
Expand All @@ -112,6 +169,7 @@ async function getPageContent(link, captureAs = "text") {
"Content-Type": "text/plain",
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
...validatedHeaders(headers),
},
}).then((res) => res.text());
return pageText;
Expand Down
25 changes: 21 additions & 4 deletions collector/processLink/index.js
Original file line number Diff line number Diff line change
@@ -1,20 +1,37 @@
const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");

/**
 * Process a link and return the text content. This util will save the link as a document
 * so it can be used for embedding later.
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>} - Response from collector
 */
async function processLink(link, scraperHeaders = {}) {
  // Guard clause: refuse anything that is not a well-formed URL.
  if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
  return await scrapeGenericUrl({
    link,
    captureAs: "text",
    processAsDocument: true,
    scraperHeaders,
  });
}

/**
 * Get the text content of a link - does not save the link as a document
 * Mostly used in agentic flows/tools calls to get the text content of a link
 * @param {string} link - The link to get the text content of
 * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
 * @returns {Promise<{success: boolean, content: string}>} - Response from collector
 */
async function getLinkText(link, captureAs = "text") {
  // Guard clause: refuse anything that is not a well-formed URL.
  if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
  return await scrapeGenericUrl({
    link,
    captureAs,
    processAsDocument: false,
  });
}

module.exports = {
Expand Down
18 changes: 14 additions & 4 deletions server/endpoints/api/document/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,11 @@ function apiDocumentEndpoints(app) {
type: 'object',
example: {
"link": "https://anythingllm.com",
"addToWorkspaces": "workspace1,workspace2"
"addToWorkspaces": "workspace1,workspace2",
"scraperHeaders": {
"Authorization": "Bearer token123",
"My-Custom-Header": "value"
}
}
}
}
Expand Down Expand Up @@ -365,7 +369,11 @@ function apiDocumentEndpoints(app) {
*/
try {
const Collector = new CollectorApi();
const { link, addToWorkspaces = "" } = reqBody(request);
const {
link,
addToWorkspaces = "",
scraperHeaders = {},
} = reqBody(request);
const processingOnline = await Collector.online();

if (!processingOnline) {
Expand All @@ -379,8 +387,10 @@ function apiDocumentEndpoints(app) {
return;
}

const { success, reason, documents } =
await Collector.processLink(link);
const { success, reason, documents } = await Collector.processLink(
link,
scraperHeaders
);
if (!success) {
response
.status(500)
Expand Down
6 changes: 5 additions & 1 deletion server/swagger/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1092,7 +1092,11 @@
"type": "object",
"example": {
"link": "https://anythingllm.com",
"addToWorkspaces": "workspace1,workspace2"
"addToWorkspaces": "workspace1,workspace2",
"scraperHeaders": {
"Authorization": "Bearer token123",
"My-Custom-Header": "value"
}
}
}
}
Expand Down
10 changes: 8 additions & 2 deletions server/utils/collectorApi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,18 @@ class CollectorApi {
* Process a link
* - Will append the options to the request body
* @param {string} link - The link to process
* @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL
* @returns {Promise<Object>} - The response from the collector API
*/
async processLink(link = "") {
async processLink(link = "", scraperHeaders = {}) {
if (!link) return false;

const data = JSON.stringify({ link, options: this.#attachOptions() });
const data = JSON.stringify({
link,
scraperHeaders,
options: this.#attachOptions(),
});

return await fetch(`${this.endpoint}/process-link`, {
method: "POST",
headers: {
Expand Down