diff --git a/collector/index.js b/collector/index.js
index 7c41002dabe..b307b58a45a 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -83,9 +83,9 @@ app.post(
   "/util/get-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link } = reqBody(request);
+    const { link, captureAs = "text" } = reqBody(request);
     try {
-      const { success, content = null } = await getLinkText(link);
+      const { success, content = null } = await getLinkText(link, captureAs);
       response.status(200).json({ url: link, success, content });
     } catch (e) {
       console.error(e);
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index 4afb9b95483..a22166d4ce2 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -6,9 +6,20 @@ const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function scrapeGenericUrl(link, textOnly = false) {
-  console.log(`-- Working URL ${link} --`);
-  const content = await getPageContent(link);
+/**
+ * Scrape a generic URL and return the content in the specified format
+ * @param {string} link - The URL to scrape
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
+ * @returns {Promise<Object>} - The content of the page
+ */
+async function scrapeGenericUrl(
+  link,
+  captureAs = "text",
+  processAsDocument = true
+) {
+  console.log(`-- Working URL ${link} => (${captureAs}) --`);
+  const content = await getPageContent(link, captureAs);
 
   if (!content.length) {
     console.error(`Resulting URL content was empty at ${link}.`);
@@ -19,7 +30,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     };
   }
 
-  if (textOnly) {
+  if (!processAsDocument) {
     return {
       success: true,
       content,
@@ -52,7 +63,13 @@ async function scrapeGenericUrl(link, textOnly = false) {
   return { success: true, reason: null, documents: [document] };
 }
 
-async function getPageContent(link) {
+/**
+ * Get the content of a page
+ * @param {string} link - The URL to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @returns {Promise<string>} - The content of the page
+ */
+async function getPageContent(link, captureAs = "text") {
   try {
     let pageContents = [];
     const loader = new PuppeteerWebBaseLoader(link, {
@@ -64,7 +81,11 @@ async function getPageContent(link) {
         waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
-        const result = await page.evaluate(() => document.body.innerText);
+        const result = await page.evaluate((captureAs) => {
+          if (captureAs === "text") return document.body.innerText;
+          if (captureAs === "html") return document.documentElement.innerHTML;
+          return document.body.innerText;
+        }, captureAs);
         await browser.close();
         return result;
       },
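Taken together, the `generic.js` changes make `scrapeGenericUrl` the single entry point for both capture formats. A minimal usage sketch follows (the require path and URLs are illustrative assumptions, not part of this diff):

```js
// Sketch only: exercising the new scrapeGenericUrl signature.
// Path assumed; the function comes from the module patched above.
const {
  scrapeGenericUrl,
} = require("./collector/processLink/convert/generic");

(async () => {
  // captureAs "text" + processAsDocument false -> raw innerText back to caller.
  const text = await scrapeGenericUrl("https://example.com", "text", false);
  console.log(text.success, text.content?.length);

  // captureAs "html" -> document.documentElement.innerHTML instead.
  const html = await scrapeGenericUrl("https://example.com", "html", false);
  console.log(html.content?.startsWith("<"));

  // Defaults (captureAs "text", processAsDocument true) keep the old behavior:
  // content is tokenized and written out as a server document.
  const doc = await scrapeGenericUrl("https://example.com");
  console.log(doc.documents?.length);
})();
```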
diff --git a/collector/processLink/index.js b/collector/processLink/index.js
index afa517cae19..ac0c5916b07 100644
--- a/collector/processLink/index.js
+++ b/collector/processLink/index.js
@@ -6,9 +6,15 @@ async function processLink(link) {
   return await scrapeGenericUrl(link);
 }
 
-async function getLinkText(link) {
+/**
+ * Get the content of a link in the requested format
+ * @param {string} link - The link to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @returns {Promise<{success: boolean, content: string}>} - Response from collector
+ */
+async function getLinkText(link, captureAs = "text") {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
-  return await scrapeGenericUrl(link, true);
+  return await scrapeGenericUrl(link, captureAs, false);
 }
 
 module.exports = {
diff --git a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
index c937f109dd8..f3d674aee6f 100644
--- a/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
+++ b/frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx
@@ -126,6 +126,8 @@ const BLOCK_INFO = {
     description: "Scrape content from a webpage",
     defaultConfig: {
       url: "",
+      captureAs: "text",
+      querySelector: "",
       resultVariable: "",
     },
     getSummary: (config) => config.url || "No URL specified",
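For reference, here is the default config a new web-scraping block now starts with, per the `BLOCK_INFO` change above (the shape comes from the diff; the comments are editorial):

```js
// Default config for the agent builder's web-scraping block.
const webScrapingDefaults = {
  url: "",
  captureAs: "text", // "text" | "html" | "querySelector"
  querySelector: "", // only consulted when captureAs === "querySelector"
  resultVariable: "", // flow variable that receives the scraped content
};
```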
diff --git a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
index fda51e34d3a..76655af74c8 100644
--- a/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
+++ b/frontend/src/pages/Admin/AgentBuilder/nodes/WebScrapingNode/index.jsx
@@ -25,6 +25,48 @@ export default function WebScrapingNode({
         />
       </div>
 
+      <div>
+        <label className="block text-sm font-medium text-theme-text-primary mb-2">
+          Capture Page Content As
+        </label>
+        <select
+          value={config.captureAs}
+          onChange={(e) => onConfigChange({ captureAs: e.target.value })}
+          className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+        >
+          {[
+            { label: "Text content only", value: "text" },
+            { label: "Raw HTML", value: "html" },
+            { label: "CSS Query Selector", value: "querySelector" },
+          ].map((option) => (
+            <option key={option.value} value={option.value}>
+              {option.label}
+            </option>
+          ))}
+        </select>
+      </div>
+
+      {config.captureAs === "querySelector" && (
+        <div>
+          <label className="block text-sm font-medium text-theme-text-primary mb-2">
+            Query Selector
+          </label>
+          <p className="text-xs text-theme-text-secondary mb-2">
+            Enter a valid CSS selector to scrape the content of the page.
+          </p>
+          <input
+            type="text"
+            value={config.querySelector}
+            onChange={(e) => onConfigChange({ querySelector: e.target.value })}
+            placeholder=".article-content, #content, .main-content, etc."
+            className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+          />
+        </div>
+      )}
+
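On the wire, the only contract change is the optional `captureAs` field accepted by the collector's `/util/get-link` route. A hedged request sketch (host, port, and the integrity header are assumptions; only the route and body fields come from this diff):

```js
// Sketch only: calling the patched collector endpoint from Node 18+.
async function getLink(link, captureAs = "text") {
  const res = await fetch("http://localhost:8888/util/get-link", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // verifyPayloadIntegrity expects a signed-payload header; its name and
      // computation are assumptions here and live elsewhere in the codebase.
      "X-Payload-Signer": "<signed payload>",
    },
    body: JSON.stringify({ link, captureAs }),
  });
  return await res.json(); // -> { url, success, content }
}

getLink("https://example.com", "html").then(console.log);
```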