θΏ™ζ˜―indexlocζδΎ›ηš„ζœεŠ‘οΌŒδΈθ¦θΎ“ε…₯任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions collector/index.js

@@ -83,9 +83,9 @@ app.post(
   "/util/get-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link } = reqBody(request);
+    const { link, captureAs = "text" } = reqBody(request);
     try {
-      const { success, content = null } = await getLinkText(link);
+      const { success, content = null } = await getLinkText(link, captureAs);
       response.status(200).json({ url: link, success, content });
     } catch (e) {
       console.error(e);
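
Taken together, the endpoint now accepts an optional captureAs field alongside link. A minimal sketch of a direct call follows; the collector's base URL and port are assumptions, and the signed payload expected by the verifyPayloadIntegrity middleware is omitted, so an unsigned request like this would normally be rejected:

const response = await fetch("http://localhost:8888/util/get-link", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ link: "https://example.com", captureAs: "html" }),
});
const { url, success, content } = await response.json(); // content is raw HTML here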
33 changes: 27 additions & 6 deletions collector/processLink/convert/generic.js

@@ -6,9 +6,20 @@
 const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");

-async function scrapeGenericUrl(link, textOnly = false) {
-  console.log(`-- Working URL ${link} --`);
-  const content = await getPageContent(link);
+/**
+ * Scrape a generic URL and return the content in the specified format
+ * @param {string} link - The URL to scrape
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
+ * @returns {Promise<Object>} - The content of the page
+ */
+async function scrapeGenericUrl(
+  link,
+  captureAs = "text",
+  processAsDocument = true
+) {
+  console.log(`-- Working URL ${link} => (${captureAs}) --`);
+  const content = await getPageContent(link, captureAs);

   if (!content.length) {
     console.error(`Resulting URL content was empty at ${link}.`);
@@ -19,7 +30,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     };
   }

-  if (textOnly) {
+  if (!processAsDocument) {
     return {
       success: true,
       content,
@@ -52,7 +63,13 @@ async function scrapeGenericUrl(link, textOnly = false) {
   return { success: true, reason: null, documents: [document] };
 }

-async function getPageContent(link) {
+/**
+ * Get the content of a page
+ * @param {string} link - The URL to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @returns {Promise<string>} - The content of the page
+ */
+async function getPageContent(link, captureAs = "text") {
   try {
     let pageContents = [];
     const loader = new PuppeteerWebBaseLoader(link, {
@@ -64,7 +81,11 @@
         waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
-        const result = await page.evaluate(() => document.body.innerText);
+        const result = await page.evaluate((captureAs) => {
+          if (captureAs === "text") return document.body.innerText;
+          if (captureAs === "html") return document.documentElement.innerHTML;
+          return document.body.innerText;
+        }, captureAs);
         await browser.close();
         return result;
       },
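
A detail worth calling out in the evaluate change: the callback passed to page.evaluate runs inside the browser, where Node-side closure variables such as captureAs do not exist. Puppeteer serializes any extra arguments to page.evaluate and injects them into the page function, which is why captureAs is threaded through as a second argument. A standalone sketch of the same pattern (URL and options are illustrative):

const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com", { waitUntil: "networkidle2" });

  // `mode` is serialized into the browser context; reading a Node closure
  // variable inside the callback would throw a ReferenceError instead.
  const result = await page.evaluate((mode) => {
    return mode === "html"
      ? document.documentElement.innerHTML
      : document.body.innerText;
  }, "html");

  await browser.close();
  console.log(result.length);
})();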
10 changes: 8 additions & 2 deletions collector/processLink/index.js

@@ -6,9 +6,15 @@ async function processLink(link) {
   return await scrapeGenericUrl(link);
 }

-async function getLinkText(link) {
+/**
+ * Get the text content of a link
+ * @param {string} link - The link to get the text content of
+ * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
+ * @returns {Promise<{success: boolean, content: string}>} - Response from collector
+ */
+async function getLinkText(link, captureAs = "text") {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
-  return await scrapeGenericUrl(link, true);
+  return await scrapeGenericUrl(link, captureAs, false);
 }

 module.exports = {
2 changes: 2 additions & 0 deletions frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx

@@ -126,6 +126,8 @@ const BLOCK_INFO = {
     description: "Scrape content from a webpage",
     defaultConfig: {
       url: "",
+      captureAs: "text",
+      querySelector: "",
       resultVariable: "",
     },
     getSummary: (config) => config.url || "No URL specified",
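
With the two new fields, a fully configured web-scraping block resembles the following. The values are illustrative, not from the PR; the field names come from the defaultConfig above and the executor shown later:

const config = {
  url: "https://example.com/blog/post",
  captureAs: "querySelector", // one of "text" | "html" | "querySelector"
  querySelector: ".article-content", // only read when captureAs === "querySelector"
  resultVariable: "scrapedContent",
};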
@@ -25,6 +25,48 @@ export default function WebScrapingNode({
         />
       </div>

+      <div>
+        <label className="block text-sm font-medium text-theme-text-primary mb-2">
+          Capture Page Content As
+        </label>
+        <select
+          value={config.captureAs}
+          onChange={(e) => onConfigChange({ captureAs: e.target.value })}
+          className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+        >
+          {[
+            { label: "Text content only", value: "text" },
+            { label: "Raw HTML", value: "html" },
+            { label: "CSS Query Selector", value: "querySelector" },
+          ].map((captureAs) => (
+            <option
+              key={captureAs.value}
+              value={captureAs.value}
+              className="bg-theme-settings-input-bg"
+            >
+              {captureAs.label}
+            </option>
+          ))}
+        </select>
+      </div>

+      {config.captureAs === "querySelector" && (
+        <div>
+          <label className="block text-sm font-medium text-theme-text-primary mb-2">
+            Query Selector
+          </label>
+          <p className="text-xs text-theme-text-secondary mb-2">
+            Enter a valid CSS selector to scrape the content of the page.
+          </p>
+          <input
+            value={config.querySelector}
+            onChange={(e) => onConfigChange({ querySelector: e.target.value })}
+            placeholder=".article-content, #content, .main-content, etc."
+            className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+          />
+        </div>
+      )}

       <div>
         <label className="block text-sm font-medium text-theme-text-primary mb-2">
           Result Variable
1 change: 1 addition & 0 deletions server/package.json

@@ -43,6 +43,7 @@
     "body-parser": "^1.20.2",
     "chalk": "^4",
     "check-disk-space": "^3.4.0",
+    "cheerio": "^1.0.0",
     "chromadb": "^1.5.2",
     "cohere-ai": "^7.9.5",
     "cors": "^2.8.5",
47 changes: 44 additions & 3 deletions server/utils/agentFlows/executors/web-scraping.js

@@ -10,15 +10,22 @@ const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
  * @returns {Promise<string>} Scraped content
  */
 async function executeWebScraping(config, context) {
-  const { url } = config;
+  const { url, captureAs = "text" } = config;
   const { introspect, model, provider } = context;

   if (!url) {
     throw new Error("URL is required for web scraping");
   }

-  introspect(`Scraping the content of ${url}`);
-  const { success, content } = await new CollectorApi().getLinkContent(url);
+  // Remap the captureAs to the correct mode for the CollectorApi
+  const captureMode = captureAs === "querySelector" ? "html" : captureAs;
+  introspect(`Scraping the content of ${url} as ${captureAs}`);
+  const { success, content } = await new CollectorApi()
+    .getLinkContent(url, captureMode)
+    .then((res) => {
+      if (captureAs !== "querySelector") return res;
+      return parseHTMLwithSelector(res.content, config.querySelector, context);
+    });

   if (!success) {
     introspect(`Could not scrape ${url}. Cannot use this page's content.`);
@@ -52,4 +59,38 @@ async function executeWebScraping(config, context) {
   return summary;
 }

+/**
+ * Parse HTML with a CSS selector
+ * @param {string} html - The HTML to parse
+ * @param {string|null} selector - The CSS selector to use (as text string)
+ * @param {{introspect: Function}} context - The context object
+ * @returns {Object} The parsed content
+ */
+function parseHTMLwithSelector(html, selector = null, context) {
+  if (!selector || selector.length === 0) {
+    context.introspect("No selector provided. Returning the entire HTML.");
+    return { success: true, content: html };
+  }

+  const Cheerio = require("cheerio");
+  const $ = Cheerio.load(html);
+  const selectedElements = $(selector);

+  let content;
+  if (selectedElements.length === 0) {
+    return { success: false, content: null };
+  } else if (selectedElements.length === 1) {
+    content = selectedElements.html();
+  } else {
+    context.introspect(
+      `Found ${selectedElements.length} elements matching selector: ${selector}`
+    );
+    content = selectedElements
+      .map((_, element) => $(element).html())
+      .get()
+      .join("\n");
+  }
+  return { success: true, content };
+}

 module.exports = executeWebScraping;
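
To see how parseHTMLwithSelector behaves on multiple matches, here is a minimal standalone sketch of the same cheerio calls (the sample HTML is illustrative; cheerio ^1.0.0 as added in server/package.json):

const Cheerio = require("cheerio");

const html = `<div class="post"><p>One</p></div><div class="post"><p>Two</p></div>`;
const $ = Cheerio.load(html);
const matches = $(".post");

// Multiple matches are concatenated with newlines, mirroring the executor.
const content = matches.map((_, el) => $(el).html()).get().join("\n");
console.log(content); // "<p>One</p>\n<p>Two</p>"

Note that .html() returns the inner HTML of a match, so a selector that hits a wrapper element yields its children's markup rather than the wrapper tag itself.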
4 changes: 2 additions & 2 deletions server/utils/collectorApi/index.js

@@ -148,10 +148,10 @@ class CollectorApi {
     });
   }

-  async getLinkContent(link = "") {
+  async getLinkContent(link = "", captureAs = "text") {
     if (!link) return false;

-    const data = JSON.stringify({ link });
+    const data = JSON.stringify({ link, captureAs });
     return await fetch(`${this.endpoint}/util/get-link`, {
       method: "POST",
       headers: {
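
Finally, a sketch of the server-side call path tying the pieces together. The require path and export shape are assumptions that depend on the caller's location within server/utils:

// Hypothetical helper showing the call path from the server side.
const { CollectorApi } = require("../../collectorApi");

async function fetchRawHtml(url) {
  // "text" is the default mode; "querySelector" never reaches the collector
  // because executeWebScraping remaps it to "html" first.
  const { success, content } = await new CollectorApi().getLinkContent(url, "html");
  return success ? content : null;
}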