θΏ™ζ˜―indexlocζδΎ›ηš„ζœεŠ‘οΌŒδΈθ¦θΎ“ε…₯任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions collector/index.js

@@ -83,9 +83,9 @@ app.post(
   "/util/get-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link } = reqBody(request);
+    const { link, captureAs = "text" } = reqBody(request);
     try {
-      const { success, content = null } = await getLinkText(link);
+      const { success, content = null } = await getLinkText(link, captureAs);
       response.status(200).json({ url: link, success, content });
     } catch (e) {
       console.error(e);
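
Taken together, the endpoint now accepts an optional captureAs field alongside link. A minimal sketch of a direct call follows; the collector's base URL and port are assumptions, and the signed payload expected by the verifyPayloadIntegrity middleware is omitted, so an unsigned request like this would normally be rejected:

const response = await fetch("http://localhost:8888/util/get-link", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ link: "https://example.com", captureAs: "html" }),
});
const { url, success, content } = await response.json(); // content is raw HTML here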
33 changes: 27 additions & 6 deletions collector/processLink/convert/generic.js

@@ -6,9 +6,20 @@
 const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");

-async function scrapeGenericUrl(link, textOnly = false) {
-  console.log(`-- Working URL ${link} --`);
-  const content = await getPageContent(link);
+/**
+ * Scrape a generic URL and return the content in the specified format
+ * @param {string} link - The URL to scrape
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @param {boolean} processAsDocument - Whether to process the content as a document or return the content directly
+ * @returns {Promise<Object>} - The content of the page
+ */
+async function scrapeGenericUrl(
+  link,
+  captureAs = "text",
+  processAsDocument = true
+) {
+  console.log(`-- Working URL ${link} => (${captureAs}) --`);
+  const content = await getPageContent(link, captureAs);

   if (!content.length) {
     console.error(`Resulting URL content was empty at ${link}.`);
@@ -19,7 +30,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     };
   }

-  if (textOnly) {
+  if (!processAsDocument) {
     return {
       success: true,
       content,
@@ -52,7 +63,13 @@ async function scrapeGenericUrl(link, textOnly = false) {
   return { success: true, reason: null, documents: [document] };
 }

-async function getPageContent(link) {
+/**
+ * Get the content of a page
+ * @param {string} link - The URL to get the content of
+ * @param {('html' | 'text')} captureAs - The format to capture the page content as
+ * @returns {Promise<string>} - The content of the page
+ */
+async function getPageContent(link, captureAs = "text") {
   try {
     let pageContents = [];
     const loader = new PuppeteerWebBaseLoader(link, {
@@ -64,7 +81,11 @@
         waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
-        const result = await page.evaluate(() => document.body.innerText);
+        const result = await page.evaluate((captureAs) => {
+          if (captureAs === "text") return document.body.innerText;
+          if (captureAs === "html") return document.documentElement.innerHTML;
+          return document.body.innerText;
+        }, captureAs);
         await browser.close();
         return result;
       },
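
A detail worth calling out in the evaluate change: the callback passed to page.evaluate runs inside the browser, where Node-side closure variables such as captureAs do not exist. Puppeteer serializes any extra arguments to page.evaluate and injects them into the page function, which is why captureAs is threaded through as a second argument. A standalone sketch of the same pattern (URL and options are illustrative):

const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com", { waitUntil: "networkidle2" });

  // `mode` is serialized into the browser context; reading a Node closure
  // variable inside the callback would throw a ReferenceError instead.
  const result = await page.evaluate((mode) => {
    return mode === "html"
      ? document.documentElement.innerHTML
      : document.body.innerText;
  }, "html");

  await browser.close();
  console.log(result.length);
})();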
10 changes: 8 additions & 2 deletions collector/processLink/index.js

@@ -6,9 +6,15 @@ async function processLink(link) {
   return await scrapeGenericUrl(link);
 }

-async function getLinkText(link) {
+/**
+ * Get the text content of a link
+ * @param {string} link - The link to get the text content of
+ * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
+ * @returns {Promise<{success: boolean, content: string}>} - Response from collector
+ */
+async function getLinkText(link, captureAs = "text") {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
-  return await scrapeGenericUrl(link, true);
+  return await scrapeGenericUrl(link, captureAs, false);
 }

 module.exports = {
2 changes: 2 additions & 0 deletions frontend/src/pages/Admin/AgentBuilder/BlockList/index.jsx

@@ -126,6 +126,8 @@ const BLOCK_INFO = {
     description: "Scrape content from a webpage",
     defaultConfig: {
       url: "",
+      captureAs: "text",
+      querySelector: "",
       resultVariable: "",
     },
     getSummary: (config) => config.url || "No URL specified",
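
With the two new fields, a fully configured web-scraping block resembles the following. The values are illustrative, not from the PR; the field names come from the defaultConfig above and the executor shown later:

const config = {
  url: "https://example.com/blog/post",
  captureAs: "querySelector", // one of "text" | "html" | "querySelector"
  querySelector: ".article-content", // only read when captureAs === "querySelector"
  resultVariable: "scrapedContent",
};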
@@ -25,6 +25,48 @@ export default function WebScrapingNode({
         />
       </div>

+      <div>
+        <label className="block text-sm font-medium text-theme-text-primary mb-2">
+          Capture Page Content As
+        </label>
+        <select
+          value={config.captureAs}
+          onChange={(e) => onConfigChange({ captureAs: e.target.value })}
+          className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+        >
+          {[
+            { label: "Text content only", value: "text" },
+            { label: "Raw HTML", value: "html" },
+            { label: "CSS Query Selector", value: "querySelector" },
+          ].map((captureAs) => (
+            <option
+              key={captureAs.value}
+              value={captureAs.value}
+              className="bg-theme-settings-input-bg"
+            >
+              {captureAs.label}
+            </option>
+          ))}
+        </select>
+      </div>

+      {config.captureAs === "querySelector" && (
+        <div>
+          <label className="block text-sm font-medium text-theme-text-primary mb-2">
+            Query Selector
+          </label>
+          <p className="text-xs text-theme-text-secondary mb-2">
+            Enter a valid CSS selector to scrape the content of the page.
+          </p>
+          <input
+            value={config.querySelector}
+            onChange={(e) => onConfigChange({ querySelector: e.target.value })}
+            placeholder=".article-content, #content, .main-content, etc."
+            className="w-full border-none bg-theme-settings-input-bg text-theme-text-primary text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none p-2.5"
+          />
+        </div>
+      )}

       <div>
         <label className="block text-sm font-medium text-theme-text-primary mb-2">
           Result Variable
1 change: 1 addition & 0 deletions server/package.json

@@ -43,6 +43,7 @@
     "body-parser": "^1.20.2",
     "chalk": "^4",
     "check-disk-space": "^3.4.0",
+    "cheerio": "^1.0.0",
     "chromadb": "^1.5.2",
     "cohere-ai": "^7.9.5",
     "cors": "^2.8.5",
47 changes: 44 additions & 3 deletions server/utils/agentFlows/executors/web-scraping.js

@@ -10,15 +10,22 @@ const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
  * @returns {Promise<string>} Scraped content
  */
 async function executeWebScraping(config, context) {
-  const { url } = config;
+  const { url, captureAs = "text" } = config;
   const { introspect, model, provider } = context;

   if (!url) {
     throw new Error("URL is required for web scraping");
   }

-  introspect(`Scraping the content of ${url}`);
-  const { success, content } = await new CollectorApi().getLinkContent(url);
+  // Remap the captureAs to the correct mode for the CollectorApi
+  const captureMode = captureAs === "querySelector" ? "html" : captureAs;
+  introspect(`Scraping the content of ${url} as ${captureAs}`);
+  const { success, content } = await new CollectorApi()
+    .getLinkContent(url, captureMode)
+    .then((res) => {
+      if (captureAs !== "querySelector") return res;
+      return parseHTMLwithSelector(res.content, config.querySelector, context);
+    });

   if (!success) {
     introspect(`Could not scrape ${url}. Cannot use this page's content.`);
@@ -52,4 +59,38 @@ async function executeWebScraping(config, context) {
   return summary;
 }

+/**
+ * Parse HTML with a CSS selector
+ * @param {string} html - The HTML to parse
+ * @param {string|null} selector - The CSS selector to use (as text string)
+ * @param {{introspect: Function}} context - The context object
+ * @returns {Object} The parsed content
+ */
+function parseHTMLwithSelector(html, selector = null, context) {
+  if (!selector || selector.length === 0) {
+    context.introspect("No selector provided. Returning the entire HTML.");
+    return { success: true, content: html };
+  }

+  const Cheerio = require("cheerio");
+  const $ = Cheerio.load(html);
+  const selectedElements = $(selector);

+  let content;
+  if (selectedElements.length === 0) {
+    return { success: false, content: null };
+  } else if (selectedElements.length === 1) {
+    content = selectedElements.html();
+  } else {
+    context.introspect(
+      `Found ${selectedElements.length} elements matching selector: ${selector}`
+    );
+    content = selectedElements
+      .map((_, element) => $(element).html())
+      .get()
+      .join("\n");
+  }
+  return { success: true, content };
+}

 module.exports = executeWebScraping;
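
To see how parseHTMLwithSelector behaves on multiple matches, here is a minimal standalone sketch of the same cheerio calls (the sample HTML is illustrative; cheerio ^1.0.0 as added in server/package.json):

const Cheerio = require("cheerio");

const html = `<div class="post"><p>One</p></div><div class="post"><p>Two</p></div>`;
const $ = Cheerio.load(html);
const matches = $(".post");

// Multiple matches are concatenated with newlines, mirroring the executor.
const content = matches.map((_, el) => $(el).html()).get().join("\n");
console.log(content); // "<p>One</p>\n<p>Two</p>"

Note that .html() returns the inner HTML of a match, so a selector that hits a wrapper element yields its children's markup rather than the wrapper tag itself.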
4 changes: 2 additions & 2 deletions server/utils/collectorApi/index.js

@@ -148,10 +148,10 @@ class CollectorApi {
     });
   }

-  async getLinkContent(link = "") {
+  async getLinkContent(link = "", captureAs = "text") {
     if (!link) return false;

-    const data = JSON.stringify({ link });
+    const data = JSON.stringify({ link, captureAs });
     return await fetch(`${this.endpoint}/util/get-link`, {
       method: "POST",
       headers: {
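
Finally, a sketch of the server-side call path tying the pieces together. The require path and export shape are assumptions that depend on the caller's location within server/utils:

// Hypothetical helper showing the call path from the server side.
const { CollectorApi } = require("../../collectorApi");

async function fetchRawHtml(url) {
  // "text" is the default mode; "querySelector" never reaches the collector
  // because executeWebScraping remaps it to "html" first.
  const { success, content } = await new CollectorApi().getLinkContent(url, "html");
  return success ? content : null;
}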