From 4944c06d1f96e20edd1472fbbadea0ab874f0e63 Mon Sep 17 00:00:00 2001
From: shatfield4
Date: Wed, 11 Dec 2024 12:14:31 -0800
Subject: [PATCH 1/4] fix scraping failed bug in link/bulk link scrapers

---
 collector/processLink/convert/generic.js         | 2 +-
 collector/utils/extensions/WebsiteDepth/index.js | 4 ++--
 embed                                            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index c12d79ade58..8979ad49448 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -61,7 +61,7 @@ async function getPageContent(link) {
       ignoreHTTPSErrors: true,
     },
     gotoOptions: {
-      waitUntil: "domcontentloaded",
+      waitUntil: "networkidle0",
     },
     async evaluate(page, browser) {
       const result = await page.evaluate(() => document.body.innerText);
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index d8b23144dc2..49092dab7ff 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle0" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
     try {
       const loader = new PuppeteerWebBaseLoader(link, {
         launchOptions: { headless: "new" },
-        gotoOptions: { waitUntil: "domcontentloaded" },
+        gotoOptions: { waitUntil: "networkidle0" },
         async evaluate(page, browser) {
           const result = await page.evaluate(() => document.body.innerText);
           await browser.close();
diff --git a/embed b/embed
index 6bd51d251ff..30a535a6598 160000
--- a/embed
+++ b/embed
@@ -1 +1 @@
-Subproject commit 6bd51d251ff1b204d7d88cdda0061df00676386e
+Subproject commit 30a535a65989c6fc9abf90cdf575aee462aa0acf

From e6a3d9b6ccc1e79a61d1df4ff8cb14b361244c8f Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Wed, 11 Dec 2024 13:54:16 -0800
Subject: [PATCH 2/4] reset submodule

---
 embed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/embed b/embed
index 30a535a6598..6bd51d251ff 160000
--- a/embed
+++ b/embed
@@ -1 +1 @@
-Subproject commit 30a535a65989c6fc9abf90cdf575aee462aa0acf
+Subproject commit 6bd51d251ff1b204d7d88cdda0061df00676386e

From 3a8a030cb8b7fdacc53eb044991584d2492129dd Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Wed, 11 Dec 2024 13:59:00 -0800
Subject: [PATCH 3/4] swap to networkidle2 as a safe mix for SPA and API-loaded
 pages, but also not hang on request heavy pages

---
 collector/processLink/convert/generic.js         |  2 +-
 collector/utils/extensions/WebsiteDepth/index.js | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index 8979ad49448..a5eb20ca945 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -61,7 +61,7 @@ async function getPageContent(link) {
       ignoreHTTPSErrors: true,
     },
     gotoOptions: {
-      waitUntil: "networkidle0",
+      waitUntil: "networkidle2",
     },
     async evaluate(page, browser) {
       const result = await page.evaluate(() => document.body.innerText);
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index 49092dab7ff..aab0d8476d1 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "networkidle0" },
+      gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
     try {
       const loader = new PuppeteerWebBaseLoader(link, {
         launchOptions: { headless: "new" },
-        gotoOptions: { waitUntil: "networkidle0" },
+        gotoOptions: { waitUntil: "networkidle2" },
         async evaluate(page, browser) {
           const result = await page.evaluate(() => document.body.innerText);
           await browser.close();
@@ -145,9 +145,9 @@ async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const outFolderPath =
     process.env.NODE_ENV === "development"
       ? path.resolve(
-          __dirname,
-          `../../../../server/storage/documents/${outFolder}`
-        )
+        __dirname,
+        `../../../../server/storage/documents/${outFolder}`
+      )
       : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
 
   console.log("Discovering links...");

From 6dab199458a1a357050d9d483f465a596de6a775 Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Wed, 11 Dec 2024 14:01:28 -0800
Subject: [PATCH 4/4] lint

---
 collector/utils/extensions/WebsiteDepth/index.js | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index aab0d8476d1..e680c0233b7 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -145,9 +145,9 @@ async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const outFolderPath =
     process.env.NODE_ENV === "development"
       ? path.resolve(
-        __dirname,
-        `../../../../server/storage/documents/${outFolder}`
-      )
+          __dirname,
+          `../../../../server/storage/documents/${outFolder}`
+        )
       : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
 
   console.log("Discovering links...");
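
Note on the waitUntil progression in this series: "domcontentloaded" resolves
before client-rendered (SPA and API-loaded) pages finish fetching and painting,
so page.evaluate(() => document.body.innerText) can come back empty and the
scrape reports as failed. "networkidle0" (patch 1) waits until there have been
zero in-flight requests for 500 ms, which fixes those pages but can hang on
request-heavy pages that poll or stream indefinitely. "networkidle2" (patch 3)
waits until no more than two connections remain open for 500 ms, a middle
ground between the two. Below is a minimal sketch of the loader configuration
the series converges on, assuming PuppeteerWebBaseLoader from langchain (the
exact import path varies by langchain version):

// Sketch only: mirrors the post-series getPageContent configuration.
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");

async function getPageContent(link) {
  const loader = new PuppeteerWebBaseLoader(link, {
    launchOptions: {
      headless: "new",
      ignoreHTTPSErrors: true,
    },
    gotoOptions: {
      // networkidle2: navigation counts as done once at most 2 network
      // connections have been open for 500 ms, which renders SPAs without
      // hanging on pages that never reach zero in-flight requests.
      waitUntil: "networkidle2",
    },
    async evaluate(page, browser) {
      // Grab the rendered text, then close the browser so headless
      // Chromium instances do not accumulate across scrapes.
      const result = await page.evaluate(() => document.body.innerText);
      await browser.close();
      return result;
    },
  });
  const docs = await loader.load();
  return docs[0]?.pageContent;
}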