From 428aa855a40d813236915ba7a5899acabe5f33a1 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 29 Jul 2025 14:42:13 -0700 Subject: [PATCH 1/5] create parse endpoint in collector --- collector/index.js | 33 ++++++++++++++++++- .../processSingleFile/convert/asAudio.js | 4 ++- collector/processSingleFile/convert/asDocx.js | 6 ++-- collector/processSingleFile/convert/asEPub.js | 6 ++-- .../processSingleFile/convert/asImage.js | 4 ++- collector/processSingleFile/convert/asMbox.js | 4 ++- .../processSingleFile/convert/asOfficeMime.js | 4 ++- .../processSingleFile/convert/asPDF/index.js | 4 ++- collector/processSingleFile/convert/asTxt.js | 6 ++-- collector/processSingleFile/convert/asXlsx.js | 20 ++++++----- collector/utils/files/index.js | 15 ++++++++- 11 files changed, 84 insertions(+), 22 deletions(-) diff --git a/collector/index.js b/collector/index.js index 73091efcdee..e328d5ba047 100644 --- a/collector/index.js +++ b/collector/index.js @@ -11,7 +11,7 @@ const { ACCEPTED_MIMES } = require("./utils/constants"); const { reqBody } = require("./utils/http"); const { processSingleFile } = require("./processSingleFile"); const { processLink, getLinkText } = require("./processLink"); -const { wipeCollectorStorage } = require("./utils/files"); +const { wipeCollectorStorage, cleanupTempDocuments } = require("./utils/files"); const extensions = require("./extensions"); const { processRawText } = require("./processRawText"); const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity"); @@ -58,6 +58,36 @@ app.post( } ); +app.post( + "/parse", + [verifyPayloadIntegrity], + async function (request, response) { + const { filename, options = {} } = reqBody(request); + try { + const targetFilename = path + .normalize(filename) + .replace(/^(\.\.(\/|\\|$))+/, ""); + const { + success, + reason, + documents = [], + } = await processSingleFile(targetFilename, { ...options, parseOnly: true }); + response + .status(200) + .json({ filename: targetFilename, success, reason, documents }); + } catch (e) { + console.error(e); + response.status(200).json({ + filename: filename, + success: false, + reason: "A processing error occurred.", + documents: [], + }); + } + return; + } +); + app.post( "/process-link", [verifyPayloadIntegrity], @@ -143,6 +173,7 @@ app.all("*", function (_, response) { app .listen(8888, async () => { await wipeCollectorStorage(); + await cleanupTempDocuments(); console.log(`Document processor app listening on port 8888`); }) .on("error", function (_) { diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index 5f033af74a9..298e69cdd93 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -61,7 +61,9 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log( diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index d33a46b9433..3f24e54ca0f 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "" }) { +async function asDocX({ fullFilePath = "", filename = "", options = {} 
}) { const loader = new DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); @@ -47,7 +47,9 @@ async function asDocX({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 51bb20c809b..2186c60c48e 100644 --- a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "" }) { +async function asEPub({ fullFilePath = "", filename = "", options = {} }) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); @@ -45,7 +45,9 @@ async function asEPub({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index d2d197b227a..5625fe3beee 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -40,7 +40,9 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 48de60fa37a..f16f03596aa 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -59,7 +59,9 @@ async function asMbox({ fullFilePath = "", filename = "" }) { item++; const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}-msg-${item}` + `${slugify(filename)}-${data.id}`, + null, + options ); documents.push(document); } diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 09e320d1689..cce1a067633 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -43,7 +43,9 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index b929fbda16f..17c3c977d1d 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -64,7 +64,9 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); 
trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index bc95969e14d..ba4f19d5027 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", filename = "" }) { +async function asTxt({ fullFilePath = "", filename = "", options = {} }) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); @@ -43,7 +43,9 @@ async function asTxt({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}` + `${slugify(filename)}-${data.id}`, + null, + options ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index ca9b8ebac9d..f71140bed08 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -25,20 +25,21 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "" }) { +async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { const documents = []; const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, trim: true, }); - const outFolderPath = - process.env.NODE_ENV === "development" - ? path.resolve( - __dirname, - `../../../server/storage/documents/${folderName}` - ) - : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); + const outFolderPath = options.parseOnly + ? path.resolve(__dirname, "../../../server/storage/temp-documents") + : process.env.NODE_ENV === "development" + ? path.resolve( + __dirname, + `../../../server/storage/documents/${folderName}` + ) + : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); try { const workSheetsFromFile = xlsx.parse(fullFilePath); @@ -73,7 +74,8 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments( sheetData, `sheet-${slugify(name)}`, - outFolderPath + outFolderPath, + options ); documents.push(document); console.log( diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 14b6cf92159..03c543b12ba 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -99,10 +99,13 @@ function createdDate(filepath) { function writeToServerDocuments( data = {}, filename, - destinationOverride = null + destinationOverride = null, + options = {} ) { const destination = destinationOverride ? path.resolve(destinationOverride) + : options.parseOnly + ? path.resolve(__dirname, "../../../server/storage/temp-documents") : path.resolve( __dirname, "../../../server/storage/documents/custom-documents" @@ -121,9 +124,18 @@ function writeToServerDocuments( // that will work since we know the location exists and since we only allow // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube. 
location: destinationFilePath.split("/").slice(-2).join("/"), + isTemporary: !!options.parseOnly, }; } +function cleanupTempDocuments() { + const tempDir = path.resolve(__dirname, "../../../server/storage/temp-documents"); + if (fs.existsSync(tempDir)) { + fs.rmSync(tempDir, { recursive: true, force: true }); + fs.mkdirSync(tempDir, { recursive: true }); + } +} + // When required we can wipe the entire collector hotdir and tmp storage in case // there were some large file failures that we unable to be removed a reboot will // force remove them. @@ -199,4 +211,5 @@ module.exports = { isWithin, sanitizeFileName, documentsFolder, + cleanupTempDocuments, }; From eec1c88fe44faeb33d0fdd383852a4d8b65c65af Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 29 Jul 2025 14:44:44 -0700 Subject: [PATCH 2/5] revert cleanup temp util call --- collector/index.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/collector/index.js b/collector/index.js index e328d5ba047..011946390d9 100644 --- a/collector/index.js +++ b/collector/index.js @@ -11,7 +11,7 @@ const { ACCEPTED_MIMES } = require("./utils/constants"); const { reqBody } = require("./utils/http"); const { processSingleFile } = require("./processSingleFile"); const { processLink, getLinkText } = require("./processLink"); -const { wipeCollectorStorage, cleanupTempDocuments } = require("./utils/files"); +const { wipeCollectorStorage } = require("./utils/files"); const extensions = require("./extensions"); const { processRawText } = require("./processRawText"); const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity"); @@ -173,7 +173,6 @@ app.all("*", function (_, response) { app .listen(8888, async () => { await wipeCollectorStorage(); - await cleanupTempDocuments(); console.log(`Document processor app listening on port 8888`); }) .on("error", function (_) { From 1a6c522cd31a71e565aa122e141923ceaee3eca0 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 29 Jul 2025 14:47:00 -0700 Subject: [PATCH 3/5] lint --- collector/index.js | 5 ++++- collector/processSingleFile/convert/asXlsx.js | 5 +---- collector/utils/files/index.js | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/collector/index.js b/collector/index.js index 011946390d9..962ee6f8c65 100644 --- a/collector/index.js +++ b/collector/index.js @@ -71,7 +71,10 @@ app.post( success, reason, documents = [], - } = await processSingleFile(targetFilename, { ...options, parseOnly: true }); + } = await processSingleFile(targetFilename, { + ...options, + parseOnly: true, + }); response .status(200) .json({ filename: targetFilename, success, reason, documents }); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index f71140bed08..4fefea8868f 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -35,10 +35,7 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { const outFolderPath = options.parseOnly ? path.resolve(__dirname, "../../../server/storage/temp-documents") : process.env.NODE_ENV === "development" - ? path.resolve( - __dirname, - `../../../server/storage/documents/${folderName}` - ) + ? 
path.resolve(__dirname, `../../../server/storage/documents/${folderName}`) : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); try { diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 03c543b12ba..5ffc7e159fe 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -129,7 +129,10 @@ function writeToServerDocuments( } function cleanupTempDocuments() { - const tempDir = path.resolve(__dirname, "../../../server/storage/temp-documents"); + const tempDir = path.resolve( + __dirname, + "../../../server/storage/temp-documents" + ); if (fs.existsSync(tempDir)) { fs.rmSync(tempDir, { recursive: true, force: true }); fs.mkdirSync(tempDir, { recursive: true }); From 35013b38072c4bfed58988006e0023786e401dae Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 29 Jul 2025 17:16:30 -0700 Subject: [PATCH 4/5] remove unused cleanupTempDocuments function --- collector/utils/files/index.js | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 5ffc7e159fe..b841192ad68 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -128,17 +128,6 @@ function writeToServerDocuments( }; } -function cleanupTempDocuments() { - const tempDir = path.resolve( - __dirname, - "../../../server/storage/temp-documents" - ); - if (fs.existsSync(tempDir)) { - fs.rmSync(tempDir, { recursive: true, force: true }); - fs.mkdirSync(tempDir, { recursive: true }); - } -} - // When required we can wipe the entire collector hotdir and tmp storage in case // there were some large file failures that we unable to be removed a reboot will // force remove them. @@ -214,5 +203,4 @@ module.exports = { isWithin, sanitizeFileName, documentsFolder, - cleanupTempDocuments, }; From 1540bba612ffefdcc6de8a7f51df60a8bd1f9ce3 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Tue, 29 Jul 2025 17:37:16 -0700 Subject: [PATCH 5/5] revert slug change minor change for destinations --- collector/processSingleFile/convert/asMbox.js | 2 +- collector/processSingleFile/convert/asXlsx.js | 9 +++---- collector/utils/files/index.js | 26 ++++++++++++------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index f16f03596aa..d7286d22dd5 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -59,7 +59,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) { item++; const document = writeToServerDocuments( data, - `${slugify(filename)}-${data.id}`, + `${slugify(filename)}-${data.id}-msg-${item}`, null, options ); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 4fefea8868f..2f8c713626b 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -6,6 +6,8 @@ const { createdDate, trashFile, writeToServerDocuments, + documentsFolder, + directUploadsFolder, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); @@ -31,12 +33,9 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { lower: true, trim: true, }); - const outFolderPath = options.parseOnly - ? path.resolve(__dirname, "../../../server/storage/temp-documents") - : process.env.NODE_ENV === "development" - ? 
path.resolve(__dirname, `../../../server/storage/documents/${folderName}`) - : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); + ? path.resolve(directUploadsFolder, folderName) + : path.resolve(documentsFolder, folderName); try { const workSheetsFromFile = xlsx.parse(fullFilePath); diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index b841192ad68..34786897970 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -11,6 +11,16 @@ const documentsFolder = ? path.resolve(__dirname, `../../../server/storage/documents`) : path.resolve(process.env.STORAGE_DIR, `documents`); +/** + * The folder where direct uploads are stored to be stored when + * processed by the collector. These are files that were DnD'd into UI + * and are not to be embedded or selectable from the file picker. + */ +const directUploadsFolder = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../../server/storage/direct-uploads`) + : path.resolve(process.env.STORAGE_DIR, `direct-uploads`); + /** * Checks if a file is text by checking the mime type and then falling back to buffer inspection. * This way we can capture all the cases where the mime type is not known but still parseable as text @@ -102,14 +112,11 @@ function writeToServerDocuments( destinationOverride = null, options = {} ) { - const destination = destinationOverride - ? path.resolve(destinationOverride) - : options.parseOnly - ? path.resolve(__dirname, "../../../server/storage/temp-documents") - : path.resolve( - __dirname, - "../../../server/storage/documents/custom-documents" - ); + let destination = null; + if (destinationOverride) destination = path.resolve(destinationOverride); + else if (options.parseOnly) destination = path.resolve(directUploadsFolder); + else destination = path.resolve(documentsFolder, "custom-documents"); + if (!fs.existsSync(destination)) fs.mkdirSync(destination, { recursive: true }); const destinationFilePath = path.resolve(destination, filename) + ".json"; @@ -124,7 +131,7 @@ function writeToServerDocuments( // that will work since we know the location exists and since we only allow // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube. location: destinationFilePath.split("/").slice(-2).join("/"), - isTemporary: !!options.parseOnly, + isDirectUpload: options.parseOnly || false, }; } @@ -203,4 +210,5 @@ module.exports = { isWithin, sanitizeFileName, documentsFolder, + directUploadsFolder, };
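
A minimal usage sketch (not part of the patches above) of the new /parse endpoint once this series is applied. The request body ({ filename, options }) and the response shape ({ filename, success, reason, documents }) come from the handler added in PATCH 1/5; the integrity header name and value below are placeholders, since verifyPayloadIntegrity's contract is not shown in this diff.

// Sketch only: calls the collector's /parse endpoint added in PATCH 1/5.
// "X-Integrity" is a hypothetical header name -- supply whatever the
// verifyPayloadIntegrity middleware actually expects.
async function parseDirectUpload(filename, options = {}) {
  const response = await fetch("http://localhost:8888/parse", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-Integrity": "<signed-payload-digest>", // placeholder value
    },
    body: JSON.stringify({ filename, options }),
  });

  // The endpoint always responds 200; failures are signaled via `success`.
  const { success, reason, documents } = await response.json();
  if (!success) throw new Error(`Parse failed: ${reason}`);

  // Per PATCH 5/5, each returned document carries isDirectUpload: true and a
  // location under the direct-uploads folder, so the server can treat it as a
  // transient drag-and-drop upload rather than an embeddable library document.
  return documents;
}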