diff --git a/collector/index.js b/collector/index.js index 73091efcdee..962ee6f8c65 100644 --- a/collector/index.js +++ b/collector/index.js @@ -58,6 +58,39 @@ app.post( } ); +app.post( + "/parse", + [verifyPayloadIntegrity], + async function (request, response) { + const { filename, options = {} } = reqBody(request); + try { + const targetFilename = path + .normalize(filename) + .replace(/^(\.\.(\/|\\|$))+/, ""); + const { + success, + reason, + documents = [], + } = await processSingleFile(targetFilename, { + ...options, + parseOnly: true, + }); + response + .status(200) + .json({ filename: targetFilename, success, reason, documents }); + } catch (e) { + console.error(e); + response.status(200).json({ + filename: filename, + success: false, + reason: "A processing error occurred.", + documents: [], + }); + } + return; + } +); + app.post( "/process-link", [verifyPayloadIntegrity], diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index 8b179a52994..dccb2b9522a 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -62,6 +62,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log( diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 2dfad739d06..6e2133b1e38 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "" }) { +async function asDocX({ fullFilePath = "", filename = "", options = {} }) { const loader = new DocxLoader(fullFilePath); 
console.log(`-- Working ${filename} --`); @@ -48,6 +48,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 3b7f7295c3c..15a01b23fc1 100644 --- a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "" }) { +async function asEPub({ fullFilePath = "", filename = "", options = {} }) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); @@ -46,6 +46,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index 8bf8adf738b..05eff2d5696 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -41,6 +41,7 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 
74427046a1f..e5a3a98d60f 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -60,6 +60,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}-msg-${item}`, + options: { parseOnly: options.parseOnly }, }); documents.push(document); } diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 66a13588530..ac8ae31e7c3 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -44,6 +44,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index db66c7660b4..5971dd4e416 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -65,6 +65,7 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index 1abe49532e5..7e3bd92c06e 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", 
filename = "" }) { +async function asTxt({ fullFilePath = "", filename = "", options = {} }) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); @@ -44,6 +44,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) { const document = writeToServerDocuments({ data, filename: `${slugify(filename)}-${data.id}`, + options: { parseOnly: options.parseOnly }, }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index a64e8e20390..832e44a7df7 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -7,6 +7,7 @@ const { trashFile, writeToServerDocuments, documentsFolder, + directUploadsFolder, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); @@ -26,14 +27,16 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "" }) { +async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { const documents = []; const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, trim: true, }); + const outFolderPath = options.parseOnly + ? 
path.resolve(directUploadsFolder, folderName) + : path.resolve(documentsFolder, folderName); - const outFolderPath = path.resolve(documentsFolder, folderName); try { const workSheetsFromFile = xlsx.parse(fullFilePath); if (!fs.existsSync(outFolderPath)) @@ -68,6 +71,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { data: sheetData, filename: `sheet-${slugify(name)}`, destinationOverride: outFolderPath, + options: { parseOnly: options.parseOnly }, }); documents.push(document); console.log( diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index f1b9d93edd9..dd0f9315060 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -11,6 +11,16 @@ const documentsFolder = ? path.resolve(__dirname, `../../../server/storage/documents`) : path.resolve(process.env.STORAGE_DIR, `documents`); +/** + * The folder where direct uploads are stored when + * processed by the collector. These are files that were DnD'd into the UI + * and are not to be embedded or selectable from the file picker. + */ +const directUploadsFolder = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../../server/storage/direct-uploads`) + : path.resolve(process.env.STORAGE_DIR, `direct-uploads`); + /** * Checks if a file is text by checking the mime type and then falling back to buffer inspection. * This way we can capture all the cases where the mime type is not known but still parseable as text @@ -102,17 +112,21 @@ function createdDate(filepath) { * @param {Object} params.data - The data to write to the file. Must look like a document object. * @param {string} params.filename - The name of the file to write to. * @param {string|null} params.destinationOverride - A forced destination to write to - will be honored if provided. + * @param {Object} params.options - The options for the function.
+ * @param {boolean} params.options.parseOnly - If true, the file will be written to the direct uploads folder instead of the documents folder. Will be ignored if destinationOverride is provided. * @returns {Object} - The data with the location added. */ function writeToServerDocuments({ data = {}, - filename = null, + filename, destinationOverride = null, + options = {}, }) { if (!filename) throw new Error("Filename is required!"); let destination = null; if (destinationOverride) destination = path.resolve(destinationOverride); + else if (options.parseOnly) destination = path.resolve(directUploadsFolder); else destination = path.resolve(documentsFolder, "custom-documents"); if (!fs.existsSync(destination)) @@ -129,6 +143,7 @@ function writeToServerDocuments({ // that will work since we know the location exists and since we only allow // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube. location: destinationFilePath.split("/").slice(-2).join("/"), + isDirectUpload: options.parseOnly || false, }; } @@ -207,4 +222,5 @@ module.exports = { isWithin, sanitizeFileName, documentsFolder, + directUploadsFolder, };