diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index fd85d6a61ff..d75aaf41449 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -62,10 +62,10 @@ async function scrapeGenericUrl({ token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `url-${slugify(filename)}-${data.id}` - ); + filename: `url-${slugify(filename)}-${data.id}`, + }); console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; } diff --git a/collector/processRawText/index.js b/collector/processRawText/index.js index a29eb63c37d..e3ce2ade2aa 100644 --- a/collector/processRawText/index.js +++ b/collector/processRawText/index.js @@ -58,10 +58,10 @@ async function processRawText(textContent, metadata) { token_count_estimate: tokenizeString(textContent), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `raw-${stripAndSlug(metadata.title)}-${data.id}` - ); + filename: `raw-${stripAndSlug(metadata.title)}-${data.id}`, + }); console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; } diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index 5f033af74a9..8b179a52994 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -59,10 +59,10 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n` diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index d33a46b9433..2dfad739d06 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -45,10 +45,10 @@ async function asDocX({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 51bb20c809b..3b7f7295c3c 100644 --- a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -43,10 +43,10 @@ async function asEPub({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index d2d197b227a..8bf8adf738b 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -38,10 +38,10 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 48de60fa37a..74427046a1f 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -57,10 +57,10 @@ async function asMbox({ fullFilePath = "", filename = "" }) { }; item++; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}-msg-${item}` - ); + filename: `${slugify(filename)}-${data.id}-msg-${item}`, + }); documents.push(document); } diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 09e320d1689..66a13588530 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -41,10 +41,10 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index b929fbda16f..db66c7660b4 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -62,10 +62,10 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index bc95969e14d..1abe49532e5 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -41,10 +41,10 @@ async function asTxt({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( + const document = writeToServerDocuments({ data, - `${slugify(filename)}-${data.id}` - ); + filename: `${slugify(filename)}-${data.id}`, + }); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 3aa8473da1c..a64e8e20390 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -64,11 +64,11 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content), }; - const document = writeToServerDocuments( - sheetData, - `sheet-${slugify(name)}`, - outFolderPath - ); + const document = writeToServerDocuments({ + data: sheetData, + filename: `sheet-${slugify(name)}`, + destinationOverride: outFolderPath, + }); documents.push(document); console.log( `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index 4d17d56befe..7e31077799e 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -116,7 +116,11 @@ async function loadConfluence( const fileName = sanitizeFileName( `${slugify(doc.metadata.title)}-${data.id}` ); - writeToServerDocuments(data, fileName, outFolderPath); + writeToServerDocuments({ + data, + filename: fileName, + destinationOverride: outFolderPath, + }); }); return { diff --git a/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js b/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js index e9d7e777b6f..9ca6259336d 100644 --- a/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js +++ b/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js @@ -202,7 +202,11 @@ class DrupalWiki { console.log( `[DrupalWiki Loader]: Saving page '${page.title}' (${page.id}) to '${this.storagePath}/${fileName}'` ); - writeToServerDocuments(data, fileName, this.storagePath); + writeToServerDocuments({ + data, + filename: fileName, + destinationOverride: this.storagePath, + }); } /** diff --git a/collector/utils/extensions/ObsidianVault/index.js b/collector/utils/extensions/ObsidianVault/index.js index 18d62be3ac7..52d4bbb8e06 100644 --- a/collector/utils/extensions/ObsidianVault/index.js +++ b/collector/utils/extensions/ObsidianVault/index.js @@ -66,7 +66,11 @@ async function loadObsidianVault({ files = [] }) { const targetFileName = sanitizeFileName( `${slugify(file.name)}-${data.id}` ); - writeToServerDocuments(data, targetFileName, outFolderPath); + writeToServerDocuments({ + data, + filename: targetFileName, + destinationOverride: outFolderPath, + }); results.push({ file: file.path, status: "success" }); } catch (e) { console.error(`Failed to process ${file.path}:`, e); diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/index.js index ae3396ccb45..fae6ef491c4 100644 --- a/collector/utils/extensions/RepoLoader/GithubRepo/index.js +++ b/collector/utils/extensions/RepoLoader/GithubRepo/index.js @@ -71,11 +71,11 @@ async function loadGithubRepo(args, response) { console.log( `[GitHub Loader]: Saving ${doc.metadata.source} to ${outFolder}` ); - writeToServerDocuments( + writeToServerDocuments({ data, - `${slugify(doc.metadata.source)}-${data.id}`, - outFolderPath - ); + filename: `${slugify(doc.metadata.source)}-${data.id}`, + destinationOverride: outFolderPath, + }); } return { diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js index 01ee4e73e16..5c312f02259 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js @@ -98,11 +98,11 @@ async function loadGitlabRepo(args, response) { `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}` ); - writeToServerDocuments( + writeToServerDocuments({ data, - sanitizeFileName(`${slugify(doc.metadata.source)}-${data.id}`), - outFolderPath - ); + filename: sanitizeFileName(`${slugify(doc.metadata.source)}-${data.id}`), + destinationOverride: outFolderPath, + }); } return { diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js index 4801a45aeba..dca22558561 100644 --- a/collector/utils/extensions/WebsiteDepth/index.js +++ b/collector/utils/extensions/WebsiteDepth/index.js @@ -125,7 +125,11 @@ async function bulkScrapePages(links, outFolderPath) { token_count_estimate: tokenizeString(content), }; - writeToServerDocuments(data, data.title, outFolderPath); + writeToServerDocuments({ + data, + filename: data.title, + destinationOverride: outFolderPath, + }); scrapedData.push(data); console.log(`Successfully scraped ${link}.`); diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js index dfca0c669b3..b0b4f1313f1 100644 --- a/collector/utils/extensions/YoutubeTranscript/index.js +++ b/collector/utils/extensions/YoutubeTranscript/index.js @@ -116,11 +116,11 @@ async function loadYouTubeTranscript({ url }) { }; console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`); - writeToServerDocuments( + writeToServerDocuments({ data, - sanitizeFileName(`${slugify(metadata.title)}-${data.id}`), - outFolderPath - ); + filename: sanitizeFileName(`${slugify(metadata.title)}-${data.id}`), + destinationOverride: outFolderPath, + }); return { success: true, diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 29866771b32..f1b9d93edd9 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -96,11 +96,21 @@ function createdDate(filepath) { } } -function writeToServerDocuments( +/** + * Writes a document to the server documents folder. + * @param {Object} params - The parameters for the function. + * @param {Object} params.data - The data to write to the file. Must look like a document object. + * @param {string} params.filename - The name of the file to write to. + * @param {string|null} params.destinationOverride - A forced destination to write to - will be honored if provided. + * @returns {Object} - The data with the location added. + */ +function writeToServerDocuments({ data = {}, - filename, - destinationOverride = null -) { + filename = null, + destinationOverride = null, +}) { + if (!filename) throw new Error("Filename is required!"); + let destination = null; if (destinationOverride) destination = path.resolve(destinationOverride); else destination = path.resolve(documentsFolder, "custom-documents");