From 590833aa94cd0688da20cbdc7fd4a464c0d62110 Mon Sep 17 00:00:00 2001
From: jazelly
Date: Sat, 22 Jun 2024 13:17:14 +0930
Subject: [PATCH 1/3] fix: sanitize filename before writing

Fixes: https://github.com/Mintplex-Labs/anything-llm/issues/1737
---
 collector/utils/extensions/Confluence/index.js | 12 ++++++------
 collector/utils/files/index.js                 |  6 ++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index 0bee1561459..61ea110140e 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -3,11 +3,11 @@ const path = require("path");
 const { default: slugify } = require("slugify");
 const { v4 } = require("uuid");
 const UrlPattern = require("url-pattern");
-const { writeToServerDocuments } = require("../../files");
+const { writeToServerDocuments, sanitizeFileName } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const {
   ConfluencePagesLoader,
-} = require("langchain/document_loaders/web/confluence");
+} = require("@langchain/community/document_loaders/web/confluence");
 
 async function loadConfluence({ pageUrl, username, accessToken }) {
   if (!pageUrl || !username || !accessToken) {
@@ -89,11 +89,11 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
     console.log(
       `[Confluence Loader]: Saving ${doc.metadata.title} to ${outFolder}`
     );
-    writeToServerDocuments(
-      data,
-      `${slugify(doc.metadata.title)}-${data.id}`,
-      outFolderPath
+
+    const fileName = sanitizeFileName(
+      `${slugify(doc.metadata.title)}-${data.id}`
     );
+    writeToServerDocuments(data, fileName, outFolderPath);
   });
 
   return {
diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 9b56bb5b4d0..8d429832298 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -129,6 +129,11 @@ function normalizePath(filepath = "") {
   return result;
 }
 
+function sanitizeFileName(fileName) {
+  if (!fileName) return fileName;
+  return fileName.replace(/[<>:"\/\\|?*]/g, "_");
+}
+
 module.exports = {
   trashFile,
   isTextType,
@@ -137,4 +142,5 @@ module.exports = {
   wipeCollectorStorage,
   normalizePath,
   isWithin,
+  sanitizeFileName,
 };

From 575493bf89f330f9ad4370b6922b53018b40dfb7 Mon Sep 17 00:00:00 2001
From: jazelly
Date: Sat, 22 Jun 2024 13:20:09 +0930
Subject: [PATCH 2/3] fixup

---
 collector/utils/files/index.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 8d429832298..86b50c364c3 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -131,7 +131,7 @@ function normalizePath(filepath = "") {
 
 function sanitizeFileName(fileName) {
   if (!fileName) return fileName;
-  return fileName.replace(/[<>:"\/\\|?*]/g, "_");
+  return fileName.replace(/[<>:"\/\\|?*]/g, "");
 }
 
 module.exports = {

From 21036b916fd5aba32a157a72ef7fe1fe3129e9be Mon Sep 17 00:00:00 2001
From: jazelly
Date: Sat, 22 Jun 2024 14:12:47 +0930
Subject: [PATCH 3/3] fixup

---
 collector/utils/extensions/Confluence/index.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index 61ea110140e..d85e125c51c 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -7,7 +7,7 @@ const { writeToServerDocuments, sanitizeFileName } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const {
   ConfluencePagesLoader,
-} = require("@langchain/community/document_loaders/web/confluence");
+} = require("langchain/document_loaders/web/confluence");
 
 async function loadConfluence({ pageUrl, username, accessToken }) {
   if (!pageUrl || !username || !accessToken) {