From 757476d2e98dbd513ba51782bb1e8aa3c6288149 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Mon, 26 Feb 2024 13:24:23 -0800 Subject: [PATCH 1/3] Do not block any file upload fallback unknown/unsupported types to text if possible --- collector/processSingleFile/index.js | 18 ++++++++---- collector/utils/files/index.js | 29 +++++++++++++++++++ .../UploadFile/FileUploadProgress/index.jsx | 4 +-- .../Documents/UploadFile/index.jsx | 3 -- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 9efd3a70f81..b8cb2646952 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -4,7 +4,7 @@ const { WATCH_DIRECTORY, SUPPORTED_FILETYPE_CONVERTERS, } = require("../utils/constants"); -const { trashFile } = require("../utils/files"); +const { trashFile, isTextType } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; async function processSingleFile(targetFilename) { @@ -31,17 +31,25 @@ async function processSingleFile(targetFilename) { }; } - if (!Object.keys(SUPPORTED_FILETYPE_CONVERTERS).includes(fileExtension)) { - trashFile(fullFilePath); + let processFileAs = fileExtension; + if ( + !SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(fileExtension) && + isTextType(fullFilePath) + ) { + console.log( + `\x1b[33m[Collector]\x1b[0m The provided filetype of ${fileExtension} does not have a preset and will be processed as .txt.` + ); + processFileAs = ".txt"; + } else { return { success: false, - reason: `File extension ${fileExtension} not supported for parsing.`, + reason: `File extension ${fileExtension} not supported for parsing and cannot be assumed as text file type.`, documents: [], }; } const FileTypeProcessor = require(SUPPORTED_FILETYPE_CONVERTERS[ - fileExtension + processFileAs ]); return await FileTypeProcessor({ fullFilePath, diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 
caf33c888a2..3e6ce3445e1 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -1,5 +1,33 @@ const fs = require("fs"); const path = require("path"); +const { getType } = require("mime"); + +function isTextType(filepath) { + if (!fs.existsSync(filepath)) return false; + // These are types of mime primary classes that for sure + // cannot also be forced into a text type. + const nonTextTypes = ["multipart", "image", "model", "audio", "video"]; + // These are full-mimes we for sure cannot parse or interpret as text + // documents + const BAD_MIMES = [ + "application/octet-stream", + "application/zip", + "application/pkcs8", + "application/vnd.microsoft.portable-executable", + "application/x-msdownload", + ]; + + try { + const mime = getType(filepath); + if (BAD_MIMES.includes(mime)) return false; + + const type = mime.split("/")[0]; + if (nonTextTypes.includes(type)) return false; + return true; + } catch { + return false; + } +} function trashFile(filepath) { if (!fs.existsSync(filepath)) return; @@ -94,6 +122,7 @@ async function wipeCollectorStorage() { module.exports = { trashFile, + isTextType, createdDate, writeToServerDocuments, wipeCollectorStorage, diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/FileUploadProgress/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/FileUploadProgress/index.jsx index 31cf57943bb..c375aa2e363 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/FileUploadProgress/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/FileUploadProgress/index.jsx @@ -61,7 +61,7 @@ function FileUploadProgressComponent({ if (status === "failed") { return (
-
+
@@ -76,7 +76,7 @@ function FileUploadProgressComponent({ return (
-
+
{status !== "complete" ? (
diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/index.jsx index a6cee8c8035..6d5cd67ed5b 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/UploadFile/index.jsx @@ -76,9 +76,6 @@ export default function UploadFile({ const { getRootProps, getInputProps } = useDropzone({ onDrop, - accept: { - ...fileTypes, - }, disabled: !ready, }); From 07470d238e92a86d7a3d760a22f0e7d88d5a9a51 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Mon, 26 Feb 2024 13:31:10 -0800 Subject: [PATCH 2/3] reduce call for frontend --- collector/processSingleFile/index.js | 1 + .../MangeWorkspace/Documents/Directory/index.jsx | 3 --- .../MangeWorkspace/Documents/UploadFile/index.jsx | 11 ++--------- .../Modals/MangeWorkspace/Documents/index.jsx | 7 +------ .../src/components/Modals/MangeWorkspace/index.jsx | 13 +++---------- 5 files changed, 7 insertions(+), 28 deletions(-) diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index b8cb2646952..e0a3c399e5f 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -41,6 +41,7 @@ async function processSingleFile(targetFilename) { ); processFileAs = ".txt"; } else { + trashFile(fullFilePath); return { success: false, reason: `File extension ${fileExtension} not supported for parsing and cannot be assumed as text file type.`, diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx index 557fe418145..158719445ac 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx @@ -8,7 +8,6 @@ function Directory({ files, loading, setLoading, - fileTypes, 
workspace, fetchKeys, selectedItems, @@ -135,9 +134,7 @@ function Directory({
)}
-
- {Object.values(fileTypes ?? []) - .flat() - .join(" ")} + supports text files, csv's, spreadsheets, audio files, and more!
) : ( diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx index e8b63c903ca..736a1476f6b 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx @@ -15,11 +15,7 @@ const MODEL_COSTS = { "text-embedding-3-large": 0.00000013, // $0.00013 / 1K tokens }; -export default function DocumentSettings({ - workspace, - fileTypes, - systemSettings, -}) { +export default function DocumentSettings({ workspace, systemSettings }) { const [highlightWorkspace, setHighlightWorkspace] = useState(false); const [availableDocs, setAvailableDocs] = useState([]); const [loading, setLoading] = useState(true); @@ -201,7 +197,6 @@ export default function DocumentSettings({ loading={loading} loadingMessage={loadingMessage} setLoading={setLoading} - fileTypes={fileTypes} workspace={workspace} fetchKeys={fetchKeys} selectedItems={selectedItems} diff --git a/frontend/src/components/Modals/MangeWorkspace/index.jsx b/frontend/src/components/Modals/MangeWorkspace/index.jsx index 6696a875697..ef3a58afb7b 100644 --- a/frontend/src/components/Modals/MangeWorkspace/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/index.jsx @@ -11,17 +11,14 @@ const noop = () => {}; const ManageWorkspace = ({ hideModal = noop, providedSlug = null }) => { const { slug } = useParams(); const [workspace, setWorkspace] = useState(null); - const [fileTypes, setFileTypes] = useState(null); const [settings, setSettings] = useState({}); useEffect(() => { - async function checkSupportedFiletypes() { - const acceptedTypes = await System.acceptedDocumentTypes(); + async function getSettings() { const _settings = await System.keys(); - setFileTypes(acceptedTypes ?? {}); setSettings(_settings ?? 
{}); } - checkSupportedFiletypes(); + getSettings(); }, []); useEffect(() => { @@ -78,11 +75,7 @@ const ManageWorkspace = ({ hideModal = noop, providedSlug = null }) => {
- +
From 1ea7429413698652e7fdd8eae20c15847aa32857 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Mon, 26 Feb 2024 13:38:45 -0800 Subject: [PATCH 3/3] patch --- collector/processSingleFile/index.js | 29 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index e0a3c399e5f..569a2cde27a 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -32,21 +32,20 @@ async function processSingleFile(targetFilename) { } let processFileAs = fileExtension; - if ( - !SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(fileExtension) && - isTextType(fullFilePath) - ) { - console.log( - `\x1b[33m[Collector]\x1b[0m The provided filetype of ${fileExtension} does not have a preset and will be processed as .txt.` - ); - processFileAs = ".txt"; - } else { - trashFile(fullFilePath); - return { - success: false, - reason: `File extension ${fileExtension} not supported for parsing and cannot be assumed as text file type.`, - documents: [], - }; + if (!SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(fileExtension)) { + if (isTextType(fullFilePath)) { + console.log( + `\x1b[33m[Collector]\x1b[0m The provided filetype of ${fileExtension} does not have a preset and will be processed as .txt.` + ); + processFileAs = ".txt"; + } else { + trashFile(fullFilePath); + return { + success: false, + reason: `File extension ${fileExtension} not supported for parsing and cannot be assumed as text file type.`, + documents: [], + }; + } } const FileTypeProcessor = require(SUPPORTED_FILETYPE_CONVERTERS[