From 935e6654264bb5e04cd98b544e2b5f481d2d6e0a Mon Sep 17 00:00:00 2001
From: timothycarambat
Date: Tue, 18 Mar 2025 17:43:10 -0700
Subject: [PATCH] perf: improve loading performance when large files are in
 picker

---
 server/package.json         |  1 +
 server/utils/files/index.js | 65 +++++++++++++++++++++++++++++--------
 server/yarn.lock            | 18 ++++++++++
 3 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/server/package.json b/server/package.json
index 2744cd0e378..58704e3a136 100644
--- a/server/package.json
+++ b/server/package.json
@@ -38,6 +38,7 @@
     "@qdrant/js-client-rest": "^1.9.0",
     "@xenova/transformers": "^2.14.0",
     "@zilliz/milvus2-sdk-node": "^2.3.5",
+    "JSONStream": "^1.3.5",
     "adm-zip": "^0.5.16",
     "bcrypt": "^5.1.0",
     "body-parser": "^1.20.2",
diff --git a/server/utils/files/index.js b/server/utils/files/index.js
index 4b33fbc0c82..5a34d70736c 100644
--- a/server/utils/files/index.js
+++ b/server/utils/files/index.js
@@ -1,5 +1,6 @@
 const fs = require("fs");
 const path = require("path");
+const JSONStream = require("JSONStream");
 const { v5: uuidv5 } = require("uuid");
 const { Document } = require("../../models/documents");
 const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
@@ -24,8 +25,15 @@ async function fileData(filePath = null) {
   return JSON.parse(data);
 }
 
+/**
+ * Returns a directory object with the documents and subfolders in the documents folder.
+ * Will stream-parse files that exceed the FILE_SIZE_THRESHOLD to avoid memory issues on large files,
+ * otherwise it will read the file into memory to grab the metadata for the file we render in the picker.
+ * @returns {Promise<{localFiles: {name: string, type: string, items: {name: string, type: string, items: {name: string, type: string}[]}[]}}>} - A promise that resolves to a directory object
+ */
 async function viewLocalFiles() {
   if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
+  const FILE_SIZE_THRESHOLD = 10 * 1024 * 1024;
   const liveSyncAvailable = await DocumentSyncQueue.enabled();
   const directory = {
     name: "documents",
@@ -49,21 +57,50 @@
       for (const subfile of subfiles) {
         if (path.extname(subfile) !== ".json") continue;
         const filePath = path.join(folderPath, subfile);
-        const rawData = fs.readFileSync(filePath, "utf8");
         const cachefilename = `${file}/${subfile}`;
-        const { pageContent, ...metadata } = JSON.parse(rawData);
-        subdocs.items.push({
-          name: subfile,
-          type: "file",
-          ...metadata,
-          cached: await cachedVectorInformation(cachefilename, true),
-          canWatch: liveSyncAvailable
-            ? DocumentSyncQueue.canWatch(metadata)
-            : false,
-          // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
-          // watched: false, // boolean to indicate if this document is watched in ANY workspace
-        });
-        filenames[cachefilename] = subfile;
+
+        try {
+          const stats = fs.statSync(filePath);
+          let metadata = {};
+          if (stats.size <= FILE_SIZE_THRESHOLD) {
+            const rawData = fs.readFileSync(filePath, "utf8");
+            metadata = JSON.parse(rawData);
+          } else {
+            console.log(
+              `Stream-parsing ${path.basename(filePath)} because it exceeds the ${FILE_SIZE_THRESHOLD} byte limit.`
+            );
+            const fileStream = fs.createReadStream(filePath, {
+              encoding: "utf8",
+            });
+            const parser = JSONStream.parse("$*");
+            metadata = await new Promise((resolve, reject) => {
+              let result = {};
+              parser.on("data", (data) => {
+                if (data.key === "pageContent") return;
+                result[data.key] = data.value;
+              });
+              parser.on("end", () => resolve(result));
+              parser.on("error", reject);
+              fileStream.pipe(parser);
+            });
+          }
+
+          subdocs.items.push({
+            name: subfile,
+            type: "file",
+            ...metadata,
+            cached: await cachedVectorInformation(cachefilename, true),
+            canWatch: liveSyncAvailable
+              ? DocumentSyncQueue.canWatch(metadata)
+              : false,
+            pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
+            watched: false, // boolean to indicate if this document is watched in ANY workspace
+          });
+          filenames[cachefilename] = subfile;
+        } catch (error) {
+          console.error(`Error streaming JSON from ${filePath}:`, error);
+          continue;
+        }
       }
 
       // Grab the pinned workspaces and watched documents for this folder's documents
diff --git a/server/yarn.lock b/server/yarn.lock
index 751be0dd1dd..8ff118d3a97 100644
--- a/server/yarn.lock
+++ b/server/yarn.lock
@@ -1984,6 +1984,14 @@
     protobufjs "^7.2.6"
     winston "^3.9.0"
 
+JSONStream@^1.3.5:
+  version "1.3.5"
+  resolved "https://registry.yarnpkg.com/JSONStream/-/JSONStream-1.3.5.tgz#3208c1f08d3a4d99261ab64f92302bc15e111ca0"
+  integrity sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==
+  dependencies:
+    jsonparse "^1.2.0"
+    through ">=2.2.7 <3"
+
 abbrev@1:
   version "1.1.1"
   resolved "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz"
@@ -4493,6 +4501,11 @@ json5@^2.2.3:
   resolved "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz"
   integrity sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==
 
+jsonparse@^1.2.0:
+  version "1.3.1"
+  resolved "https://registry.yarnpkg.com/jsonparse/-/jsonparse-1.3.1.tgz#3f4dae4a91fac315f71062f8521cc239f1366280"
+  integrity sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==
+
 jsonpointer@^5.0.1:
   version "5.0.1"
   resolved "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz"
@@ -6458,6 +6471,11 @@ text-table@^0.2.0:
   resolved "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz"
   integrity sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==
 
+"through@>=2.2.7 <3":
+  version "2.3.8"
+  resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
+  integrity sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==
+
 tmp@^0.0.33:
   version "0.0.33"
   resolved "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz"
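
Note on the streaming path added in server/utils/files/index.js: JSONStream.parse("$*") emits one { key, value } event per top-level key of the parsed object, which is how the patch collects a document's metadata while discarding the large pageContent field instead of holding the full raw file string and the fully parsed object in memory at once. Below is a minimal standalone sketch of that same pattern; the file path and the readDocumentMetadata helper name are illustrative only and not part of this change.

// Illustrative sketch of the JSONStream pattern used by the patch.
// The helper name and example path are hypothetical.
const fs = require("fs");
const JSONStream = require("JSONStream");

function readDocumentMetadata(jsonFilePath) {
  return new Promise((resolve, reject) => {
    const metadata = {};
    // "$*" emits { key, value } for each top-level key of the JSON object.
    const parser = JSONStream.parse("$*");
    fs.createReadStream(jsonFilePath, { encoding: "utf8" })
      .on("error", reject)
      .pipe(parser)
      .on("data", ({ key, value }) => {
        if (key === "pageContent") return; // skip the potentially huge body
        metadata[key] = value;
      })
      .on("end", () => resolve(metadata))
      .on("error", reject);
  });
}

// Usage (hypothetical path):
// readDocumentMetadata("./storage/documents/custom-documents/my-doc.json")
//   .then((meta) => console.log(meta.title, meta.wordCount));

The 10 MB FILE_SIZE_THRESHOLD in the patch keeps the simpler readFileSync + JSON.parse path for typical documents and only pays the streaming overhead for the outliers that were slowing the picker down.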