θΏ™ζ˜―indexlocζδΎ›ηš„ζœεŠ‘οΌŒδΈθ¦θΎ“ε…₯任何密码
Skip to content

3416 file picker load times #3833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 15, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 109 additions & 16 deletions server/utils/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ async function fileData(filePath = null) {

async function viewLocalFiles() {
if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
const filePromises = [];
const liveSyncAvailable = await DocumentSyncQueue.enabled();
const directory = {
name: "documents",
Expand All @@ -43,28 +44,26 @@ async function viewLocalFiles() {
type: "folder",
items: [],
};

const subfiles = fs.readdirSync(folderPath);
const filenames = {};

for (const subfile of subfiles) {
if (path.extname(subfile) !== ".json") continue;
const filePath = path.join(folderPath, subfile);
const rawData = fs.readFileSync(filePath, "utf8");
for (let i = 0; i < subfiles.length; i++) {
const subfile = subfiles[i];
const cachefilename = `${file}/${subfile}`;
const { pageContent, ...metadata } = JSON.parse(rawData);
subdocs.items.push({
name: subfile,
type: "file",
...metadata,
cached: await cachedVectorInformation(cachefilename, true),
canWatch: liveSyncAvailable
? DocumentSyncQueue.canWatch(metadata)
: false,
// pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// watched: false, // boolean to indicate if this document is watched in ANY workspace
});
if (path.extname(subfile) !== ".json") continue;
filePromises.push(
fileToPickerData({
pathToFile: path.join(folderPath, subfile),
liveSyncAvailable,
})
);
filenames[cachefilename] = subfile;
}
const results = await Promise.all(filePromises).then((results) =>
results.filter((i) => !!i)
); // Filter out any null results
subdocs.items.push(...results);

// Grab the pinned workspaces and watched documents for this folder's documents
// at the time of the query so we don't have to re-query the database for each file
Expand Down Expand Up @@ -335,6 +334,100 @@ function purgeEntireVectorCache() {
return;
}

/**
* File size threshold for files that are too large to be read into memory (MB)
*
* If the file is larger than this, we will stream it and parse it in chunks
* This is to prevent us from using too much memory when parsing large files
* or loading the files in the file picker.
* @TODO - When lazy loading for folders is implemented, we should increase this threshold (512MB)
* since it will always be faster to readSync than to stream the file and parse it in chunks.
*/
const FILE_READ_SIZE_THRESHOLD = 150 * (1024 * 1024);

/**
* Converts a file to picker data
* @param {string} pathToFile - The path to the file to convert
* @param {boolean} liveSyncAvailable - Whether live sync is available
* @returns {Promise<{name: string, type: string, [string]: any, cached: boolean, canWatch: boolean}>} - The picker data
*/
async function fileToPickerData({ pathToFile, liveSyncAvailable = false }) {
let metadata = {};
const filename = path.basename(pathToFile);
const fileStats = fs.statSync(pathToFile);
const cachedStatus = await cachedVectorInformation(pathToFile, true);
const canWatchStatus = liveSyncAvailable
? DocumentSyncQueue.canWatch(metadata)
: false;

if (fileStats.size < FILE_READ_SIZE_THRESHOLD) {
const rawData = fs.readFileSync(pathToFile, "utf8");
try {
metadata = JSON.parse(rawData);
// Remove the pageContent field from the metadata - it is large and not needed for the picker
delete metadata.pageContent;
} catch (err) {
console.error("Error parsing file", err);
return null;
}

return {
name: filename,
type: "file",
...metadata,
cached: cachedStatus,
canWatch: canWatchStatus,
// pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// watched: false, // boolean to indicate if this document is watched in ANY workspace
};
}

console.log(
`Stream-parsing ${path.basename(pathToFile)} because it exceeds the ${FILE_READ_SIZE_THRESHOLD} byte limit.`
);
const stream = fs.createReadStream(pathToFile, { encoding: "utf8" });
try {
let fileContent = "";
metadata = await new Promise((resolve, reject) => {
stream
.on("data", (chunk) => {
fileContent += chunk;
})
.on("end", () => {
metadata = JSON.parse(fileContent);
// Remove the pageContent field from the metadata - it is large and not needed for the picker
delete metadata.pageContent;
resolve(metadata);
})
.on("error", (err) => {
console.error("Error parsing file", err);
reject(null);
});
}).catch((err) => {
console.error("Error parsing file", err);
});
} catch (err) {
console.error("Error parsing file", err);
metadata = null;
} finally {
stream.destroy();
}

// If the metadata is empty or something went wrong, return null
if (!metadata || !Object.keys(metadata)?.length) {
console.log(`Stream-parsing failed for ${path.basename(pathToFile)}`);
return null;
}

return {
name: filename,
type: "file",
...metadata,
cached: cachedStatus,
canWatch: canWatchStatus,
};
}

module.exports = {
findDocumentInDocuments,
cachedVectorInformation,
Expand Down