这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,39 @@ app.post(
}
);

/**
 * POST /parse
 *
 * Runs a single file through `processSingleFile` with `parseOnly: true`
 * forced on, regardless of what the caller supplied in `options`.
 *
 * Request body (read via `reqBody`):
 *   - filename {string} name of the file to parse.
 *   - options  {Object} optional extra options forwarded to the processor.
 *
 * The reply is always HTTP 200 with the shape
 * `{ filename, success, reason, documents }`; failures are reported through
 * `success: false` rather than through the status code.
 */
app.post(
  "/parse",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { filename, options = {} } = reqBody(request);

    try {
      // Normalize first so that any ".." segments collapse to the front of
      // the path, then strip every leading "../" (or "..\") so the name
      // cannot begin by climbing out of its directory.
      const normalized = path.normalize(filename);
      const targetFilename = normalized.replace(/^(\.\.(\/|\\|$))+/, "");

      // Caller options are spread first so `parseOnly` cannot be overridden.
      const result = await processSingleFile(targetFilename, {
        ...options,
        parseOnly: true,
      });
      const { success, reason, documents = [] } = result;

      response.status(200).json({
        filename: targetFilename,
        success,
        reason,
        documents,
      });
    } catch (e) {
      // Any failure (including a non-string filename) ends up here; the
      // details are logged and the client only receives a generic reason.
      console.error(e);
      response.status(200).json({
        filename,
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
  }
);

app.post(
"/process-link",
[verifyPayloadIntegrity],
Expand Down
1 change: 1 addition & 0 deletions collector/processSingleFile/convert/asAudio.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(
Expand Down
3 changes: 2 additions & 1 deletion collector/processSingleFile/convert/asDocx.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const {
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asDocX({ fullFilePath = "", filename = "" }) {
async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
const loader = new DocxLoader(fullFilePath);

console.log(`-- Working ${filename} --`);
Expand Down Expand Up @@ -48,6 +48,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
3 changes: 2 additions & 1 deletion collector/processSingleFile/convert/asEPub.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asEPub({ fullFilePath = "", filename = "" }) {
async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
let content = "";
try {
const loader = new EPubLoader(fullFilePath, { splitChapters: false });
Expand Down Expand Up @@ -46,6 +46,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
1 change: 1 addition & 0 deletions collector/processSingleFile/convert/asImage.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
1 change: 1 addition & 0 deletions collector/processSingleFile/convert/asMbox.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}-msg-${item}`,
options: { parseOnly: options.parseOnly },
});
documents.push(document);
}
Expand Down
1 change: 1 addition & 0 deletions collector/processSingleFile/convert/asOfficeMime.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
1 change: 1 addition & 0 deletions collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
3 changes: 2 additions & 1 deletion collector/processSingleFile/convert/asTxt.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const {
} = require("../../utils/files");
const { default: slugify } = require("slugify");

async function asTxt({ fullFilePath = "", filename = "" }) {
async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
let content = "";
try {
content = fs.readFileSync(fullFilePath, "utf8");
Expand Down Expand Up @@ -44,6 +44,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
const document = writeToServerDocuments({
data,
filename: `${slugify(filename)}-${data.id}`,
options: { parseOnly: options.parseOnly },
});
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
Expand Down
8 changes: 6 additions & 2 deletions collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const {
trashFile,
writeToServerDocuments,
documentsFolder,
directUploadsFolder,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");
Expand All @@ -26,14 +27,16 @@ function convertToCSV(data) {
.join("\n");
}

async function asXlsx({ fullFilePath = "", filename = "" }) {
async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
const documents = [];
const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
lower: true,
trim: true,
});
const outFolderPath = options.parseOnly
? path.resolve(directUploadsFolder, folderName)
: path.resolve(documentsFolder, folderName);

const outFolderPath = path.resolve(documentsFolder, folderName);
try {
const workSheetsFromFile = xlsx.parse(fullFilePath);
if (!fs.existsSync(outFolderPath))
Expand Down Expand Up @@ -68,6 +71,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
data: sheetData,
filename: `sheet-${slugify(name)}`,
destinationOverride: outFolderPath,
options: { parseOnly: options.parseOnly },
});
documents.push(document);
console.log(
Expand Down
18 changes: 17 additions & 1 deletion collector/utils/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ const documentsFolder =
? path.resolve(__dirname, `../../../server/storage/documents`)
: path.resolve(process.env.STORAGE_DIR, `documents`);

/**
 * Directory where the collector stores direct uploads once they have been
 * processed. Direct uploads are files that were drag-and-dropped into the UI;
 * they are not meant to be embedded or to be selectable from the file picker.
 *
 * In development the path is resolved relative to this module under
 * `server/storage`; in every other environment it lives under `STORAGE_DIR`.
 */
const directUploadsFolder =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `direct-uploads`)
    : path.resolve(__dirname, `../../../server/storage/direct-uploads`);

/**
* Checks if a file is text by checking the mime type and then falling back to buffer inspection.
* This way we can capture all the cases where the mime type is not known but still parseable as text
Expand Down Expand Up @@ -102,17 +112,21 @@ function createdDate(filepath) {
* @param {Object} params.data - The data to write to the file. Must look like a document object.
* @param {string} params.filename - The name of the file to write to.
* @param {string|null} params.destinationOverride - A forced destination to write to - will be honored if provided.
* @param {Object} params.options - The options for the function.
* @param {boolean} params.options.parseOnly - If true, the file will be written to the direct uploads folder instead of the documents folder. Will be ignored if destinationOverride is provided.
* @returns {Object} - The data with the location added.
*/
function writeToServerDocuments({
data = {},
filename = null,
filename,
destinationOverride = null,
options = {},
}) {
if (!filename) throw new Error("Filename is required!");

let destination = null;
if (destinationOverride) destination = path.resolve(destinationOverride);
else if (options.parseOnly) destination = path.resolve(directUploadsFolder);
else destination = path.resolve(documentsFolder, "custom-documents");

if (!fs.existsSync(destination))
Expand All @@ -129,6 +143,7 @@ function writeToServerDocuments({
// that will work since we know the location exists and since we only allow
// 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube.
location: destinationFilePath.split("/").slice(-2).join("/"),
isDirectUpload: options.parseOnly || false,
};
}

Expand Down Expand Up @@ -207,4 +222,5 @@ module.exports = {
isWithin,
sanitizeFileName,
documentsFolder,
directUploadsFolder,
};