这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"mime": "^3.0.0",
"moment": "^2.29.4",
"node-html-parser": "^6.1.13",
"node-xlsx": "^0.24.0",
"officeparser": "^4.0.5",
"openai": "4.38.5",
"pdf-parse": "^1.1.1",
Expand All @@ -48,4 +49,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
113 changes: 113 additions & 0 deletions collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
const { v4 } = require("uuid");
const xlsx = require("node-xlsx").default;
const path = require("path");
const fs = require("fs");
const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

/**
 * Serialize the rows of a sheet into CSV text.
 *
 * `null`/`undefined` cells become empty fields. Every other cell is
 * stringified; a field containing a comma, a double quote, or a line break is
 * wrapped in double quotes with embedded double quotes doubled, so such values
 * cannot spill into neighbouring fields or rows.
 *
 * @param {Array<Array<*>>} data - Rows of cell values.
 * @returns {string} Fields joined with "," and rows joined with "\n".
 */
function convertToCSV(data) {
  return data
    .map((row) =>
      row
        .map((cell) => {
          if (cell === null || cell === undefined) return "";
          const text = String(cell);
          // Plain values need no quoting; leave them untouched.
          if (!/[",\r\n]/.test(text)) return text;
          // Quote the field and escape embedded quotes by doubling them.
          return `"${text.replace(/"/g, '""')}"`;
        })
        .join(",")
    )
    .join("\n");
}

/**
 * Convert every sheet of an xlsx workbook into its own CSV-backed document.
 *
 * The workbook is parsed with `xlsx.parse`; each non-empty sheet is turned
 * into CSV text and handed to `writeToServerDocuments`, targeting a folder
 * named after the slugified file name plus a short random suffix. A sheet that
 * fails to convert is logged and skipped without aborting the remaining
 * sheets. The uploaded file is always passed to `trashFile` once processing
 * ends, whether it succeeded or not.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path of the uploaded xlsx file.
 * @param {string} params.filename - Name of the file, used for titles and the
 *   output folder name.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a `reason` when the workbook cannot be processed or
 *   no sheet produced a document; otherwise the list of written documents.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];
  // The short uuid suffix keeps two uploads of the same file name apart.
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
    trim: true,
  });

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../server/storage/documents/${folderName}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

  try {
    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    for (const sheet of workSheetsFromFile) {
      // Destructure outside the try so `name` is still in scope in the catch
      // below; otherwise the error handler itself throws a ReferenceError and
      // one bad sheet fails the whole workbook.
      const { name, data } = sheet ?? {};

      try {
        const content = convertToCSV(data);

        if (!content?.length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        const document = writeToServerDocuments(
          sheetData,
          `sheet-${slugify(name)}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        // Best effort per sheet: report the failure and move on to the next.
        console.error(`Error processing sheet "${name}":`, err);
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    // The uploaded source file is discarded on every path out of the try.
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}

module.exports = asXlsx;
6 changes: 6 additions & 0 deletions collector/utils/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
".pptx",
],

"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
".xlsx",
],

"application/vnd.oasis.opendocument.text": [".odt"],
"application/vnd.oasis.opendocument.presentation": [".odp"],

Expand Down Expand Up @@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".odt": "./convert/asOfficeMime.js",
".odp": "./convert/asOfficeMime.js",

".xlsx": "./convert/asXlsx.js",

".mbox": "./convert/asMbox.js",

".epub": "./convert/asEPub.js",
Expand Down
11 changes: 11 additions & 0 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
css-select "^5.1.0"
he "1.2.0"

node-xlsx@^0.24.0:
version "0.24.0"
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
dependencies:
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"

nodemailer@6.9.13:
version "6.9.13"
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
Expand Down Expand Up @@ -3528,6 +3535,10 @@ ws@8.14.2:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==

"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
version "0.20.2"
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"

xml2js@^0.6.2:
version "0.6.2"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"
Expand Down