From c6b3ae1eee3000a5650589233cfb2513e0a06c4b Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 1 Oct 2024 15:17:15 -0700 Subject: [PATCH 1/7] support xlsx files --- collector/package.json | 3 +- collector/processSingleFile/convert/asXlsx.js | 56 ++++++++++++++++++ collector/utils/constants.js | 4 ++ collector/yarn.lock | 58 +++++++++++++++++++ 4 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 collector/processSingleFile/convert/asXlsx.js diff --git a/collector/package.json b/collector/package.json index 4ce85e68e10..cd73408d208 100644 --- a/collector/package.json +++ b/collector/package.json @@ -42,10 +42,11 @@ "uuid": "^9.0.0", "wavefile": "^11.0.0", "winston": "^3.13.0", + "xlsx": "^0.18.5", "youtubei.js": "^9.1.0" }, "devDependencies": { "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} \ No newline at end of file +} diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js new file mode 100644 index 00000000000..45644cbe351 --- /dev/null +++ b/collector/processSingleFile/convert/asXlsx.js @@ -0,0 +1,56 @@ +const { v4 } = require("uuid"); +const XLSX = require("xlsx"); +const { + createdDate, + trashFile, + writeToServerDocuments, +} = require("../../utils/files"); +const { tokenizeString } = require("../../utils/tokenizer"); +const { default: slugify } = require("slugify"); + +async function asXlsx({ fullFilePath = "", filename = "" }) { + let content = ""; + try { + const workbook = XLSX.readFile(fullFilePath); + const sheetName = workbook.SheetNames[0]; + const sheet = workbook.Sheets[sheetName]; + content = XLSX.utils.sheet_to_csv(sheet); + } catch (err) { + console.error("Could not read xlsx file!", err); + } + + if (!content?.length) { + console.error(`Resulting text content was empty for ${filename}.`); + trashFile(fullFilePath); + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; + } + + console.log(`-- Working ${filename} --`); + const 
data = { + id: v4(), + url: "file://" + fullFilePath, + title: filename, + docAuthor: "Unknown", + description: "Spreadsheet data", + docSource: "an xlsx file uploaded by the user.", + chunkSource: "", + published: createdDate(fullFilePath), + wordCount: content.split(/\s+/).length, + pageContent: content, + token_count_estimate: tokenizeString(content).length, + }; + + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); + trashFile(fullFilePath); + console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); + return { success: true, reason: null, documents: [document] }; +} + +module.exports = asXlsx; \ No newline at end of file diff --git a/collector/utils/constants.js b/collector/utils/constants.js index ee9ad22ae0d..f9cbc57586d 100644 --- a/collector/utils/constants.js +++ b/collector/utils/constants.js @@ -11,6 +11,8 @@ const ACCEPTED_MIMES = { ".pptx", ], + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [".xlsx"], + "application/vnd.oasis.opendocument.text": [".odt"], "application/vnd.oasis.opendocument.presentation": [".odp"], @@ -41,6 +43,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = { ".odt": "./convert/asOfficeMime.js", ".odp": "./convert/asOfficeMime.js", + ".xlsx": "./convert/asXlsx.js", + ".mbox": "./convert/asMbox.js", ".epub": "./convert/asEPub.js", diff --git a/collector/yarn.lock b/collector/yarn.lock index 2786692e091..01f8c027436 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -357,6 +357,11 @@ acorn@^8.8.0: resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.3.tgz#71e0b14e13a4ec160724b38fb7b0f233b1b81d7a" integrity sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg== +adler-32@~1.3.0: + version "1.3.1" + resolved "https://registry.yarnpkg.com/adler-32/-/adler-32-1.3.1.tgz#1dbf0b36dda0012189a32b3679061932df1821e2" + integrity 
sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A== + adm-zip@^0.5.10: version "0.5.12" resolved "https://registry.yarnpkg.com/adm-zip/-/adm-zip-0.5.12.tgz#87786328e91d54b37358d8a50f954c4cd73ba60b" @@ -668,6 +673,14 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== +cfb@~1.2.1: + version "1.2.2" + resolved "https://registry.yarnpkg.com/cfb/-/cfb-1.2.2.tgz#94e687628c700e5155436dac05f74e08df23bc44" + integrity sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA== + dependencies: + adler-32 "~1.3.0" + crc-32 "~1.2.0" + chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -724,6 +737,11 @@ cliui@^8.0.1: strip-ansi "^6.0.1" wrap-ansi "^7.0.0" +codepage@~1.15.0: + version "1.15.0" + resolved "https://registry.yarnpkg.com/codepage/-/codepage-1.15.0.tgz#2e00519024b39424ec66eeb3ec07227e692618ab" + integrity sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA== + color-convert@^1.9.0, color-convert@^1.9.3: version "1.9.3" resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-1.9.3.tgz#bb71850690e1f136567de629d2d5471deda4c1e8" @@ -857,6 +875,11 @@ cosmiconfig@8.3.6: parse-json "^5.2.0" path-type "^4.0.0" +crc-32@~1.2.0, crc-32@~1.2.1: + version "1.2.2" + resolved "https://registry.yarnpkg.com/crc-32/-/crc-32-1.2.2.tgz#3cad35a934b8bf71f25ca524b6da51fb7eace2ff" + integrity sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ== + crlf-normalize@^1.0.19: version "1.0.20" resolved "https://registry.yarnpkg.com/crlf-normalize/-/crlf-normalize-1.0.20.tgz#0b3105d3de807bce8a7599113235d725fe9361a8" @@ -1401,6 +1424,11 @@ forwarded@0.2.0: 
resolved "https://registry.yarnpkg.com/forwarded/-/forwarded-0.2.0.tgz#2269936428aad4c15c7ebe9779a84bf0b2a81811" integrity sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow== +frac@~1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/frac/-/frac-1.1.2.tgz#3d74f7f6478c88a1b5020306d747dc6313c74d0b" + integrity sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA== + fresh@0.5.2: version "0.5.2" resolved "https://registry.yarnpkg.com/fresh/-/fresh-0.5.2.tgz#3d8cadd90d976569fa835ab1f8e4b23a105605a7" @@ -3113,6 +3141,13 @@ sprintf-js@~1.0.2: resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c" integrity sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g== +ssf@~0.11.2: + version "0.11.2" + resolved "https://registry.yarnpkg.com/ssf/-/ssf-0.11.2.tgz#0b99698b237548d088fc43cdf2b70c1a7512c06c" + integrity sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g== + dependencies: + frac "~1.1.2" + stack-trace@0.0.x: version "0.0.10" resolved "https://registry.yarnpkg.com/stack-trace/-/stack-trace-0.0.10.tgz#547c70b347e8d32b4e108ea1a2a159e5fdde19c0" @@ -3509,6 +3544,16 @@ winston@^3.13.0: triple-beam "^1.3.0" winston-transport "^4.7.0" +wmf@~1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/wmf/-/wmf-1.0.2.tgz#7d19d621071a08c2bdc6b7e688a9c435298cc2da" + integrity sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw== + +word@~0.3.0: + version "0.3.0" + resolved "https://registry.yarnpkg.com/word/-/word-0.3.0.tgz#8542157e4f8e849f4a363a288992d47612db9961" + integrity sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA== + wrap-ansi@^7.0.0: version "7.0.0" resolved 
"https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" @@ -3528,6 +3573,19 @@ ws@8.14.2: resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f" integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g== +xlsx@^0.18.5: + version "0.18.5" + resolved "https://registry.yarnpkg.com/xlsx/-/xlsx-0.18.5.tgz#16711b9113c848076b8a177022799ad356eba7d0" + integrity sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ== + dependencies: + adler-32 "~1.3.0" + cfb "~1.2.1" + codepage "~1.15.0" + crc-32 "~1.2.1" + ssf "~0.11.2" + wmf "~1.0.1" + word "~0.3.0" + xml2js@^0.6.2: version "0.6.2" resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499" From 638694a23c9effb50d96bf51da30299d9476c7cb Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 1 Oct 2024 15:20:06 -0700 Subject: [PATCH 2/7] lint --- collector/processSingleFile/convert/asXlsx.js | 2 +- collector/utils/constants.js | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 45644cbe351..36aa0871f10 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -53,4 +53,4 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { return { success: true, reason: null, documents: [document] }; } -module.exports = asXlsx; \ No newline at end of file +module.exports = asXlsx; diff --git a/collector/utils/constants.js b/collector/utils/constants.js index f9cbc57586d..c7beeb4b259 100644 --- a/collector/utils/constants.js +++ b/collector/utils/constants.js @@ -11,7 +11,9 @@ const ACCEPTED_MIMES = { ".pptx", ], - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [".xlsx"], + 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [ + ".xlsx", + ], "application/vnd.oasis.opendocument.text": [".odt"], "application/vnd.oasis.opendocument.presentation": [".odp"], From f4c975287dcab0ff2d9b8dd9e4d44409a120fb92 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 2 Oct 2024 13:00:03 -0700 Subject: [PATCH 3/7] create seperate docs for each xlsx sheet --- collector/processSingleFile/convert/asXlsx.js | 85 ++++++++++++------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 36aa0871f10..f1847787086 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -1,5 +1,7 @@ const { v4 } = require("uuid"); const XLSX = require("xlsx"); +const path = require("path"); +const fs = require("fs"); const { createdDate, trashFile, @@ -9,48 +11,73 @@ const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); async function asXlsx({ fullFilePath = "", filename = "" }) { - let content = ""; + const documents = []; + const folderName = slugify(`${path.basename(filename, path.extname(filename))}-${v4().slice(0, 4)}`).toLowerCase(); + const outFolderPath = process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../../server/storage/documents/${folderName}`) + : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); + try { const workbook = XLSX.readFile(fullFilePath); - const sheetName = workbook.SheetNames[0]; - const sheet = workbook.Sheets[sheetName]; - content = XLSX.utils.sheet_to_csv(sheet); + + if (!fs.existsSync(outFolderPath)) { + fs.mkdirSync(outFolderPath, { recursive: true }); + } + + for (const sheetName of workbook.SheetNames) { + const sheet = workbook.Sheets[sheetName]; + const content = XLSX.utils.sheet_to_csv(sheet); + + if (!content?.length) { + console.warn(`Sheet "${sheetName}" is empty. 
Skipping.`); + continue; + } + + console.log(`-- Processing sheet: ${sheetName} --`); + const data = { + id: v4(), + url: `file://${path.join(outFolderPath, `${slugify(sheetName)}.csv`)}`, + title: `${filename} - ${sheetName}`, + docAuthor: "Unknown", + description: `Spreadsheet data from sheet: ${sheetName}`, + docSource: "an xlsx file uploaded by the user.", + chunkSource: "", + published: createdDate(fullFilePath), + wordCount: content.split(/\s+/).length, + pageContent: content, + token_count_estimate: tokenizeString(content).length, + }; + + const document = writeToServerDocuments( + data, + `${slugify(sheetName)}-${data.id}`, + outFolderPath + ); + documents.push(document); + console.log(`[SUCCESS]: Sheet "${sheetName}" converted & ready for embedding.`); + } } catch (err) { - console.error("Could not read xlsx file!", err); + console.error("Could not process xlsx file!", err); + return { + success: false, + reason: `Error processing ${filename}: ${err.message}`, + documents: [], + }; } - if (!content?.length) { - console.error(`Resulting text content was empty for ${filename}.`); + if (documents.length === 0) { + console.error(`No valid sheets found in ${filename}.`); trashFile(fullFilePath); return { success: false, - reason: `No text content found in ${filename}.`, + reason: `No valid sheets found in ${filename}.`, documents: [], }; } - console.log(`-- Working ${filename} --`); - const data = { - id: v4(), - url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", - description: "Spreadsheet data", - docSource: "an xlsx file uploaded by the user.", - chunkSource: "", - published: createdDate(fullFilePath), - wordCount: content.split(/\s+/).length, - pageContent: content, - token_count_estimate: tokenizeString(content).length, - }; - - const document = writeToServerDocuments( - data, - `${slugify(filename)}-${data.id}` - ); trashFile(fullFilePath); - console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { 
success: true, reason: null, documents: [document] }; + console.log(`[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`); + return { success: true, reason: null, documents }; } module.exports = asXlsx; From 19e0647b585dee37fbf1252ce60521e4ff0587ec Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 2 Oct 2024 13:04:32 -0700 Subject: [PATCH 4/7] lint --- collector/processSingleFile/convert/asXlsx.js | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index f1847787086..4ccacdf0df2 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -12,10 +12,16 @@ const { default: slugify } = require("slugify"); async function asXlsx({ fullFilePath = "", filename = "" }) { const documents = []; - const folderName = slugify(`${path.basename(filename, path.extname(filename))}-${v4().slice(0, 4)}`).toLowerCase(); - const outFolderPath = process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../../server/storage/documents/${folderName}`) - : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); + const folderName = slugify( + `${path.basename(filename, path.extname(filename))}-${v4().slice(0, 4)}` + ).toLowerCase(); + const outFolderPath = + process.env.NODE_ENV === "development" + ? 
path.resolve( + __dirname, + `../../../server/storage/documents/${folderName}` + ) + : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); try { const workbook = XLSX.readFile(fullFilePath); @@ -54,7 +60,9 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { outFolderPath ); documents.push(document); - console.log(`[SUCCESS]: Sheet "${sheetName}" converted & ready for embedding.`); + console.log( + `[SUCCESS]: Sheet "${sheetName}" converted & ready for embedding.` + ); } } catch (err) { console.error("Could not process xlsx file!", err); @@ -76,7 +84,9 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { } trashFile(fullFilePath); - console.log(`[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`); + console.log( + `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n` + ); return { success: true, reason: null, documents }; } From 382344726ccc0cfb3c131c8516af30332c432020 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 2 Oct 2024 13:53:31 -0700 Subject: [PATCH 5/7] use node-xlsx pkg for parsing xlsx files --- collector/package.json | 2 +- collector/processSingleFile/convert/asXlsx.js | 36 ++++++---- collector/yarn.lock | 67 +++---------------- 3 files changed, 33 insertions(+), 72 deletions(-) diff --git a/collector/package.json b/collector/package.json index cd73408d208..bf6498c065a 100644 --- a/collector/package.json +++ b/collector/package.json @@ -33,6 +33,7 @@ "mime": "^3.0.0", "moment": "^2.29.4", "node-html-parser": "^6.1.13", + "node-xlsx": "^0.24.0", "officeparser": "^4.0.5", "openai": "4.38.5", "pdf-parse": "^1.1.1", @@ -42,7 +43,6 @@ "uuid": "^9.0.0", "wavefile": "^11.0.0", "winston": "^3.13.0", - "xlsx": "^0.18.5", "youtubei.js": "^9.1.0" }, "devDependencies": { diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 4ccacdf0df2..1e27cbe6e10 100644 --- 
a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -1,5 +1,5 @@ const { v4 } = require("uuid"); -const XLSX = require("xlsx"); +const xlsx = require('node-xlsx').default; const path = require("path"); const fs = require("fs"); const { @@ -10,6 +10,14 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); +function convertToCSV(data) { + return data.map(row => row.map(cell => { + if (cell === null || cell === undefined) return ''; + if (typeof cell === 'string' && cell.includes(',')) return `"${cell}"`; + return cell; + }).join(',')).join('\n'); +} + async function asXlsx({ fullFilePath = "", filename = "" }) { const documents = []; const folderName = slugify( @@ -24,28 +32,28 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`); try { - const workbook = XLSX.readFile(fullFilePath); + const workSheetsFromFile = xlsx.parse(fullFilePath); if (!fs.existsSync(outFolderPath)) { fs.mkdirSync(outFolderPath, { recursive: true }); } - for (const sheetName of workbook.SheetNames) { - const sheet = workbook.Sheets[sheetName]; - const content = XLSX.utils.sheet_to_csv(sheet); + for (const sheet of workSheetsFromFile) { + const { name, data } = sheet; + const content = convertToCSV(data); if (!content?.length) { - console.warn(`Sheet "${sheetName}" is empty. Skipping.`); + console.warn(`Sheet "${name}" is empty. 
Skipping.`); continue; } - console.log(`-- Processing sheet: ${sheetName} --`); - const data = { + console.log(`-- Processing sheet: ${name} --`); + const sheetData = { id: v4(), - url: `file://${path.join(outFolderPath, `${slugify(sheetName)}.csv`)}`, - title: `${filename} - ${sheetName}`, + url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, + title: `${filename} - ${name}`, docAuthor: "Unknown", - description: `Spreadsheet data from sheet: ${sheetName}`, + description: `Spreadsheet data from sheet: ${name}`, docSource: "an xlsx file uploaded by the user.", chunkSource: "", published: createdDate(fullFilePath), @@ -55,13 +63,13 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { }; const document = writeToServerDocuments( - data, - `${slugify(sheetName)}-${data.id}`, + sheetData, + `${slugify(name)}-${sheetData.id}`, outFolderPath ); documents.push(document); console.log( - `[SUCCESS]: Sheet "${sheetName}" converted & ready for embedding.` + `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` ); } } catch (err) { diff --git a/collector/yarn.lock b/collector/yarn.lock index 01f8c027436..f991b43faec 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -357,11 +357,6 @@ acorn@^8.8.0: resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.3.tgz#71e0b14e13a4ec160724b38fb7b0f233b1b81d7a" integrity sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg== -adler-32@~1.3.0: - version "1.3.1" - resolved "https://registry.yarnpkg.com/adler-32/-/adler-32-1.3.1.tgz#1dbf0b36dda0012189a32b3679061932df1821e2" - integrity sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A== - adm-zip@^0.5.10: version "0.5.12" resolved "https://registry.yarnpkg.com/adm-zip/-/adm-zip-0.5.12.tgz#87786328e91d54b37358d8a50f954c4cd73ba60b" @@ -673,14 +668,6 @@ camelcase@6: resolved 
"https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== -cfb@~1.2.1: - version "1.2.2" - resolved "https://registry.yarnpkg.com/cfb/-/cfb-1.2.2.tgz#94e687628c700e5155436dac05f74e08df23bc44" - integrity sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA== - dependencies: - adler-32 "~1.3.0" - crc-32 "~1.2.0" - chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -737,11 +724,6 @@ cliui@^8.0.1: strip-ansi "^6.0.1" wrap-ansi "^7.0.0" -codepage@~1.15.0: - version "1.15.0" - resolved "https://registry.yarnpkg.com/codepage/-/codepage-1.15.0.tgz#2e00519024b39424ec66eeb3ec07227e692618ab" - integrity sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA== - color-convert@^1.9.0, color-convert@^1.9.3: version "1.9.3" resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-1.9.3.tgz#bb71850690e1f136567de629d2d5471deda4c1e8" @@ -875,11 +857,6 @@ cosmiconfig@8.3.6: parse-json "^5.2.0" path-type "^4.0.0" -crc-32@~1.2.0, crc-32@~1.2.1: - version "1.2.2" - resolved "https://registry.yarnpkg.com/crc-32/-/crc-32-1.2.2.tgz#3cad35a934b8bf71f25ca524b6da51fb7eace2ff" - integrity sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ== - crlf-normalize@^1.0.19: version "1.0.20" resolved "https://registry.yarnpkg.com/crlf-normalize/-/crlf-normalize-1.0.20.tgz#0b3105d3de807bce8a7599113235d725fe9361a8" @@ -1424,11 +1401,6 @@ forwarded@0.2.0: resolved "https://registry.yarnpkg.com/forwarded/-/forwarded-0.2.0.tgz#2269936428aad4c15c7ebe9779a84bf0b2a81811" integrity sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow== -frac@~1.1.2: - version "1.1.2" - resolved 
"https://registry.yarnpkg.com/frac/-/frac-1.1.2.tgz#3d74f7f6478c88a1b5020306d747dc6313c74d0b" - integrity sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA== - fresh@0.5.2: version "0.5.2" resolved "https://registry.yarnpkg.com/fresh/-/fresh-0.5.2.tgz#3d8cadd90d976569fa835ab1f8e4b23a105605a7" @@ -2354,6 +2326,13 @@ node-html-parser@^6.1.13: css-select "^5.1.0" he "1.2.0" +node-xlsx@^0.24.0: + version "0.24.0" + resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f" + integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg== + dependencies: + xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz" + nodemailer@6.9.13: version "6.9.13" resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6" @@ -3141,13 +3120,6 @@ sprintf-js@~1.0.2: resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c" integrity sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g== -ssf@~0.11.2: - version "0.11.2" - resolved "https://registry.yarnpkg.com/ssf/-/ssf-0.11.2.tgz#0b99698b237548d088fc43cdf2b70c1a7512c06c" - integrity sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g== - dependencies: - frac "~1.1.2" - stack-trace@0.0.x: version "0.0.10" resolved "https://registry.yarnpkg.com/stack-trace/-/stack-trace-0.0.10.tgz#547c70b347e8d32b4e108ea1a2a159e5fdde19c0" @@ -3544,16 +3516,6 @@ winston@^3.13.0: triple-beam "^1.3.0" winston-transport "^4.7.0" -wmf@~1.0.1: - version "1.0.2" - resolved "https://registry.yarnpkg.com/wmf/-/wmf-1.0.2.tgz#7d19d621071a08c2bdc6b7e688a9c435298cc2da" - integrity sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw== - -word@~0.3.0: - version "0.3.0" - resolved 
"https://registry.yarnpkg.com/word/-/word-0.3.0.tgz#8542157e4f8e849f4a363a288992d47612db9961" - integrity sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA== - wrap-ansi@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" @@ -3573,18 +3535,9 @@ ws@8.14.2: resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f" integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g== -xlsx@^0.18.5: - version "0.18.5" - resolved "https://registry.yarnpkg.com/xlsx/-/xlsx-0.18.5.tgz#16711b9113c848076b8a177022799ad356eba7d0" - integrity sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ== - dependencies: - adler-32 "~1.3.0" - cfb "~1.2.1" - codepage "~1.15.0" - crc-32 "~1.2.1" - ssf "~0.11.2" - wmf "~1.0.1" - word "~0.3.0" +"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz": + version "0.20.2" + resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d" xml2js@^0.6.2: version "0.6.2" From 1dd8859e899add8baf63ef7fa2eae9b7587d9981 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 2 Oct 2024 13:54:59 -0700 Subject: [PATCH 6/7] lint --- collector/processSingleFile/convert/asXlsx.js | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 1e27cbe6e10..5be33ee2e0c 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -1,5 +1,5 @@ const { v4 } = require("uuid"); -const xlsx = require('node-xlsx').default; +const xlsx = require("node-xlsx").default; const path = require("path"); const fs = require("fs"); const { @@ -11,11 +11,18 @@ const { tokenizeString } = require("../../utils/tokenizer"); 
const { default: slugify } = require("slugify"); function convertToCSV(data) { - return data.map(row => row.map(cell => { - if (cell === null || cell === undefined) return ''; - if (typeof cell === 'string' && cell.includes(',')) return `"${cell}"`; - return cell; - }).join(',')).join('\n'); + return data + .map((row) => + row + .map((cell) => { + if (cell === null || cell === undefined) return ""; + if (typeof cell === "string" && cell.includes(",")) + return `"${cell}"`; + return cell; + }) + .join(",") + ) + .join("\n"); } async function asXlsx({ fullFilePath = "", filename = "" }) { From 30fe9af917b6c4e32cd8addc366fb997e2cea283 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 3 Oct 2024 13:43:32 -0700 Subject: [PATCH 7/7] update error handling --- collector/processSingleFile/convert/asXlsx.js | 79 ++++++++++--------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 5be33ee2e0c..f21c6f1d9bf 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -27,9 +27,11 @@ function convertToCSV(data) { async function asXlsx({ fullFilePath = "", filename = "" }) { const documents = []; - const folderName = slugify( - `${path.basename(filename, path.extname(filename))}-${v4().slice(0, 4)}` - ).toLowerCase(); + const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { + lower: true, + trim: true, + }); + const outFolderPath = process.env.NODE_ENV === "development" ? 
path.resolve( @@ -40,44 +42,47 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { try { const workSheetsFromFile = xlsx.parse(fullFilePath); - - if (!fs.existsSync(outFolderPath)) { + if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true }); - } for (const sheet of workSheetsFromFile) { - const { name, data } = sheet; - const content = convertToCSV(data); + try { + const { name, data } = sheet; + const content = convertToCSV(data); - if (!content?.length) { - console.warn(`Sheet "${name}" is empty. Skipping.`); - continue; - } + if (!content?.length) { + console.warn(`Sheet "${name}" is empty. Skipping.`); + continue; + } - console.log(`-- Processing sheet: ${name} --`); - const sheetData = { - id: v4(), - url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, - title: `${filename} - ${name}`, - docAuthor: "Unknown", - description: `Spreadsheet data from sheet: ${name}`, - docSource: "an xlsx file uploaded by the user.", - chunkSource: "", - published: createdDate(fullFilePath), - wordCount: content.split(/\s+/).length, - pageContent: content, - token_count_estimate: tokenizeString(content).length, - }; + console.log(`-- Processing sheet: ${name} --`); + const sheetData = { + id: v4(), + url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, + title: `${filename} - Sheet:${name}`, + docAuthor: "Unknown", + description: `Spreadsheet data from sheet: ${name}`, + docSource: "an xlsx file uploaded by the user.", + chunkSource: "", + published: createdDate(fullFilePath), + wordCount: content.split(/\s+/).length, + pageContent: content, + token_count_estimate: tokenizeString(content).length, + }; - const document = writeToServerDocuments( - sheetData, - `${slugify(name)}-${sheetData.id}`, - outFolderPath - ); - documents.push(document); - console.log( - `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` - ); + const document = writeToServerDocuments( + sheetData, + `sheet-${slugify(name)}`, + 
outFolderPath + ); + documents.push(document); + console.log( + `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` + ); + } catch (err) { + console.error(`Error processing sheet "${name}":`, err); + continue; + } } } catch (err) { console.error("Could not process xlsx file!", err); @@ -86,11 +91,12 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { reason: `Error processing ${filename}: ${err.message}`, documents: [], }; + } finally { + trashFile(fullFilePath); } if (documents.length === 0) { console.error(`No valid sheets found in ${filename}.`); - trashFile(fullFilePath); return { success: false, reason: `No valid sheets found in ${filename}.`, @@ -98,7 +104,6 @@ async function asXlsx({ fullFilePath = "", filename = "" }) { }; } - trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n` );