diff --git a/collector/index.js b/collector/index.js
index 962ee6f8c65..a0dd0e5644d 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -32,7 +32,7 @@ app.post(
   "/process",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { filename, options = {} } = reqBody(request);
+    const { filename, options = {}, metadata = {} } = reqBody(request);
     try {
       const targetFilename = path
         .normalize(filename)
@@ -41,7 +41,7 @@ app.post(
         success,
         reason,
         documents = [],
-      } = await processSingleFile(targetFilename, options);
+      } = await processSingleFile(targetFilename, options, metadata);
       response
         .status(200)
         .json({ filename: targetFilename, success, reason, documents });
@@ -95,13 +95,13 @@ app.post(
   "/process-link",
   [verifyPayloadIntegrity],
   async function (request, response) {
-    const { link, scraperHeaders = {} } = reqBody(request);
+    const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
     try {
       const {
         success,
         reason,
         documents = [],
-      } = await processLink(link, scraperHeaders);
+      } = await processLink(link, scraperHeaders, metadata);
       response.status(200).json({ url: link, success, reason, documents });
     } catch (e) {
       console.error(e);
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index d75aaf41449..84589197749 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -13,6 +13,7 @@ const { default: slugify } = require("slugify");
 * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
 * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
 * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
+ * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
 * @returns {Promise} - The content of the page
 */
 async function scrapeGenericUrl({
@@ -20,6 +21,7 @@ async function scrapeGenericUrl({
   captureAs = "text",
   processAsDocument = true,
   scraperHeaders = {},
+  metadata = {},
 }) {
   console.log(`-- Working URL ${link} => (${captureAs}) --`);
   const content = await getPageContent({
@@ -51,10 +53,10 @@ async function scrapeGenericUrl({
   const data = {
     id: v4(),
     url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
     chunkSource: `link://${link}`,
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
diff --git a/collector/processLink/index.js b/collector/processLink/index.js
index 819b1863f94..bcbbfb9e659 100644
--- a/collector/processLink/index.js
+++ b/collector/processLink/index.js
@@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
 * so it can be used for embedding later.
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
+ * @param {Object} metadata - Optional metadata to attach to the document
 * @returns {Promise<{success: boolean, content: string}>} - Response from collector
 */
-async function processLink(link, scraperHeaders = {}) {
+async function processLink(link, scraperHeaders = {}, metadata = {}) {
   if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
   return await scrapeGenericUrl({
     link,
     captureAs: "text",
     processAsDocument: true,
     scraperHeaders,
+    metadata,
   });
 }
 
diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
index dccb2b9522a..b4ddc8eb76d 100644
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
   local: LocalWhisper,
 };
 
-async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+async function asAudio({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
     options?.whisperProvider
   )
@@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "audio file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js
index 6e2133b1e38..1f77e772002 100644
--- a/collector/processSingleFile/convert/asDocx.js
+++ b/collector/processSingleFile/convert/asDocx.js
@@ -8,7 +8,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
+async function asDocX({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const loader = new DocxLoader(fullFilePath);
 
   console.log(`-- Working ${filename} --`);
@@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "docx file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js
index 15a01b23fc1..283eb1e2018 100644
--- a/collector/processSingleFile/convert/asEPub.js
+++ b/collector/processSingleFile/convert/asEPub.js
@@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");
 
-async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
+async function asEPub({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = "";
   try {
     const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a epub file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "epub file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js
index 05eff2d5696..77052f14a18 100644
--- a/collector/processSingleFile/convert/asImage.js
+++ b/collector/processSingleFile/convert/asImage.js
@@ -8,7 +8,12 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 
-async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+async function asImage({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = await new OCRLoader({
     targetLanguages: options?.ocr?.langList,
   }).ocrImage(fullFilePath);
@@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "image file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js
index a89b54ca9d8..8927616a019 100644
--- a/collector/processSingleFile/convert/asMbox.js
+++ b/collector/processSingleFile/convert/asMbox.js
@@ -9,7 +9,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 
-async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
+async function asMbox({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   console.log(`-- Working ${filename} --`);
 
   const mails = await mboxParser(fs.createReadStream(fullFilePath))
@@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
     const data = {
       id: v4(),
       url: "file://" + fullFilePath,
-      title: mail?.subject
-        ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
-        : `msg_${item}-${filename}`,
-      docAuthor: mail?.from?.text,
-      description: "No description found.",
-      docSource: "Mbox message file uploaded by the user.",
-      chunkSource: "",
+      title:
+        metadata.title ||
+        (mail?.subject
+          ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
+          : `msg_${item}-${filename}`),
+      docAuthor: metadata.docAuthor || mail?.from?.text,
+      description: metadata.description || "No description found.",
+      docSource:
+        metadata.docSource || "Mbox message file uploaded by the user.",
+      chunkSource: metadata.chunkSource || "",
       published: createdDate(fullFilePath),
       wordCount: content.split(" ").length,
       pageContent: content,
diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js
index 41a2faa8515..dcd084144eb 100644
--- a/collector/processSingleFile/convert/asOfficeMime.js
+++ b/collector/processSingleFile/convert/asOfficeMime.js
@@ -12,6 +12,7 @@ async function asOfficeMime({
   fullFilePath = "",
   filename = "",
   options = {},
+  metadata = {},
 }) {
   console.log(`-- Working ${filename} --`);
   let content = "";
@@ -34,11 +35,11 @@ async function asOfficeMime({
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "Office file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "Office file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js
index 5971dd4e416..bacfdaf53bf 100644
--- a/collector/processSingleFile/convert/asPDF/index.js
+++ b/collector/processSingleFile/convert/asPDF/index.js
@@ -9,7 +9,12 @@ const { default: slugify } = require("slugify");
 const PDFLoader = require("./PDFLoader");
 const OCRLoader = require("../../../utils/OCRLoader");
 
-async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
+async function asPdf({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const pdfLoader = new PDFLoader(fullFilePath, {
     splitPages: true,
   });
@@ -51,11 +56,17 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
-    description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor:
+      metadata.docAuthor ||
+      docs[0]?.metadata?.pdf?.info?.Creator ||
+      "no author found",
+    description:
+      metadata.description ||
+      docs[0]?.metadata?.pdf?.info?.Title ||
+      "No description found.",
+    docSource: metadata.docSource || "pdf file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js
index 7e3bd92c06e..d32cebce551 100644
--- a/collector/processSingleFile/convert/asTxt.js
+++ b/collector/processSingleFile/convert/asTxt.js
@@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");
 
-async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
+async function asTxt({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   let content = "";
   try {
     content = fs.readFileSync(fullFilePath, "utf8");
@@ -30,11 +35,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
   const data = {
     id: v4(),
     url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "a text file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js
index 832e44a7df7..c01bc86688c 100644
--- a/collector/processSingleFile/convert/asXlsx.js
+++ b/collector/processSingleFile/convert/asXlsx.js
@@ -27,7 +27,12 @@ function convertToCSV(data) {
     .join("\n");
 }
 
-async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
+async function asXlsx({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
   const documents = [];
   const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
     lower: true,
@@ -56,11 +61,12 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
       const sheetData = {
         id: v4(),
         url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
-        title: `${filename} - Sheet:${name}`,
-        docAuthor: "Unknown",
-        description: `Spreadsheet data from sheet: ${name}`,
-        docSource: "an xlsx file uploaded by the user.",
-        chunkSource: "",
+        title: metadata.title || `${filename} - Sheet:${name}`,
+        docAuthor: metadata.docAuthor || "Unknown",
+        description:
+          metadata.description || `Spreadsheet data from sheet: ${name}`,
+        docSource: metadata.docSource || "an xlsx file uploaded by the user.",
+        chunkSource: metadata.chunkSource || "",
         published: createdDate(fullFilePath),
         wordCount: content.split(/\s+/).length,
         pageContent: content,
diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js
index a00b139ed4b..3bf31a70004 100644
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@@ -12,7 +12,14 @@ const {
 } = require("../utils/files");
 
 const RESERVED_FILES = ["__HOTDIR__.md"];
-async function processSingleFile(targetFilename, options = {}) {
+/**
+ * Process a single file and return the documents
+ * @param {string} targetFilename - The filename to process
+ * @param {Object} options - The options for the file processing
+ * @param {Object} metadata - The metadata for the file processing
+ * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
+ */
+async function processSingleFile(targetFilename, options = {}, metadata = {}) {
   const fullFilePath = path.resolve(
     WATCH_DIRECTORY,
     normalizePath(targetFilename)
@@ -70,6 +77,7 @@ async function processSingleFile(targetFilename, options = {}) {
     fullFilePath,
     filename: targetFilename,
     options,
+    metadata,
   });
 }
 
diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js
index 0795be7a3f8..3226a740b2b 100644
--- a/server/endpoints/api/document/index.js
+++ b/server/endpoints/api/document/index.js
@@ -8,7 +8,7 @@ const {
   normalizePath,
   isWithin,
 } = require("../../../utils/files");
-const { reqBody } = require("../../../utils/http");
+const { reqBody, safeJsonParse } = require("../../../utils/http");
 const { EventLogs } = require("../../../models/eventLogs");
 const { CollectorApi } = require("../../../utils/collectorApi");
 const fs = require("fs");
@@ -29,7 +29,7 @@ function apiDocumentEndpoints(app) {
     async (request, response) => {
       /*
       #swagger.tags = ['Documents']
-      #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.'
+      #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.'
       #swagger.requestBody = {
         description: 'File to be uploaded.',
         required: true,
@@ -47,6 +47,11 @@ function apiDocumentEndpoints(app) {
              addToWorkspaces: {
                type: 'string',
                description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2',
+              },
+              metadata: {
+                type: 'object',
+                description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.',
+                example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' }
              }
            },
            required: ['file']
@@ -91,7 +96,12 @@ function apiDocumentEndpoints(app) {
      try {
        const Collector = new CollectorApi();
        const { originalname } = request.file;
-        const { addToWorkspaces = "" } = reqBody(request);
+        const { addToWorkspaces = "", metadata: _metadata = {} } =
+          reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();
 
        if (!processingOnline) {
@@ -105,14 +115,16 @@ function apiDocumentEndpoints(app) {
          return;
        }
 
-        const { success, reason, documents } =
-          await Collector.processDocument(originalname);
+        const { success, reason, documents } = await Collector.processDocument(
+          originalname,
+          metadata
+        );
+
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }
 
        Collector.log(
@@ -151,7 +163,7 @@ function apiDocumentEndpoints(app) {
          example: 'my-folder'
      }
      #swagger.requestBody = {
-        description: 'File to be uploaded.',
+        description: 'File to be uploaded, with optional metadata.',
        required: true,
        content: {
          "multipart/form-data": {
@@ -167,6 +179,11 @@ function apiDocumentEndpoints(app) {
              addToWorkspaces: {
                type: 'string',
                description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2',
+              },
+              metadata: {
+                type: 'object',
+                description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.',
+                example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' }
              }
            }
          }
@@ -221,7 +238,13 @@ function apiDocumentEndpoints(app) {
      */
      try {
        const { originalname } = request.file;
-        const { addToWorkspaces = "" } = reqBody(request);
+        const { addToWorkspaces = "", metadata: _metadata = {} } =
+          reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
+
        let folder = request.params?.folderName || "custom-documents";
        folder = normalizePath(folder);
        const targetFolderPath = path.join(documentsPath, folder);
@@ -236,25 +259,25 @@ function apiDocumentEndpoints(app) {
        const Collector = new CollectorApi();
        const processingOnline = await Collector.online();
        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`,
            })
            .end();
-          return;
        }
 
-        // Process the uploaded document
-        const { success, reason, documents } =
-          await Collector.processDocument(originalname);
+        // Process the uploaded document with metadata
+        const { success, reason, documents } = await Collector.processDocument(
+          originalname,
+          metadata
+        );
 
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }
 
        // For each processed document, check if it is already in the desired folder.
@@ -314,7 +337,7 @@ function apiDocumentEndpoints(app) {
      #swagger.tags = ['Documents']
      #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding. Optionally, specify a comma-separated list of workspace slugs to embed the document into post-upload.'
      #swagger.requestBody = {
-        description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.',
+        description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.',
        required: true,
        content: {
          "application/json": {
@@ -326,6 +349,12 @@ function apiDocumentEndpoints(app) {
              "scraperHeaders": {
                "Authorization": "Bearer token123",
                "My-Custom-Header": "value"
+              },
+              "metadata": {
+                "title": "Custom Title",
+                "docAuthor": "Author Name",
+                "description": "A brief description",
+                "docSource": "Source of the document"
              }
            }
          }
@@ -373,30 +402,34 @@ function apiDocumentEndpoints(app) {
          link,
          addToWorkspaces = "",
          scraperHeaders = {},
+          metadata: _metadata = {},
        } = reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();
 
        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
            })
            .end();
-          return;
        }
 
        const { success, reason, documents } = await Collector.processLink(
          link,
-          scraperHeaders
+          scraperHeaders,
+          metadata
        );
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }
 
        Collector.log(
@@ -488,20 +521,23 @@ function apiDocumentEndpoints(app) {
        const requiredMetadata = ["title"];
        const {
          textContent,
-          metadata = {},
+          metadata: _metadata = {},
          addToWorkspaces = "",
        } = reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();
 
        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Request will not be processed.`,
            })
            .end();
-          return;
        }
 
        if (
@@ -510,7 +546,7 @@ function apiDocumentEndpoints(app) {
            Object.keys(metadata).includes(reqKey) && !!metadata[reqKey]
          )
        ) {
-          response
+          return response
            .status(422)
            .json({
              success: false,
              error: `Required metadata keys missing: ${requiredMetadata
                .join(", ")}`,
            })
            .end();
-          return;
        }
 
        if (!textContent || textContent?.length === 0) {
-          response
+          return response
            .status(422)
            .json({
              success: false,
              error: `The 'textContent' key cannot have an empty value.`,
            })
            .end();
-          return;
        }
 
        const { success, reason, documents } = await Collector.processRawText(
          textContent,
          metadata
        );
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }
 
        Collector.log(
diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json
index 40e057ed33d..fcb87dad120 100644
--- a/server/swagger/openapi.json
+++ b/server/swagger/openapi.json
@@ -843,7 +843,7 @@
        "tags": [
          "Documents"
        ],
-        "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding.",
+        "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.",
        "parameters": [],
        "responses": {
          "200": {
@@ -913,6 +913,16 @@
                  "addToWorkspaces": {
                    "type": "string",
                    "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2"
+                  },
+                  "metadata": {
+                    "type": "object",
+                    "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.",
+                    "example": {
+                      "title": "Custom Title",
+                      "docAuthor": "Author Name",
+                      "description": "A brief description",
+                      "docSource": "Source of the document"
+                    }
                  }
                }
              }
@@ -1000,7 +1010,7 @@
            }
          },
          "requestBody": {
-            "description": "File to be uploaded.",
+            "description": "File to be uploaded, with optional metadata.",
            "required": true,
            "content": {
              "multipart/form-data": {
@@ -1018,6 +1028,16 @@
                  "addToWorkspaces": {
                    "type": "string",
                    "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2"
+                  },
+                  "metadata": {
+                    "type": "object",
+                    "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.",
+                    "example": {
+                      "title": "Custom Title",
+                      "docAuthor": "Author Name",
+                      "description": "A brief description",
+                      "docSource": "Source of the document"
+                    }
                  }
                }
              }
@@ -1084,7 +1104,7 @@
            }
          },
          "requestBody": {
-            "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.",
+            "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.",
            "required": true,
            "content": {
              "application/json": {
@@ -1096,6 +1116,12 @@
                  "scraperHeaders": {
                    "Authorization": "Bearer token123",
                    "My-Custom-Header": "value"
+                  },
+                  "metadata": {
+                    "title": "Custom Title",
+                    "docAuthor": "Author Name",
+                    "description": "A brief description",
+                    "docSource": "Source of the document"
                  }
                }
              }
diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js
index c991c299ce8..ef56f0c9255 100644
--- a/server/utils/collectorApi/index.js
+++ b/server/utils/collectorApi/index.js
@@ -63,15 +63,17 @@ class CollectorApi {
 
   /**
   * Process a document
-   * - Will append the options to the request body
+   * - Will append the options and optional metadata to the request body
   * @param {string} filename - The filename of the document to process
+   * @param {Object} metadata - Optional metadata key:value pairs
   * @returns {Promise} - The response from the collector API
   */
-  async processDocument(filename = "") {
+  async processDocument(filename = "", metadata = {}) {
    if (!filename) return false;
 
    const data = JSON.stringify({
      filename,
+      metadata,
      options: this.#attachOptions(),
    });
 
@@ -102,15 +104,17 @@ class CollectorApi {
   * - Will append the options to the request body
   * @param {string} link - The link to process
   * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL
+   * @param {{[key: string]: string}} metadata - Optional metadata to attach to the document
   * @returns {Promise} - The response from the collector API
   */
-  async processLink(link = "", scraperHeaders = {}) {
+  async processLink(link = "", scraperHeaders = {}, metadata = {}) {
    if (!link) return false;
 
    const data = JSON.stringify({
      link,
      scraperHeaders,
      options: this.#attachOptions(),
+      metadata,
    });
 
    return await fetch(`${this.endpoint}/process-link`, {
@@ -139,7 +143,7 @@ class CollectorApi {
   * Process raw text as a document for the collector
   * - Will append the options to the request body
   * @param {string} textContent - The text to process
-   * @param {Object} metadata - The metadata to process
+   * @param {{[key: string]: string}} metadata - The metadata to process
   * @returns {Promise} - The response from the collector API
   */
  async processRawText(textContent = "", metadata = {}) {
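Usage sketch (not part of the patch): a client call exercising the new metadata field on the file-upload endpoint. The host, route, and API key below are illustrative assumptions based on the Swagger annotations above, not values confirmed by this diff. Because multipart form fields arrive as strings, the metadata object is JSON-encoded on the client; this is exactly the case the endpoint handles by detecting a string value and running safeJsonParse on it:

// Node 18+ (global fetch/FormData/Blob). Host, route, and API key are
// hypothetical placeholders for illustration only.
const fs = require("fs");

async function uploadWithMetadata() {
  const form = new FormData();
  form.append("file", new Blob([fs.readFileSync("report.pdf")]), "report.pdf");
  // Multipart fields are transmitted as strings, so the metadata object is
  // JSON-encoded here; the server parses it back with safeJsonParse.
  form.append(
    "metadata",
    JSON.stringify({
      title: "Q3 Financial Report",
      docAuthor: "Finance Team",
      description: "Quarterly results and commentary",
      docSource: "internal reporting pipeline",
    })
  );
  form.append("addToWorkspaces", "workspace1,workspace2");

  const res = await fetch("http://localhost:3001/api/v1/document/upload", {
    method: "POST",
    headers: { Authorization: "Bearer YOUR_API_KEY" },
    body: form,
  });
  console.log(await res.json()); // JSON body includes success, error, documents
}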
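A similar sketch for the link-scrape endpoint. Here the request body is JSON, so metadata can be sent as a plain object alongside the scraperHeaders passthrough and no string parsing is involved; the route and credentials are again assumptions for illustration:

// Hypothetical call to the link endpoint with the new metadata field.
async function scrapeLinkWithMetadata() {
  const res = await fetch(
    "http://localhost:3001/api/v1/document/upload-link",
    {
      method: "POST",
      headers: {
        Authorization: "Bearer YOUR_API_KEY",
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        link: "https://example.com/article",
        scraperHeaders: { "My-Custom-Header": "value" },
        // The collector reads title, docAuthor, description, and docSource;
        // other keys are ignored, and chunkSource for links stays link://...
        metadata: {
          title: "Custom Title",
          docAuthor: "Author Name",
          description: "A brief description",
          docSource: "Source of the document",
        },
        addToWorkspaces: "workspace1",
      }),
    }
  );
  console.log(await res.json());
}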