From 27f652b2aa1942121a7d63f2440816d8175ad6a6 Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 16:18:48 -0400 Subject: [PATCH 1/7] Added the ability to pass in metadata to the /document/upload/{folderName} endpoint --- collector/index.js | 4 +- .../processSingleFile/convert/asAudio.js | 12 +++--- collector/processSingleFile/convert/asDocx.js | 12 +++--- collector/processSingleFile/convert/asEPub.js | 12 +++--- .../processSingleFile/convert/asImage.js | 12 +++--- collector/processSingleFile/convert/asMbox.js | 14 +++---- .../processSingleFile/convert/asOfficeMime.js | 11 +++--- .../processSingleFile/convert/asPDF/index.js | 12 +++--- collector/processSingleFile/convert/asTxt.js | 12 +++--- collector/processSingleFile/convert/asXlsx.js | 12 +++--- collector/processSingleFile/index.js | 3 +- server/endpoints/api/document/index.js | 38 +++++++++++++++++-- server/swagger/openapi.json | 10 ++++- server/utils/collectorApi/index.js | 6 ++- 14 files changed, 106 insertions(+), 64 deletions(-) diff --git a/collector/index.js b/collector/index.js index 962ee6f8c65..75690533d56 100644 --- a/collector/index.js +++ b/collector/index.js @@ -32,7 +32,7 @@ app.post( "/process", [verifyPayloadIntegrity], async function (request, response) { - const { filename, options = {} } = reqBody(request); + const { filename, options = {}, metadata = {} } = reqBody(request); try { const targetFilename = path .normalize(filename) @@ -41,7 +41,7 @@ app.post( success, reason, documents = [], - } = await processSingleFile(targetFilename, options); + } = await processSingleFile(targetFilename, options, metadata); response .status(200) .json({ filename: targetFilename, success, reason, documents }); diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index dccb2b9522a..ee306ea84f5 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -14,7 +14,7 @@ const WHISPER_PROVIDERS = { local: LocalWhisper, }; -async function asAudio({ fullFilePath = "", filename = "", options = {} }) { +async function asAudio({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty( options?.whisperProvider ) @@ -48,11 +48,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "audio file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 6e2133b1e38..9ca4de5ce5b 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "", options = {} }) { +async function asDocX({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const loader = new 
DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); @@ -34,11 +34,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "docx file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 15a01b23fc1..4a4fe1e67ee 100644 --- a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "", options = {} }) { +async function asEPub({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); @@ -32,11 +32,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a epub file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "epub file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index 05eff2d5696..f97cf73307f 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -8,7 +8,7 @@ const { const OCRLoader = require("../../utils/OCRLoader"); const { default: slugify } = require("slugify"); -async function asImage({ fullFilePath = "", filename = "", options = {} }) { +async function asImage({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = await new OCRLoader({ targetLanguages: options?.ocr?.langList, }).ocrImage(fullFilePath); @@ -27,11 +27,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a text file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "image file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index a89b54ca9d8..389494f243d 
100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -9,7 +9,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asMbox({ fullFilePath = "", filename = "", options = {} }) { +async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { console.log(`-- Working ${filename} --`); const mails = await mboxParser(fs.createReadStream(fullFilePath)) @@ -43,13 +43,13 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: mail?.subject + title: metadata.title || (mail?.subject ? slugify(mail?.subject?.replace(".", "")) + ".mbox" - : `msg_${item}-${filename}`, - docAuthor: mail?.from?.text, - description: "No description found.", - docSource: "Mbox message file uploaded by the user.", - chunkSource: "", + : `msg_${item}-${filename}`), + docAuthor: metadata.docAuthor || mail?.from?.text, + description: metadata.description || "No description found.", + docSource: metadata.docSource || "Mbox message file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 41a2faa8515..91062495b23 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -12,6 +12,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "", options = {}, + metadata = {} }) { console.log(`-- Working ${filename} --`); let content = ""; @@ -34,11 +35,11 @@ async function asOfficeMime({ const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "Office file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "Office file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index 5971dd4e416..863f95b7bf2 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -9,7 +9,7 @@ const { default: slugify } = require("slugify"); const PDFLoader = require("./PDFLoader"); const OCRLoader = require("../../../utils/OCRLoader"); -async function asPdf({ fullFilePath = "", filename = "", options = {} }) { +async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); @@ -51,11 +51,11 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", - description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || 
docs[0]?.metadata?.pdf?.info?.Creator || "no author found", + description: metadata.description || docs[0]?.metadata?.pdf?.info?.Title || "No description found.", + docSource: metadata.docSource || "pdf file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index 7e3bd92c06e..73b16a7435e 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", filename = "", options = {} }) { +async function asTxt({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); @@ -30,11 +30,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a text file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "a text file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 832e44a7df7..6efad61b50b 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -27,7 +27,7 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { +async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const documents = []; const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, @@ -56,11 +56,11 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { const sheetData = { id: v4(), url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, - title: `${filename} - Sheet:${name}`, - docAuthor: "Unknown", - description: `Spreadsheet data from sheet: ${name}`, - docSource: "an xlsx file uploaded by the user.", - chunkSource: "", + title: metadata.title || `${filename} - Sheet:${name}`, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || `Spreadsheet data from sheet: ${name}`, + docSource: metadata.docSource || "an xlsx file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(/\s+/).length, pageContent: content, diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index a00b139ed4b..8ec2088141b 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -12,7 +12,7 @@ const { } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; -async function processSingleFile(targetFilename, options = {}) { +async function processSingleFile(targetFilename, options = {}, metadata = {}) { const fullFilePath = path.resolve( WATCH_DIRECTORY, 
normalizePath(targetFilename) @@ -70,6 +70,7 @@ async function processSingleFile(targetFilename, options = {}) { fullFilePath, filename: targetFilename, options, + metadata, }); } diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 0795be7a3f8..83b71425744 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -151,7 +151,7 @@ function apiDocumentEndpoints(app) { example: 'my-folder' } #swagger.requestBody = { - description: 'File to be uploaded.', + description: 'File to be uploaded, with optional metadata.', required: true, content: { "multipart/form-data": { @@ -167,6 +167,11 @@ function apiDocumentEndpoints(app) { addToWorkspaces: { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2', + }, + "metadata": { + type: 'object', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ' + example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } } @@ -221,7 +226,8 @@ function apiDocumentEndpoints(app) { */ try { const { originalname } = request.file; - const { addToWorkspaces = "" } = reqBody(request); + const { addToWorkspaces = "", metadata = {} } = reqBody(request); + let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); const targetFolderPath = path.join(documentsPath, folder); @@ -233,6 +239,30 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); + // Validate required metadata keys if present + // Parse JSON string into an object + let metadataObj = {}; + if (metadata && typeof metadata === "string") { + try { + metadataObj = JSON.parse(metadata); + } + catch { + response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); + return; + } + } + + const requiredMetadata = ["title"]; + if ( + metadataObj && Object.keys(metadataObj).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + const Collector = new CollectorApi(); const processingOnline = await Collector.online(); if (!processingOnline) { @@ -246,9 +276,9 @@ function apiDocumentEndpoints(app) { return; } - // Process the uploaded document + // Process the uploaded document with metadata const { success, reason, documents } = - await Collector.processDocument(originalname); + await Collector.processDocument(originalname, metadataObj); if (!success) { response .status(500) diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 40e057ed33d..b8175b72045 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -984,6 +984,9 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error", "content": { @@ -1000,7 +1003,7 @@ } }, "requestBody": { - "description": "File to be uploaded.", + "description": "File to be uploaded, with optional metadata.", "required": true, "content": { "multipart/form-data": { @@ -1018,6 +1021,11 @@ "addToWorkspaces": { "type": "string", "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "example": { "title": "Custom Title", "docAuthor": "Author Name", "description": "A brief description", "docSource": "Source of the document" } } } } diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index c991c299ce8..c651f4cb328 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -63,15 +63,17 @@ class CollectorApi { /** * Process a document - * - Will append the options to the request body + * - Will append the options and optional metadata to the request body * @param {string} filename - The filename of the document to process + * @param {Object} metadata - Optional metadata key:value pairs * @returns {Promise} - The response from the collector API */ - async processDocument(filename = "") { + async processDocument(filename = "", metadata = {}) { if (!filename) return false; const data = JSON.stringify({ filename, + metadata, options: this.#attachOptions(), }); From f0b2bb22af147b544ee1cde9e20d8b2e85d0fb0a Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 17:11:15 -0400 Subject: [PATCH 2/7] Added the ability to pass in metadata to the /document/upload-link endpoint --- collector/index.js | 4 +-- collector/processLink/convert/generic.js | 9 ++--- collector/processLink/index.js | 5 ++- server/endpoints/api/document/index.js | 24 ++++++++++++-- server/swagger/openapi.json | 42 +++++++----------------- server/utils/collectorApi/index.js | 3 +- 6 files changed, 46 insertions(+), 41 deletions(-) diff --git a/collector/index.js b/collector/index.js index 75690533d56..a0dd0e5644d 100644 --- a/collector/index.js +++ b/collector/index.js @@ -95,13 +95,13 @@ app.post( "/process-link", [verifyPayloadIntegrity], async function (request, response) { - const { link, scraperHeaders = {} } = reqBody(request); + const { link, scraperHeaders = {}, metadata = {} } = reqBody(request); try { const { success, reason, documents = [], - } = await processLink(link, scraperHeaders); + } = await processLink(link, scraperHeaders, metadata); response.status(200).json({ url: link, success, reason, documents }); } catch (e) { 
console.error(e); diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index d75aaf41449..af3374b1669 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -20,6 +20,7 @@ async function scrapeGenericUrl({ captureAs = "text", processAsDocument = true, scraperHeaders = {}, + metadata = {}, }) { console.log(`-- Working URL ${link} => (${captureAs}) --`); const content = await getPageContent({ @@ -51,10 +52,10 @@ async function scrapeGenericUrl({ const data = { id: v4(), url: "file://" + slugify(filename) + ".html", - title: slugify(filename) + ".html", - docAuthor: "no author found", - description: "No description found.", - docSource: "URL link uploaded by the user.", + title: metadata.title || slugify(filename) + ".html", + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "URL link uploaded by the user.", chunkSource: `link://${link}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 819b1863f94..05ede7db36c 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic"); * so it can be used for embedding later. * @param {string} link - The link to process * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link + * @param {Object} metadata - Optional metadata to attach to the document * @returns {Promise<{success: boolean, content: string}>} - Response from collector */ -async function processLink(link, scraperHeaders = {}) { +async function processLink(link, scraperHeaders = {}, metadata = {}) { if (!validURL(link)) return { success: false, reason: "Not a valid URL." }; return await scrapeGenericUrl({ link, captureAs: "text", processAsDocument: true, scraperHeaders, + metadata, }); } @@ -31,6 +33,7 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, + metadata: {} }); } diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 83b71425744..a44b80b8896 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -344,7 +344,7 @@ function apiDocumentEndpoints(app) { #swagger.tags = ['Documents'] #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding. Optionally, specify a comma-separated list of workspace slugs to embed the document into post-upload.' 
#swagger.requestBody = { - description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.', + description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.', required: true, content: { "application/json": { @@ -356,6 +356,12 @@ function apiDocumentEndpoints(app) { "scraperHeaders": { "Authorization": "Bearer token123", "My-Custom-Header": "value" + }, + "metadata": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" } } } @@ -399,13 +405,26 @@ function apiDocumentEndpoints(app) { */ try { const Collector = new CollectorApi(); + const requiredMetadata = ["title"]; const { link, addToWorkspaces = "", scraperHeaders = {}, + metadata = {} } = reqBody(request); const processingOnline = await Collector.online(); + // Validate required metadata keys if present + if ( + metadata && Object.keys(metadata).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + if (!processingOnline) { response .status(500) @@ -419,7 +438,8 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processLink( link, - scraperHeaders + scraperHeaders, + metadata ); if (!success) { response diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index b8175b72045..ca71cb9f22a 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -1002,36 +1002,7 @@ } } }, - "requestBody": { - "description": "File to be uploaded, with optional metadata.", - "required": true, - "content": { - "multipart/form-data": { - "schema": { - "type": "object", - "required": [ - "file" - ], - "properties": { - "file": { - "type": "string", - "format": "binary", - "description": "The file to upload" - }, - "addToWorkspaces": { - "type": "string", - "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" - }, - "metadata": { - "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. 
", - "example": { "title": "Custom Title", "docAuthor": "Author Name", "description": "A brief description", "docSource": "Source of the document" } - } - } - } - } - } - } + "requestBody": {} } }, "/v1/document/upload-link": { @@ -1087,12 +1058,15 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error" } }, "requestBody": { - "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.", + "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.", "required": true, "content": { "application/json": { @@ -1104,6 +1078,12 @@ "scraperHeaders": { "Authorization": "Bearer token123", "My-Custom-Header": "value" + }, + "metadata": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" } } } diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index c651f4cb328..66bc5ac3775 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -106,13 +106,14 @@ class CollectorApi { * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL * @returns {Promise} - The response from the collector API */ - async processLink(link = "", scraperHeaders = {}) { + async processLink(link = "", scraperHeaders = {}, metadata = {}) { if (!link) return false; const data = JSON.stringify({ link, scraperHeaders, options: this.#attachOptions(), + metadata: metadata }); return await fetch(`${this.endpoint}/process-link`, { From 2e6101f7a1521298d5912e67eb1ba936a1226ed7 Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 20:27:23 -0400 Subject: [PATCH 3/7] feat: added metadata to document/upload api endpoint --- server/endpoints/api/document/index.js | 40 +++++++++++++++++--- server/swagger/openapi.json | 51 +++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index a44b80b8896..974ccdf04a4 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -29,7 +29,7 @@ function apiDocumentEndpoints(app) { async (request, response) => { /* #swagger.tags = ['Documents'] - #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.' + #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.' #swagger.requestBody = { description: 'File to be uploaded.', required: true, @@ -47,6 +47,11 @@ function apiDocumentEndpoints(app) { addToWorkspaces: { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. 
eg: workspace1,workspace2', + }, + metadata: { + type: 'object', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format.', + example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } }, required: ['file'] @@ -91,7 +96,32 @@ function apiDocumentEndpoints(app) { try { const Collector = new CollectorApi(); const { originalname } = request.file; - const { addToWorkspaces = "" } = reqBody(request); + const { addToWorkspaces = "", metadata = {} } = reqBody(request); + + // Validate required metadata keys if present + // Parse JSON string into an object + let metadataObj = {}; + if (metadata && typeof metadata === "string") { + try { + metadataObj = JSON.parse(metadata); + } + catch { + response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); + return; + } + } + + const requiredMetadata = ["title"]; + if ( + metadataObj && Object.keys(metadataObj).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + const processingOnline = await Collector.online(); if (!processingOnline) { @@ -106,7 +136,7 @@ function apiDocumentEndpoints(app) { } const { success, reason, documents } = - await Collector.processDocument(originalname); + await Collector.processDocument(originalname, metadataObj); if (!success) { response .status(500) @@ -168,9 +198,9 @@ function apiDocumentEndpoints(app) { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2', }, - "metadata": { + metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ' + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index ca71cb9f22a..b12960fade0 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -843,7 +843,7 @@ "tags": [ "Documents" ], - "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding.", + "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.", "parameters": [], "responses": { "200": { @@ -890,6 +890,9 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error" } @@ -913,6 +916,16 @@ "addToWorkspaces": { "type": "string", "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. 
eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format.", + "example": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" + } } } } @@ -1002,7 +1015,41 @@ } } }, - "requestBody": {} + "requestBody": { + "description": "File to be uploaded, with optional metadata.", + "required": true, + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "required": [ + "file" + ], + "properties": { + "file": { + "type": "string", + "format": "binary", + "description": "The file to upload" + }, + "addToWorkspaces": { + "type": "string", + "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "example": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" + } + } + } + } + } + } + } } }, "/v1/document/upload-link": { From 65f412e697678ce8a52d1b531e6240708be4ea21 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 3 Sep 2025 16:39:20 -0700 Subject: [PATCH 4/7] simplify optional metadata in document dev api endpoints --- server/endpoints/api/document/index.js | 81 +++++--------------------- server/swagger/openapi.json | 9 --- 2 files changed, 15 insertions(+), 75 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 974ccdf04a4..de3e42eb14c 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -8,7 +8,7 @@ const { normalizePath, isWithin, } = require("../../../utils/files"); -const { reqBody } = require("../../../utils/http"); +const { reqBody, safeJsonParse } = require("../../../utils/http"); const { EventLogs } = require("../../../models/eventLogs"); const { CollectorApi } = require("../../../utils/collectorApi"); const fs = require("fs"); @@ -98,29 +98,7 @@ function apiDocumentEndpoints(app) { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - // Validate required metadata keys if present - // Parse JSON string into an object - let metadataObj = {}; - if (metadata && typeof metadata === "string") { - try { - metadataObj = JSON.parse(metadata); - } - catch { - response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); - return; - } - } - - const requiredMetadata = ["title"]; - if ( - metadataObj && Object.keys(metadataObj).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } + const metadataObj = safeJsonParse(metadata, {}); const processingOnline = await Collector.online(); @@ -135,8 +113,10 @@ function apiDocumentEndpoints(app) { return; } - const { success, reason, documents } = - await Collector.processDocument(originalname, metadataObj); + const { success, reason, documents } = await Collector.processDocument( + originalname, + metadataObj + ); if (!success) { response .status(500) @@ -257,7 +237,7 @@ function apiDocumentEndpoints(app) { try { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - + let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); const targetFolderPath = path.join(documentsPath, folder); @@ -269,29 +249,7 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - // Validate required metadata keys if present - // Parse JSON string into an object - let metadataObj = {}; - if (metadata && typeof metadata === "string") { - try { - metadataObj = JSON.parse(metadata); - } - catch { - response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); - return; - } - } - - const requiredMetadata = ["title"]; - if ( - metadataObj && Object.keys(metadataObj).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } + const metadataObj = safeJsonParse(metadata, {}); const Collector = new CollectorApi(); const processingOnline = await Collector.online(); @@ -307,8 +265,10 @@ function apiDocumentEndpoints(app) { } // Process the uploaded document with metadata - const { success, reason, documents } = - await Collector.processDocument(originalname, metadataObj); + const { success, reason, documents } = await Collector.processDocument( + originalname, + metadataObj + ); if (!success) { response .status(500) @@ -435,26 +395,15 @@ function apiDocumentEndpoints(app) { */ try { const Collector = new CollectorApi(); - const requiredMetadata = ["title"]; const { link, addToWorkspaces = "", scraperHeaders = {}, - metadata = {} + metadata = {}, } = reqBody(request); + const metadataObj = safeJsonParse(metadata, {}); const processingOnline = await Collector.online(); - // Validate required metadata keys if present - if ( - metadata && Object.keys(metadata).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } - if (!processingOnline) { response .status(500) @@ -469,7 +418,7 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processLink( link, scraperHeaders, - metadata + metadataObj ); if (!success) { response diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index b12960fade0..62665b55cfc 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -890,9 +890,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error" } @@ -997,9 +994,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error", "content": { @@ -1105,9 +1099,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error" } From 34edeefcad9efaf0f2fc8c59589ebeecfdea5b31 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 3 Sep 2025 16:39:48 -0700 Subject: [PATCH 5/7] lint --- collector/processLink/index.js | 2 +- collector/processSingleFile/convert/asAudio.js | 7 ++++++- collector/processSingleFile/convert/asDocx.js | 7 ++++++- collector/processSingleFile/convert/asEPub.js | 7 ++++++- collector/processSingleFile/convert/asImage.js | 7 ++++++- collector/processSingleFile/convert/asMbox.js | 18 +++++++++++++----- .../processSingleFile/convert/asOfficeMime.js | 2 +- .../processSingleFile/convert/asPDF/index.js | 17 ++++++++++++++--- collector/processSingleFile/convert/asTxt.js | 7 ++++++- collector/processSingleFile/convert/asXlsx.js | 10 ++++++++-- server/utils/collectorApi/index.js | 2 +- 11 files changed, 68 insertions(+), 18 deletions(-) diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 05ede7db36c..5406092684d 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -33,7 +33,7 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, - metadata: {} + metadata: {}, }); } diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index ee306ea84f5..b4ddc8eb76d 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = { local: LocalWhisper, }; -async function asAudio({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asAudio({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty( options?.whisperProvider ) diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 9ca4de5ce5b..1f77e772002 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,12 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asDocX({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const loader = new DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 4a4fe1e67ee..283eb1e2018 100644 --- 
a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,12 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asEPub({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index f97cf73307f..77052f14a18 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -8,7 +8,12 @@ const { const OCRLoader = require("../../utils/OCRLoader"); const { default: slugify } = require("slugify"); -async function asImage({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asImage({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = await new OCRLoader({ targetLanguages: options?.ocr?.langList, }).ocrImage(fullFilePath); diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 389494f243d..8927616a019 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -9,7 +9,12 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asMbox({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { console.log(`-- Working ${filename} --`); const mails = await mboxParser(fs.createReadStream(fullFilePath)) @@ -43,12 +48,15 @@ async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata const data = { id: v4(), url: "file://" + fullFilePath, - title: metadata.title || (mail?.subject - ? slugify(mail?.subject?.replace(".", "")) + ".mbox" - : `msg_${item}-${filename}`), + title: + metadata.title || + (mail?.subject + ? 
slugify(mail?.subject?.replace(".", "")) + ".mbox" + : `msg_${item}-${filename}`), docAuthor: metadata.docAuthor || mail?.from?.text, description: metadata.description || "No description found.", - docSource: metadata.docSource || "Mbox message file uploaded by the user.", + docSource: + metadata.docSource || "Mbox message file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 91062495b23..dcd084144eb 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -12,7 +12,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "", options = {}, - metadata = {} + metadata = {}, }) { console.log(`-- Working ${filename} --`); let content = ""; diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index 863f95b7bf2..bacfdaf53bf 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -9,7 +9,12 @@ const { default: slugify } = require("slugify"); const PDFLoader = require("./PDFLoader"); const OCRLoader = require("../../../utils/OCRLoader"); -async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asPdf({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); @@ -52,8 +57,14 @@ async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata id: v4(), url: "file://" + fullFilePath, title: metadata.title || filename, - docAuthor: metadata.docAuthor || docs[0]?.metadata?.pdf?.info?.Creator || "no author found", - description: metadata.description || docs[0]?.metadata?.pdf?.info?.Title || "No description found.", + docAuthor: + metadata.docAuthor || + docs[0]?.metadata?.pdf?.info?.Creator || + "no author found", + description: + metadata.description || + docs[0]?.metadata?.pdf?.info?.Title || + "No description found.", docSource: metadata.docSource || "pdf file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index 73b16a7435e..d32cebce551 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,12 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asTxt({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 6efad61b50b..c01bc86688c 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -27,7 +27,12 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asXlsx({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const documents = []; const folderName = 
slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, @@ -58,7 +63,8 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, title: metadata.title || `${filename} - Sheet:${name}`, docAuthor: metadata.docAuthor || "Unknown", - description: metadata.description || `Spreadsheet data from sheet: ${name}`, + description: + metadata.description || `Spreadsheet data from sheet: ${name}`, docSource: metadata.docSource || "an xlsx file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 66bc5ac3775..22a69f436f0 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -113,7 +113,7 @@ class CollectorApi { link, scraperHeaders, options: this.#attachOptions(), - metadata: metadata + metadata: metadata, }); return await fetch(`${this.endpoint}/process-link`, { From 2c4095697621d7489a6cdc64a9a6fa7698958e3a Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Mon, 8 Sep 2025 15:26:37 -0700 Subject: [PATCH 6/7] patch handling of metadata in dev api --- server/endpoints/api/document/index.js | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index de3e42eb14c..397c314cafd 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -98,7 +98,8 @@ function apiDocumentEndpoints(app) { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const processingOnline = await Collector.online(); @@ -249,7 +250,8 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const Collector = new CollectorApi(); const processingOnline = await Collector.online(); @@ -401,7 +403,8 @@ function apiDocumentEndpoints(app) { scraperHeaders = {}, metadata = {}, } = reqBody(request); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const processingOnline = await Collector.online(); if (!processingOnline) { From 5b9c8ad3095f1b3304739976174abd0a2fea2212 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 17 Sep 2025 10:54:45 -0700 Subject: [PATCH 7/7] Linting, small comments --- collector/processLink/convert/generic.js | 1 + collector/processLink/index.js | 1 - collector/processSingleFile/index.js | 7 +++ server/endpoints/api/document/index.js | 73 ++++++++++++------------ server/swagger/openapi.json | 4 +- server/utils/collectorApi/index.js | 3 +- 6 files changed, 49 insertions(+), 40 deletions(-) diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index af3374b1669..84589197749 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -13,6 +13,7 @@ const { default: slugify } = require("slugify"); * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. 
Default is 'text' * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request + * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document * @returns {Promise} - The content of the page */ async function scrapeGenericUrl({ diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 5406092684d..bcbbfb9e659 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -33,7 +33,6 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, - metadata: {}, }); } diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 8ec2088141b..3bf31a70004 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -12,6 +12,13 @@ const { } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; +/** + * Process a single file and return the documents + * @param {string} targetFilename - The filename to process + * @param {Object} options - The options for the file processing + * @param {Object} metadata - The metadata for the file processing + * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing + */ async function processSingleFile(targetFilename, options = {}, metadata = {}) { const fullFilePath = path.resolve( WATCH_DIRECTORY, diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 397c314cafd..3226a740b2b 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -50,7 +50,7 @@ function apiDocumentEndpoints(app) { }, metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format.', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } }, @@ -96,11 +96,12 @@ function apiDocumentEndpoints(app) { try { const Collector = new CollectorApi(); const { originalname } = request.file; - const { addToWorkspaces = "", metadata = {} } = reqBody(request); - - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; - + const { addToWorkspaces = "", metadata: _metadata = {} } = + reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { @@ -116,14 +117,14 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processDocument( originalname, - metadataObj + metadata ); + if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( @@ -181,7 +182,7 @@ function apiDocumentEndpoints(app) { }, metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. 
Only specific keys are allowed - see example.', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } @@ -237,7 +238,12 @@ function apiDocumentEndpoints(app) { */ try { const { originalname } = request.file; - const { addToWorkspaces = "", metadata = {} } = reqBody(request); + const { addToWorkspaces = "", metadata: _metadata = {} } = + reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); @@ -250,33 +256,28 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; - const Collector = new CollectorApi(); const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`, }) .end(); - return; } // Process the uploaded document with metadata const { success, reason, documents } = await Collector.processDocument( originalname, - metadataObj + metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } // For each processed document, check if it is already in the desired folder. @@ -401,34 +402,34 @@ function apiDocumentEndpoints(app) { link, addToWorkspaces = "", scraperHeaders = {}, - metadata = {}, + metadata: _metadata = {}, } = reqBody(request); - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. Link ${link} will not be processed automatically.`, }) .end(); - return; } const { success, reason, documents } = await Collector.processLink( link, scraperHeaders, - metadataObj + metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( @@ -520,20 +521,23 @@ function apiDocumentEndpoints(app) { const requiredMetadata = ["title"]; const { textContent, - metadata = {}, + metadata: _metadata = {}, addToWorkspaces = "", } = reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. 
Request will not be processed.`, }) .end(); - return; } if ( @@ -542,7 +546,7 @@ function apiDocumentEndpoints(app) { Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] ) ) { - response + return response .status(422) .json({ success: false, @@ -551,18 +555,16 @@ function apiDocumentEndpoints(app) { .join(", ")}`, }) .end(); - return; } if (!textContent || textContent?.length === 0) { - response + return response .status(422) .json({ success: false, error: `The 'textContent' key cannot have an empty value.`, }) .end(); - return; } const { success, reason, documents } = await Collector.processRawText( @@ -570,11 +572,10 @@ function apiDocumentEndpoints(app) { metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 62665b55cfc..fcb87dad120 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -916,7 +916,7 @@ }, "metadata": { "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format.", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.", "example": { "title": "Custom Title", "docAuthor": "Author Name", @@ -1031,7 +1031,7 @@ }, "metadata": { "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.", "example": { "title": "Custom Title", "docAuthor": "Author Name", diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 22a69f436f0..ef56f0c9255 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -104,6 +104,7 @@ class CollectorApi { * - Will append the options to the request body * @param {string} link - The link to process * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL + * @param {[key: string]: string} metadata - Optional metadata to attach to the document * @returns {Promise} - The response from the collector API */ async processLink(link = "", scraperHeaders = {}, metadata = {}) { @@ -142,7 +143,7 @@ class CollectorApi { * Process raw text as a document for the collector * - Will append the options to the request body * @param {string} textContent - The text to process - * @param {Object} metadata - The metadata to process + * @param {[key: string]: string} metadata - The metadata to process * @returns {Promise} - The response from the collector API */ async processRawText(textContent = "", metadata = {}) {
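
Example usage of the metadata support added by this series — a minimal client sketch, not a definitive integration. It assumes an AnythingLLM server reachable at http://localhost:3001 with the dev API mounted under /api/v1 and a valid API key in ANYTHINGLLM_API_KEY; only the endpoint paths, field names, and recognized metadata keys (title, docAuthor, description, docSource, chunkSource) come from the diffs above. Note the asymmetry the later patches handle explicitly: the multipart /document/upload endpoints receive metadata as a JSON *string* (parsed server-side via safeJsonParse), while /document/upload-link accepts it as a plain JSON object.

// upload-with-metadata.mjs — Node 18+ (built-in fetch, FormData, Blob)
const BASE = "http://localhost:3001/api/v1"; // assumed local instance; adjust for your deployment
const API_KEY = process.env.ANYTHINGLLM_API_KEY; // assumed valid dev API key

// Recognized keys per the converters: title, docAuthor, description, docSource, chunkSource.
const metadata = {
  title: "Custom Title",
  docAuthor: "Author Name",
  description: "A brief description",
  docSource: "Source of the document",
};

// 1) Multipart file upload: metadata must be stringified into a form field.
const form = new FormData();
form.append("file", new Blob(["hello world"], { type: "text/plain" }), "hello.txt");
form.append("addToWorkspaces", "workspace1,workspace2");
form.append("metadata", JSON.stringify(metadata));

const uploadRes = await fetch(`${BASE}/document/upload`, {
  method: "POST",
  headers: { Authorization: `Bearer ${API_KEY}` },
  body: form, // fetch sets the multipart boundary header itself
});
console.log("upload:", await uploadRes.json());

// 2) Link scrape: metadata rides along as a plain object in the JSON body.
const linkRes = await fetch(`${BASE}/document/upload-link`, {
  method: "POST",
  headers: {
    Authorization: `Bearer ${API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    link: "https://example.com",
    addToWorkspaces: "workspace1",
    metadata, // overrides title/docAuthor/description/docSource in the produced document
  }),
});
console.log("upload-link:", await linkRes.json());

Each converter falls back to its previous defaults for any metadata key left unset, so a partial object such as { title: "Custom Title" } is safe to send.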