From 27f652b2aa1942121a7d63f2440816d8175ad6a6 Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 16:18:48 -0400 Subject: [PATCH 1/7] Added the ability to pass in metadata to the /document/upload/{folderName} endpoint --- collector/index.js | 4 +- .../processSingleFile/convert/asAudio.js | 12 +++--- collector/processSingleFile/convert/asDocx.js | 12 +++--- collector/processSingleFile/convert/asEPub.js | 12 +++--- .../processSingleFile/convert/asImage.js | 12 +++--- collector/processSingleFile/convert/asMbox.js | 14 +++---- .../processSingleFile/convert/asOfficeMime.js | 11 +++--- .../processSingleFile/convert/asPDF/index.js | 12 +++--- collector/processSingleFile/convert/asTxt.js | 12 +++--- collector/processSingleFile/convert/asXlsx.js | 12 +++--- collector/processSingleFile/index.js | 3 +- server/endpoints/api/document/index.js | 38 +++++++++++++++++-- server/swagger/openapi.json | 10 ++++- server/utils/collectorApi/index.js | 6 ++- 14 files changed, 106 insertions(+), 64 deletions(-) diff --git a/collector/index.js b/collector/index.js index 962ee6f8c65..75690533d56 100644 --- a/collector/index.js +++ b/collector/index.js @@ -32,7 +32,7 @@ app.post( "/process", [verifyPayloadIntegrity], async function (request, response) { - const { filename, options = {} } = reqBody(request); + const { filename, options = {}, metadata = {} } = reqBody(request); try { const targetFilename = path .normalize(filename) @@ -41,7 +41,7 @@ app.post( success, reason, documents = [], - } = await processSingleFile(targetFilename, options); + } = await processSingleFile(targetFilename, options, metadata); response .status(200) .json({ filename: targetFilename, success, reason, documents }); diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index dccb2b9522a..ee306ea84f5 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -14,7 +14,7 @@ const WHISPER_PROVIDERS = { local: LocalWhisper, }; -async function asAudio({ fullFilePath = "", filename = "", options = {} }) { +async function asAudio({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty( options?.whisperProvider ) @@ -48,11 +48,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "audio file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 6e2133b1e38..9ca4de5ce5b 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "", options = {} }) { +async function asDocX({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const loader = new 
DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); @@ -34,11 +34,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "docx file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 15a01b23fc1..4a4fe1e67ee 100644 --- a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "", options = {} }) { +async function asEPub({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); @@ -32,11 +32,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a epub file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "epub file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index 05eff2d5696..f97cf73307f 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -8,7 +8,7 @@ const { const OCRLoader = require("../../utils/OCRLoader"); const { default: slugify } = require("slugify"); -async function asImage({ fullFilePath = "", filename = "", options = {} }) { +async function asImage({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = await new OCRLoader({ targetLanguages: options?.ocr?.langList, }).ocrImage(fullFilePath); @@ -27,11 +27,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a text file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "image file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index a89b54ca9d8..389494f243d 
100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -9,7 +9,7 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asMbox({ fullFilePath = "", filename = "", options = {} }) { +async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { console.log(`-- Working ${filename} --`); const mails = await mboxParser(fs.createReadStream(fullFilePath)) @@ -43,13 +43,13 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: mail?.subject + title: metadata.title || (mail?.subject ? slugify(mail?.subject?.replace(".", "")) + ".mbox" - : `msg_${item}-${filename}`, - docAuthor: mail?.from?.text, - description: "No description found.", - docSource: "Mbox message file uploaded by the user.", - chunkSource: "", + : `msg_${item}-${filename}`), + docAuthor: metadata.docAuthor || mail?.from?.text, + description: metadata.description || "No description found.", + docSource: metadata.docSource || "Mbox message file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 41a2faa8515..91062495b23 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -12,6 +12,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "", options = {}, + metadata = {} }) { console.log(`-- Working ${filename} --`); let content = ""; @@ -34,11 +35,11 @@ async function asOfficeMime({ const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "no author found", - description: "No description found.", - docSource: "Office file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "Office file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index 5971dd4e416..863f95b7bf2 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -9,7 +9,7 @@ const { default: slugify } = require("slugify"); const PDFLoader = require("./PDFLoader"); const OCRLoader = require("../../../utils/OCRLoader"); -async function asPdf({ fullFilePath = "", filename = "", options = {} }) { +async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); @@ -51,11 +51,11 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", - description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.", - docSource: "pdf file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || 
docs[0]?.metadata?.pdf?.info?.Creator || "no author found", + description: metadata.description || docs[0]?.metadata?.pdf?.info?.Title || "No description found.", + docSource: metadata.docSource || "pdf file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index 7e3bd92c06e..73b16a7435e 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,7 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", filename = "", options = {} }) { +async function asTxt({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); @@ -30,11 +30,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) { const data = { id: v4(), url: "file://" + fullFilePath, - title: filename, - docAuthor: "Unknown", // TODO: Find a better author - description: "Unknown", // TODO: Find a better description - docSource: "a text file uploaded by the user.", - chunkSource: "", + title: metadata.title || filename, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || "Unknown", + docSource: metadata.docSource || "a text file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 832e44a7df7..6efad61b50b 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -27,7 +27,7 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { +async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { const documents = []; const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, @@ -56,11 +56,11 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) { const sheetData = { id: v4(), url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, - title: `${filename} - Sheet:${name}`, - docAuthor: "Unknown", - description: `Spreadsheet data from sheet: ${name}`, - docSource: "an xlsx file uploaded by the user.", - chunkSource: "", + title: metadata.title || `${filename} - Sheet:${name}`, + docAuthor: metadata.docAuthor || "Unknown", + description: metadata.description || `Spreadsheet data from sheet: ${name}`, + docSource: metadata.docSource || "an xlsx file uploaded by the user.", + chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(/\s+/).length, pageContent: content, diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index a00b139ed4b..8ec2088141b 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -12,7 +12,7 @@ const { } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; -async function processSingleFile(targetFilename, options = {}) { +async function processSingleFile(targetFilename, options = {}, metadata = {}) { const fullFilePath = path.resolve( WATCH_DIRECTORY, 
normalizePath(targetFilename) @@ -70,6 +70,7 @@ async function processSingleFile(targetFilename, options = {}) { fullFilePath, filename: targetFilename, options, + metadata, }); } diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 0795be7a3f8..83b71425744 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -151,7 +151,7 @@ function apiDocumentEndpoints(app) { example: 'my-folder' } #swagger.requestBody = { - description: 'File to be uploaded.', + description: 'File to be uploaded, with optional metadata.', required: true, content: { "multipart/form-data": { @@ -167,6 +167,11 @@ function apiDocumentEndpoints(app) { addToWorkspaces: { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2', + }, + "metadata": { + type: 'object', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ' + example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } } @@ -221,7 +226,8 @@ function apiDocumentEndpoints(app) { */ try { const { originalname } = request.file; - const { addToWorkspaces = "" } = reqBody(request); + const { addToWorkspaces = "", metadata = {} } = reqBody(request); + let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); const targetFolderPath = path.join(documentsPath, folder); @@ -233,6 +239,30 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); + // Validate required metadata keys if present + // Parse JSON string into an object + let metadataObj = {}; + if (metadata && typeof metadata === "string") { + try { + metadataObj = JSON.parse(metadata); + } + catch { + response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); + return; + } + } + + const requiredMetadata = ["title"]; + if ( + metadataObj && Object.keys(metadataObj).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + const Collector = new CollectorApi(); const processingOnline = await Collector.online(); if (!processingOnline) { @@ -246,9 +276,9 @@ function apiDocumentEndpoints(app) { return; } - // Process the uploaded document + // Process the uploaded document with metadata const { success, reason, documents } = - await Collector.processDocument(originalname); + await Collector.processDocument(originalname, metadataObj); if (!success) { response .status(500) diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 40e057ed33d..b8175b72045 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -984,6 +984,9 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error", "content": { @@ -1000,7 +1003,7 @@ } }, "requestBody": { - "description": "File to be uploaded.", + "description": "File to be uploaded, with optional metadata.", "required": true, "content": { "multipart/form-data": { @@ -1018,6 +1021,11 @@ "addToWorkspaces": { "type": "string", "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "example": { "title": "Custom Title", "docAuthor": "Author Name", "description": "A brief description", "docSource": "Source of the document" } } } } diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index c991c299ce8..c651f4cb328 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -63,15 +63,17 @@ class CollectorApi { /** * Process a document - * - Will append the options to the request body + * - Will append the options and optional metadata to the request body * @param {string} filename - The filename of the document to process + * @param {Object} metadata - Optional metadata key:value pairs * @returns {Promise} - The response from the collector API */ - async processDocument(filename = "") { + async processDocument(filename = "", metadata = {}) { if (!filename) return false; const data = JSON.stringify({ filename, + metadata, options: this.#attachOptions(), }); From f0b2bb22af147b544ee1cde9e20d8b2e85d0fb0a Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 17:11:15 -0400 Subject: [PATCH 2/7] Added the ability to pass in metadata to the /document/upload-link endpoint --- collector/index.js | 4 +-- collector/processLink/convert/generic.js | 9 ++--- collector/processLink/index.js | 5 ++- server/endpoints/api/document/index.js | 24 ++++++++++++-- server/swagger/openapi.json | 42 +++++++----------------- server/utils/collectorApi/index.js | 3 +- 6 files changed, 46 insertions(+), 41 deletions(-) diff --git a/collector/index.js b/collector/index.js index 75690533d56..a0dd0e5644d 100644 --- a/collector/index.js +++ b/collector/index.js @@ -95,13 +95,13 @@ app.post( "/process-link", [verifyPayloadIntegrity], async function (request, response) { - const { link, scraperHeaders = {} } = reqBody(request); + const { link, scraperHeaders = {}, metadata = {} } = reqBody(request); try { const { success, reason, documents = [], - } = await processLink(link, scraperHeaders); + } = await processLink(link, scraperHeaders, metadata); response.status(200).json({ url: link, success, reason, documents }); } catch (e) { 
console.error(e); diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index d75aaf41449..af3374b1669 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -20,6 +20,7 @@ async function scrapeGenericUrl({ captureAs = "text", processAsDocument = true, scraperHeaders = {}, + metadata = {}, }) { console.log(`-- Working URL ${link} => (${captureAs}) --`); const content = await getPageContent({ @@ -51,10 +52,10 @@ async function scrapeGenericUrl({ const data = { id: v4(), url: "file://" + slugify(filename) + ".html", - title: slugify(filename) + ".html", - docAuthor: "no author found", - description: "No description found.", - docSource: "URL link uploaded by the user.", + title: metadata.title || slugify(filename) + ".html", + docAuthor: metadata.docAuthor || "no author found", + description: metadata.description || "No description found.", + docSource: metadata.docSource || "URL link uploaded by the user.", chunkSource: `link://${link}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 819b1863f94..05ede7db36c 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic"); * so it can be used for embedding later. * @param {string} link - The link to process * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link + * @param {Object} metadata - Optional metadata to attach to the document * @returns {Promise<{success: boolean, content: string}>} - Response from collector */ -async function processLink(link, scraperHeaders = {}) { +async function processLink(link, scraperHeaders = {}, metadata = {}) { if (!validURL(link)) return { success: false, reason: "Not a valid URL." }; return await scrapeGenericUrl({ link, captureAs: "text", processAsDocument: true, scraperHeaders, + metadata, }); } @@ -31,6 +33,7 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, + metadata: {} }); } diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 83b71425744..a44b80b8896 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -344,7 +344,7 @@ function apiDocumentEndpoints(app) { #swagger.tags = ['Documents'] #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding. Optionally, specify a comma-separated list of workspace slugs to embed the document into post-upload.' 
#swagger.requestBody = { - description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.', + description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.', required: true, content: { "application/json": { @@ -356,6 +356,12 @@ function apiDocumentEndpoints(app) { "scraperHeaders": { "Authorization": "Bearer token123", "My-Custom-Header": "value" + }, + "metadata": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" } } } @@ -399,13 +405,26 @@ function apiDocumentEndpoints(app) { */ try { const Collector = new CollectorApi(); + const requiredMetadata = ["title"]; const { link, addToWorkspaces = "", scraperHeaders = {}, + metadata = {} } = reqBody(request); const processingOnline = await Collector.online(); + // Validate required metadata keys if present + if ( + metadata && Object.keys(metadata).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + if (!processingOnline) { response .status(500) @@ -419,7 +438,8 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processLink( link, - scraperHeaders + scraperHeaders, + metadata ); if (!success) { response diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index b8175b72045..ca71cb9f22a 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -1002,36 +1002,7 @@ } } }, - "requestBody": { - "description": "File to be uploaded, with optional metadata.", - "required": true, - "content": { - "multipart/form-data": { - "schema": { - "type": "object", - "required": [ - "file" - ], - "properties": { - "file": { - "type": "string", - "format": "binary", - "description": "The file to upload" - }, - "addToWorkspaces": { - "type": "string", - "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" - }, - "metadata": { - "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. 
", - "example": { "title": "Custom Title", "docAuthor": "Author Name", "description": "A brief description", "docSource": "Source of the document" } - } - } - } - } - } - } + "requestBody": {} } }, "/v1/document/upload-link": { @@ -1087,12 +1058,15 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error" } }, "requestBody": { - "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.", + "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.", "required": true, "content": { "application/json": { @@ -1104,6 +1078,12 @@ "scraperHeaders": { "Authorization": "Bearer token123", "My-Custom-Header": "value" + }, + "metadata": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" } } } diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index c651f4cb328..66bc5ac3775 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -106,13 +106,14 @@ class CollectorApi { * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL * @returns {Promise} - The response from the collector API */ - async processLink(link = "", scraperHeaders = {}) { + async processLink(link = "", scraperHeaders = {}, metadata = {}) { if (!link) return false; const data = JSON.stringify({ link, scraperHeaders, options: this.#attachOptions(), + metadata: metadata }); return await fetch(`${this.endpoint}/process-link`, { From 2e6101f7a1521298d5912e67eb1ba936a1226ed7 Mon Sep 17 00:00:00 2001 From: jstawskigmi Date: Sat, 30 Aug 2025 20:27:23 -0400 Subject: [PATCH 3/7] feat: added metadata to document/upload api endpoint --- server/endpoints/api/document/index.js | 40 +++++++++++++++++--- server/swagger/openapi.json | 51 +++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index a44b80b8896..974ccdf04a4 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -29,7 +29,7 @@ function apiDocumentEndpoints(app) { async (request, response) => { /* #swagger.tags = ['Documents'] - #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.' + #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.' #swagger.requestBody = { description: 'File to be uploaded.', required: true, @@ -47,6 +47,11 @@ function apiDocumentEndpoints(app) { addToWorkspaces: { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. 
eg: workspace1,workspace2', + }, + metadata: { + type: 'object', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format.', + example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } }, required: ['file'] @@ -91,7 +96,32 @@ function apiDocumentEndpoints(app) { try { const Collector = new CollectorApi(); const { originalname } = request.file; - const { addToWorkspaces = "" } = reqBody(request); + const { addToWorkspaces = "", metadata = {} } = reqBody(request); + + // Validate required metadata keys if present + // Parse JSON string into an object + let metadataObj = {}; + if (metadata && typeof metadata === "string") { + try { + metadataObj = JSON.parse(metadata); + } + catch { + response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); + return; + } + } + + const requiredMetadata = ["title"]; + if ( + metadataObj && Object.keys(metadataObj).length > 0 && + !requiredMetadata.every( + (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] + ) + ) { + response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); + return; + } + const processingOnline = await Collector.online(); if (!processingOnline) { @@ -106,7 +136,7 @@ function apiDocumentEndpoints(app) { } const { success, reason, documents } = - await Collector.processDocument(originalname); + await Collector.processDocument(originalname, metadataObj); if (!success) { response .status(500) @@ -168,9 +198,9 @@ function apiDocumentEndpoints(app) { type: 'string', description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2', }, - "metadata": { + metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ' + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index ca71cb9f22a..b12960fade0 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -843,7 +843,7 @@ "tags": [ "Documents" ], - "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding.", + "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.", "parameters": [], "responses": { "200": { @@ -890,6 +890,9 @@ } } }, + "422": { + "description": "Unprocessable Entity" + }, "500": { "description": "Internal Server Error" } @@ -913,6 +916,16 @@ "addToWorkspaces": { "type": "string", "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. 
eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format.", + "example": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" + } } } } @@ -1002,7 +1015,41 @@ } } }, - "requestBody": {} + "requestBody": { + "description": "File to be uploaded, with optional metadata.", + "required": true, + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "required": [ + "file" + ], + "properties": { + "file": { + "type": "string", + "format": "binary", + "description": "The file to upload" + }, + "addToWorkspaces": { + "type": "string", + "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2" + }, + "metadata": { + "type": "object", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "example": { + "title": "Custom Title", + "docAuthor": "Author Name", + "description": "A brief description", + "docSource": "Source of the document" + } + } + } + } + } + } + } } }, "/v1/document/upload-link": { From 65f412e697678ce8a52d1b531e6240708be4ea21 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 3 Sep 2025 16:39:20 -0700 Subject: [PATCH 4/7] simplify optional metadata in document dev api endpoints --- server/endpoints/api/document/index.js | 81 +++++--------------------- server/swagger/openapi.json | 9 --- 2 files changed, 15 insertions(+), 75 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 974ccdf04a4..de3e42eb14c 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -8,7 +8,7 @@ const { normalizePath, isWithin, } = require("../../../utils/files"); -const { reqBody } = require("../../../utils/http"); +const { reqBody, safeJsonParse } = require("../../../utils/http"); const { EventLogs } = require("../../../models/eventLogs"); const { CollectorApi } = require("../../../utils/collectorApi"); const fs = require("fs"); @@ -98,29 +98,7 @@ function apiDocumentEndpoints(app) { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - // Validate required metadata keys if present - // Parse JSON string into an object - let metadataObj = {}; - if (metadata && typeof metadata === "string") { - try { - metadataObj = JSON.parse(metadata); - } - catch { - response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); - return; - } - } - - const requiredMetadata = ["title"]; - if ( - metadataObj && Object.keys(metadataObj).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } + const metadataObj = safeJsonParse(metadata, {}); const processingOnline = await Collector.online(); @@ -135,8 +113,10 @@ function apiDocumentEndpoints(app) { return; } - const { success, reason, documents } = - await Collector.processDocument(originalname, metadataObj); + const { success, reason, documents } = await Collector.processDocument( + originalname, + metadataObj + ); if (!success) { response .status(500) @@ -257,7 +237,7 @@ function apiDocumentEndpoints(app) { try { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - + let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); const targetFolderPath = path.join(documentsPath, folder); @@ -269,29 +249,7 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - // Validate required metadata keys if present - // Parse JSON string into an object - let metadataObj = {}; - if (metadata && typeof metadata === "string") { - try { - metadataObj = JSON.parse(metadata); - } - catch { - response.status(422).json({ success: false, error: 'Invalid metadata' }).end(); - return; - } - } - - const requiredMetadata = ["title"]; - if ( - metadataObj && Object.keys(metadataObj).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadataObj).includes(reqKey) && !!metadataObj[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } + const metadataObj = safeJsonParse(metadata, {}); const Collector = new CollectorApi(); const processingOnline = await Collector.online(); @@ -307,8 +265,10 @@ function apiDocumentEndpoints(app) { } // Process the uploaded document with metadata - const { success, reason, documents } = - await Collector.processDocument(originalname, metadataObj); + const { success, reason, documents } = await Collector.processDocument( + originalname, + metadataObj + ); if (!success) { response .status(500) @@ -435,26 +395,15 @@ function apiDocumentEndpoints(app) { */ try { const Collector = new CollectorApi(); - const requiredMetadata = ["title"]; const { link, addToWorkspaces = "", scraperHeaders = {}, - metadata = {} + metadata = {}, } = reqBody(request); + const metadataObj = safeJsonParse(metadata, {}); const processingOnline = await Collector.online(); - // Validate required metadata keys if present - if ( - metadata && Object.keys(metadata).length > 0 && - !requiredMetadata.every( - (reqKey) => Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] - ) - ) { - response.status(422).json({ success: false, error: `You are missing required metadata key:value pairs in your request. 
Required metadata key:values are ${requiredMetadata.map((v) => `'${v}'`).join(", ")}` }).end(); - return; - } - if (!processingOnline) { response .status(500) @@ -469,7 +418,7 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processLink( link, scraperHeaders, - metadata + metadataObj ); if (!success) { response diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index b12960fade0..62665b55cfc 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -890,9 +890,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error" } @@ -997,9 +994,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error", "content": { @@ -1105,9 +1099,6 @@ } } }, - "422": { - "description": "Unprocessable Entity" - }, "500": { "description": "Internal Server Error" } From 34edeefcad9efaf0f2fc8c59589ebeecfdea5b31 Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Wed, 3 Sep 2025 16:39:48 -0700 Subject: [PATCH 5/7] lint --- collector/processLink/index.js | 2 +- collector/processSingleFile/convert/asAudio.js | 7 ++++++- collector/processSingleFile/convert/asDocx.js | 7 ++++++- collector/processSingleFile/convert/asEPub.js | 7 ++++++- collector/processSingleFile/convert/asImage.js | 7 ++++++- collector/processSingleFile/convert/asMbox.js | 18 +++++++++++++----- .../processSingleFile/convert/asOfficeMime.js | 2 +- .../processSingleFile/convert/asPDF/index.js | 17 ++++++++++++++--- collector/processSingleFile/convert/asTxt.js | 7 ++++++- collector/processSingleFile/convert/asXlsx.js | 10 ++++++++-- server/utils/collectorApi/index.js | 2 +- 11 files changed, 68 insertions(+), 18 deletions(-) diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 05ede7db36c..5406092684d 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -33,7 +33,7 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, - metadata: {} + metadata: {}, }); } diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index ee306ea84f5..b4ddc8eb76d 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = { local: LocalWhisper, }; -async function asAudio({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asAudio({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty( options?.whisperProvider ) diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 9ca4de5ce5b..1f77e772002 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -8,7 +8,12 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asDocX({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asDocX({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const loader = new DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js index 4a4fe1e67ee..283eb1e2018 100644 --- 
a/collector/processSingleFile/convert/asEPub.js +++ b/collector/processSingleFile/convert/asEPub.js @@ -8,7 +8,12 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asEPub({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asEPub({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = ""; try { const loader = new EPubLoader(fullFilePath, { splitChapters: false }); diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index f97cf73307f..77052f14a18 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -8,7 +8,12 @@ const { const OCRLoader = require("../../utils/OCRLoader"); const { default: slugify } = require("slugify"); -async function asImage({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asImage({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = await new OCRLoader({ targetLanguages: options?.ocr?.langList, }).ocrImage(fullFilePath); diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 389494f243d..8927616a019 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -9,7 +9,12 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); -async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asMbox({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { console.log(`-- Working ${filename} --`); const mails = await mboxParser(fs.createReadStream(fullFilePath)) @@ -43,12 +48,15 @@ async function asMbox({ fullFilePath = "", filename = "", options = {}, metadata const data = { id: v4(), url: "file://" + fullFilePath, - title: metadata.title || (mail?.subject - ? slugify(mail?.subject?.replace(".", "")) + ".mbox" - : `msg_${item}-${filename}`), + title: + metadata.title || + (mail?.subject + ? 
slugify(mail?.subject?.replace(".", "")) + ".mbox" + : `msg_${item}-${filename}`), docAuthor: metadata.docAuthor || mail?.from?.text, description: metadata.description || "No description found.", - docSource: metadata.docSource || "Mbox message file uploaded by the user.", + docSource: + metadata.docSource || "Mbox message file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 91062495b23..dcd084144eb 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -12,7 +12,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "", options = {}, - metadata = {} + metadata = {}, }) { console.log(`-- Working ${filename} --`); let content = ""; diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index 863f95b7bf2..bacfdaf53bf 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -9,7 +9,12 @@ const { default: slugify } = require("slugify"); const PDFLoader = require("./PDFLoader"); const OCRLoader = require("../../../utils/OCRLoader"); -async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asPdf({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); @@ -52,8 +57,14 @@ async function asPdf({ fullFilePath = "", filename = "", options = {}, metadata id: v4(), url: "file://" + fullFilePath, title: metadata.title || filename, - docAuthor: metadata.docAuthor || docs[0]?.metadata?.pdf?.info?.Creator || "no author found", - description: metadata.description || docs[0]?.metadata?.pdf?.info?.Title || "No description found.", + docAuthor: + metadata.docAuthor || + docs[0]?.metadata?.pdf?.info?.Creator || + "no author found", + description: + metadata.description || + docs[0]?.metadata?.pdf?.info?.Title || + "No description found.", docSource: metadata.docSource || "pdf file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index 73b16a7435e..d32cebce551 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -8,7 +8,12 @@ const { } = require("../../utils/files"); const { default: slugify } = require("slugify"); -async function asTxt({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asTxt({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { let content = ""; try { content = fs.readFileSync(fullFilePath, "utf8"); diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index 6efad61b50b..c01bc86688c 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -27,7 +27,12 @@ function convertToCSV(data) { .join("\n"); } -async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata = {} }) { +async function asXlsx({ + fullFilePath = "", + filename = "", + options = {}, + metadata = {}, +}) { const documents = []; const folderName = 
slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, @@ -58,7 +63,8 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, title: metadata.title || `${filename} - Sheet:${name}`, docAuthor: metadata.docAuthor || "Unknown", - description: metadata.description || `Spreadsheet data from sheet: ${name}`, + description: + metadata.description || `Spreadsheet data from sheet: ${name}`, docSource: metadata.docSource || "an xlsx file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 66bc5ac3775..22a69f436f0 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -113,7 +113,7 @@ class CollectorApi { link, scraperHeaders, options: this.#attachOptions(), - metadata: metadata + metadata: metadata, }); return await fetch(`${this.endpoint}/process-link`, { From 2c4095697621d7489a6cdc64a9a6fa7698958e3a Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Mon, 8 Sep 2025 15:26:37 -0700 Subject: [PATCH 6/7] patch handling of metadata in dev api --- server/endpoints/api/document/index.js | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index de3e42eb14c..397c314cafd 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -98,7 +98,8 @@ function apiDocumentEndpoints(app) { const { originalname } = request.file; const { addToWorkspaces = "", metadata = {} } = reqBody(request); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const processingOnline = await Collector.online(); @@ -249,7 +250,8 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const Collector = new CollectorApi(); const processingOnline = await Collector.online(); @@ -401,7 +403,8 @@ function apiDocumentEndpoints(app) { scraperHeaders = {}, metadata = {}, } = reqBody(request); - const metadataObj = safeJsonParse(metadata, {}); + const metadataObj = + typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; const processingOnline = await Collector.online(); if (!processingOnline) { From 5b9c8ad3095f1b3304739976174abd0a2fea2212 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 17 Sep 2025 10:54:45 -0700 Subject: [PATCH 7/7] Linting, small comments --- collector/processLink/convert/generic.js | 1 + collector/processLink/index.js | 1 - collector/processSingleFile/index.js | 7 +++ server/endpoints/api/document/index.js | 73 ++++++++++++------------ server/swagger/openapi.json | 4 +- server/utils/collectorApi/index.js | 3 +- 6 files changed, 49 insertions(+), 40 deletions(-) diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index af3374b1669..84589197749 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -13,6 +13,7 @@ const { default: slugify } = require("slugify"); * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. 
Default is 'text' * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request + * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document * @returns {Promise} - The content of the page */ async function scrapeGenericUrl({ diff --git a/collector/processLink/index.js b/collector/processLink/index.js index 5406092684d..bcbbfb9e659 100644 --- a/collector/processLink/index.js +++ b/collector/processLink/index.js @@ -33,7 +33,6 @@ async function getLinkText(link, captureAs = "text") { link, captureAs, processAsDocument: false, - metadata: {}, }); } diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 8ec2088141b..3bf31a70004 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -12,6 +12,13 @@ const { } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; +/** + * Process a single file and return the documents + * @param {string} targetFilename - The filename to process + * @param {Object} options - The options for the file processing + * @param {Object} metadata - The metadata for the file processing + * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing + */ async function processSingleFile(targetFilename, options = {}, metadata = {}) { const fullFilePath = path.resolve( WATCH_DIRECTORY, diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 397c314cafd..3226a740b2b 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -50,7 +50,7 @@ function apiDocumentEndpoints(app) { }, metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format.', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } }, @@ -96,11 +96,12 @@ function apiDocumentEndpoints(app) { try { const Collector = new CollectorApi(); const { originalname } = request.file; - const { addToWorkspaces = "", metadata = {} } = reqBody(request); - - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; - + const { addToWorkspaces = "", metadata: _metadata = {} } = + reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { @@ -116,14 +117,14 @@ function apiDocumentEndpoints(app) { const { success, reason, documents } = await Collector.processDocument( originalname, - metadataObj + metadata ); + if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( @@ -181,7 +182,7 @@ function apiDocumentEndpoints(app) { }, metadata: { type: 'object', - description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. ', + description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. 
Only specific keys are allowed - see example.', example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' } } } @@ -237,7 +238,12 @@ function apiDocumentEndpoints(app) { */ try { const { originalname } = request.file; - const { addToWorkspaces = "", metadata = {} } = reqBody(request); + const { addToWorkspaces = "", metadata: _metadata = {} } = + reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; let folder = request.params?.folderName || "custom-documents"; folder = normalizePath(folder); @@ -250,33 +256,28 @@ function apiDocumentEndpoints(app) { if (!fs.existsSync(targetFolderPath)) fs.mkdirSync(targetFolderPath, { recursive: true }); - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; - const Collector = new CollectorApi(); const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`, }) .end(); - return; } // Process the uploaded document with metadata const { success, reason, documents } = await Collector.processDocument( originalname, - metadataObj + metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } // For each processed document, check if it is already in the desired folder. @@ -401,34 +402,34 @@ function apiDocumentEndpoints(app) { link, addToWorkspaces = "", scraperHeaders = {}, - metadata = {}, + metadata: _metadata = {}, } = reqBody(request); - const metadataObj = - typeof metadata === "string" ? safeJsonParse(metadata, {}) : metadata; + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. Link ${link} will not be processed automatically.`, }) .end(); - return; } const { success, reason, documents } = await Collector.processLink( link, scraperHeaders, - metadataObj + metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( @@ -520,20 +521,23 @@ function apiDocumentEndpoints(app) { const requiredMetadata = ["title"]; const { textContent, - metadata = {}, + metadata: _metadata = {}, addToWorkspaces = "", } = reqBody(request); + const metadata = + typeof _metadata === "string" + ? safeJsonParse(_metadata, {}) + : _metadata; const processingOnline = await Collector.online(); if (!processingOnline) { - response + return response .status(500) .json({ success: false, error: `Document processing API is not online. 
Request will not be processed.`, }) .end(); - return; } if ( @@ -542,7 +546,7 @@ function apiDocumentEndpoints(app) { Object.keys(metadata).includes(reqKey) && !!metadata[reqKey] ) ) { - response + return response .status(422) .json({ success: false, @@ -551,18 +555,16 @@ function apiDocumentEndpoints(app) { .join(", ")}`, }) .end(); - return; } if (!textContent || textContent?.length === 0) { - response + return response .status(422) .json({ success: false, error: `The 'textContent' key cannot have an empty value.`, }) .end(); - return; } const { success, reason, documents } = await Collector.processRawText( @@ -570,11 +572,10 @@ function apiDocumentEndpoints(app) { metadata ); if (!success) { - response + return response .status(500) .json({ success: false, error: reason, documents }) .end(); - return; } Collector.log( diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 62665b55cfc..fcb87dad120 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -916,7 +916,7 @@ }, "metadata": { "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format.", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.", "example": { "title": "Custom Title", "docAuthor": "Author Name", @@ -1031,7 +1031,7 @@ }, "metadata": { "type": "object", - "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. ", + "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.", "example": { "title": "Custom Title", "docAuthor": "Author Name", diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 22a69f436f0..ef56f0c9255 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -104,6 +104,7 @@ class CollectorApi { * - Will append the options to the request body * @param {string} link - The link to process * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL + * @param {[key: string]: string} metadata - Optional metadata to attach to the document * @returns {Promise} - The response from the collector API */ async processLink(link = "", scraperHeaders = {}, metadata = {}) { @@ -142,7 +143,7 @@ class CollectorApi { * Process raw text as a document for the collector * - Will append the options to the request body * @param {string} textContent - The text to process - * @param {Object} metadata - The metadata to process + * @param {[key: string]: string} metadata - The metadata to process * @returns {Promise} - The response from the collector API */ async processRawText(textContent = "", metadata = {}) {
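
Example usage of the metadata support added by this series — a minimal client sketch, not a definitive integration. It assumes an AnythingLLM server reachable at http://localhost:3001 with the dev API mounted under /api/v1 and a valid API key in ANYTHINGLLM_API_KEY; only the endpoint paths, field names, and recognized metadata keys (title, docAuthor, description, docSource, chunkSource) come from the diffs above. Note the asymmetry the later patches handle explicitly: the multipart /document/upload endpoints receive metadata as a JSON *string* (parsed server-side via safeJsonParse), while /document/upload-link accepts it as a plain JSON object.

// upload-with-metadata.mjs — Node 18+ (built-in fetch, FormData, Blob)
const BASE = "http://localhost:3001/api/v1"; // assumed local instance; adjust for your deployment
const API_KEY = process.env.ANYTHINGLLM_API_KEY; // assumed valid dev API key

// Recognized keys per the converters: title, docAuthor, description, docSource, chunkSource.
const metadata = {
  title: "Custom Title",
  docAuthor: "Author Name",
  description: "A brief description",
  docSource: "Source of the document",
};

// 1) Multipart file upload: metadata must be stringified into a form field.
const form = new FormData();
form.append("file", new Blob(["hello world"], { type: "text/plain" }), "hello.txt");
form.append("addToWorkspaces", "workspace1,workspace2");
form.append("metadata", JSON.stringify(metadata));

const uploadRes = await fetch(`${BASE}/document/upload`, {
  method: "POST",
  headers: { Authorization: `Bearer ${API_KEY}` },
  body: form, // fetch sets the multipart boundary header itself
});
console.log("upload:", await uploadRes.json());

// 2) Link scrape: metadata rides along as a plain object in the JSON body.
const linkRes = await fetch(`${BASE}/document/upload-link`, {
  method: "POST",
  headers: {
    Authorization: `Bearer ${API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    link: "https://example.com",
    addToWorkspaces: "workspace1",
    metadata, // overrides title/docAuthor/description/docSource in the produced document
  }),
});
console.log("upload-link:", await linkRes.json());

Each converter falls back to its previous defaults for any metadata key left unset, so a partial object such as { title: "Custom Title" } is safe to send.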