Mintplex-Labs · timothycarambat · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['agent-builder'] # put your current branch to create a build. Core team only.
+    branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'

diff --git a/collector/package.json b/collector/package.json
@@ -19,6 +19,7 @@
     "@xenova/transformers": "^2.11.0",
     "bcrypt": "^5.1.0",
     "body-parser": "^1.20.2",
+    "canvas": "^2.11.2",
     "cors": "^2.8.5",
     "dotenv": "^16.0.3",
     "epub2": "^3.0.2",
@@ -40,6 +41,7 @@
     "puppeteer": "~21.5.2",
     "sharp": "^0.33.5",
     "slugify": "^1.6.6",
+    "tesseract.js": "^6.0.0",
     "url-pattern": "^1.0.3",
     "uuid": "^9.0.0",
     "wavefile": "^11.0.0",
@@ -50,4 +52,4 @@
     "nodemon": "^2.0.22",
     "prettier": "^2.4.1"
   }
-}
+}
diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js
@@ -7,6 +7,7 @@ const {
 const { tokenizeString } = require("../../../utils/tokenizer");
 const { default: slugify } = require("slugify");
 const PDFLoader = require("./PDFLoader");
+const OCRLoader = require("../../../utils/OCRLoader");
 
 async function asPdf({ fullFilePath = "", filename = "" }) {
   const pdfLoader = new PDFLoader(fullFilePath, {
@@ -15,7 +16,14 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
 
   console.log(`-- Working ${filename} --`);
   const pageContent = [];
-  const docs = await pdfLoader.load();
+  let docs = await pdfLoader.load();
+
+  if (docs.length === 0) {
+    console.log(
+      `[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
+    );
+    docs = await new OCRLoader().ocrPDF(fullFilePath);
+  }
 
   for (const doc of docs) {
     console.log(
@@ -28,7 +36,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
   }
 
   if (!pageContent.length) {
-    console.error(`Resulting text content was empty for ${filename}.`);
+    console.error(`[asPDF] Resulting text content was empty for ${filename}.`);
     trashFile(fullFilePath);
     return {
       success: false,

diff --git a/collector/utils/OCRLoader/CanvasFactory.js b/collector/utils/OCRLoader/CanvasFactory.js
@@ -0,0 +1,67 @@
+/**
+ * This is a factory for creating a canvas and context in Node.js
+ * it is used to create a canvas and context for the PDFLoader for turning the PDF into an image
+ * so we can later use the image to extract text from the PDF.
+ */
+class NodeCanvasFactory {
+  constructor() {
+    this.CanvasModule = null;
+  }
+
+  /**
+   * Loads the `canvas` package. Must be awaited once before `create()` is used.
+   * @returns {Promise<void>}
+   */
+  async init() {
+    const imported = await import("canvas");
+    // `canvas` is a CommonJS package: `module.exports` is always exposed as `default`,
+    // whereas named exports are only detected on a best-effort basis by Node.
+    this.CanvasModule = imported.default ?? imported;
+    this.Image = this.CanvasModule.Image;
+  }
+
+  /**
+   * Creates a canvas and context for the PDFLoader
+   * @param {number} width - The width of the canvas
+   * @param {number} height - The height of the canvas
+   * @param {boolean} transparent - Whether the canvas is transparent
+   * @returns {{canvas: HTMLCanvasElement, context: CanvasRenderingContext2D}} - The canvas and context
+   * @throws {Error} If `init()` has not completed yet.
+   */
+  create(width, height, transparent = false) {
+    if (!this.CanvasModule) {
+      throw new Error(
+        "NodeCanvasFactory.init() must be awaited before calling create()."
+      );
+    }
+    const canvas = this.CanvasModule.createCanvas(width, height);
+    const context = canvas.getContext("2d", { alpha: transparent });
+    if (transparent) context.clearRect(0, 0, width, height);
+    return {
+      canvas,
+      context,
+    };
+  }
+
+  /**
+   * Required for the PDFLoader pdfjs integration - do not remove or use directly.
+   * Resizes the existing canvas to the given dimensions.
+   */
+  reset(canvasAndContext, width, height) {
+    canvasAndContext.canvas.width = width;
+    canvasAndContext.canvas.height = height;
+  }
+
+  /**
+   * Required for the PDFLoader pdfjs integration - do not remove or use directly.
+   * Zeroes the canvas dimensions and drops both references from the given pair.
+   */
+  destroy(canvasAndContext) {
+    canvasAndContext.canvas.width = 0;
+    canvasAndContext.canvas.height = 0;
+    canvasAndContext.canvas = null;
+    canvasAndContext.context = null;
+  }
+}
+
+module.exports = NodeCanvasFactory;
diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js
@@ -0,0 +1,256 @@
+const fs = require("fs");
+const os = require("os");
+const path = require("path");
+const NodeCanvasFactory = require("./CanvasFactory");
+
+/**
+ * Extracts text from scanned PDFs by rendering each page to an image with pdfjs
+ * and running the image through a pool of tesseract.js workers.
+ */
+class OCRLoader {
+  constructor() {
+    // Directory handed to tesseract.js as its `cachePath`.
+    this.cacheDir = path.resolve(
+      process.env.STORAGE_DIR
+        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
+        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
+    );
+  }
+
+  /**
+   * Writes a message to stdout prefixed with a colored `[OCRLoader]` tag.
+   * @param {string} text
+   * @param {...any} args - Extra values forwarded to `console.log`.
+   */
+  log(text, ...args) {
+    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Loads a PDF file and returns an array of documents.
+   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
+   * Errors during page processing (including the timeout) are logged and whatever
+   * batches finished before the failure are returned.
+   * @param {string} filePath - Path of the PDF on disk.
+   * @param {object} [options]
+   * @param {number} [options.maxExecutionTime=300000] - Time budget in ms for processing all pages.
+   * @param {number} [options.batchSize=10] - Pages handed to the worker pool per batch.
+   * @param {number|null} [options.maxWorkers=null] - Worker count; defaults to min(CPU count, 4).
+   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
+   */
+  async ocrPDF(
+    filePath,
+    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
+  ) {
+    if (
+      !filePath ||
+      !fs.existsSync(filePath) ||
+      !fs.statSync(filePath).isFile()
+    ) {
+      this.log(`File ${filePath} does not exist. Skipping OCR.`);
+      return [];
+    }
+
+    const documentTitle = path.basename(filePath);
+    this.log(`Starting OCR of ${documentTitle}`);
+    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
+    let buffer = fs.readFileSync(filePath);
+    const canvasFactory = new NodeCanvasFactory();
+    await canvasFactory.init();
+
+    const toPositiveInt = (value, fallback) =>
+      Number.isInteger(value) && value > 0 ? value : fallback;
+    const documents = [];
+    let workerPool = [];
+    let timeoutHandle = null;
+    let aborted = false;
+    let totalPages = 0;
+    let startTime = Date.now();
+
+    // Exposed globally for the duration of the job (presumably needed by pdfjs
+    // while rendering - TODO confirm); removed again in the cleanup below.
+    global.Image = canvasFactory.Image;
+    try {
+      const pdfDocument = await pdfjs.getDocument({
+        data: new Uint8Array(buffer),
+        canvasFactory,
+      }).promise;
+      buffer = null;
+      totalPages = pdfDocument.numPages;
+
+      const meta = await pdfDocument.getMetadata().catch(() => null);
+      const metadata = {
+        source: filePath,
+        pdf: {
+          version: "v2.0.550",
+          info: meta?.info,
+          metadata: meta?.metadata,
+          totalPages,
+        },
+      };
+
+      // Arrow function so `this.log` resolves to the loader inside the catch.
+      const getPageAsBuffer = async (pageNumber, scale = 1) => {
+        try {
+          const page = await pdfDocument.getPage(pageNumber);
+          const viewport = page.getViewport(scale);
+          const { canvas, context } = canvasFactory.create(
+            viewport.width,
+            viewport.height
+          );
+          await page.render({
+            canvasFactory,
+            canvasContext: context,
+            viewport,
+          }).promise;
+          return canvas.toBuffer();
+        } catch (e) {
+          this.log(`Error getting page as buffer: ${e.message}`);
+          return null;
+        }
+      };
+
+      const { createWorker, OEM } = require("tesseract.js");
+      const BATCH_SIZE = toPositiveInt(batchSize, 10);
+      const MAX_EXECUTION_TIME = maxExecutionTime;
+      // Never spawn zero workers (os.cpus() may be empty) nor more workers
+      // than there are pages to hand out in a single batch.
+      const NUM_WORKERS = Math.max(
+        1,
+        Math.min(
+          toPositiveInt(maxWorkers, Math.min(os.cpus().length, 4)),
+          BATCH_SIZE,
+          totalPages
+        )
+      );
+
+      // Settle every start-up so already-created workers are still terminated
+      // in the cleanup below when one of them fails to boot.
+      const startups = await Promise.allSettled(
+        Array.from({ length: NUM_WORKERS }, () =>
+          createWorker("eng", OEM.LSTM_ONLY, {
+            cachePath: this.cacheDir,
+          })
+        )
+      );
+      workerPool = startups
+        .filter((startup) => startup.status === "fulfilled")
+        .map((startup) => startup.value);
+      const failedStartup = startups.find(
+        (startup) => startup.status === "rejected"
+      );
+      if (failedStartup) throw failedStartup.reason;
+
+      startTime = Date.now();
+      try {
+        this.log("Bootstrapping OCR completed successfully!", {
+          MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
+          BATCH_SIZE,
+          MAX_CONCURRENT_WORKERS: NUM_WORKERS,
+          TOTAL_PAGES: totalPages,
+        });
+        const timeoutPromise = new Promise((_, reject) => {
+          timeoutHandle = setTimeout(() => {
+            aborted = true;
+            reject(
+              new Error(
+                `OCR job took too long to complete (${
+                  MAX_EXECUTION_TIME / 1000
+                } seconds)`
+              )
+            );
+          }, MAX_EXECUTION_TIME);
+        });
+
+        const processPages = async () => {
+          for (
+            let startPage = 1;
+            startPage <= totalPages && !aborted;
+            startPage += BATCH_SIZE
+          ) {
+            const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
+            const pageNumbers = Array.from(
+              { length: endPage - startPage + 1 },
+              (_, i) => startPage + i
+            );
+            this.log(`Working on pages ${startPage} - ${endPage}`);
+
+            const pageQueue = [...pageNumbers];
+            const results = [];
+            const workerPromises = workerPool.map(
+              async (worker, workerIndex) => {
+                while (pageQueue.length > 0 && !aborted) {
+                  const pageNum = pageQueue.shift();
+                  this.log(
+                    `\x1b[34m[Worker ${
+                      workerIndex + 1
+                    }]\x1b[0m assigned pg${pageNum}`
+                  );
+                  const imageBuffer = await getPageAsBuffer(pageNum, 5);
+                  if (aborted) break;
+                  if (!imageBuffer) {
+                    // Rendering failed (already logged); skip this page
+                    // instead of failing the whole job on a null image.
+                    this.log(
+                      `\x1b[34m[Worker ${
+                        workerIndex + 1
+                      }]\x1b[0m skipped pg${pageNum} - page could not be rendered`
+                    );
+                    continue;
+                  }
+                  const { data } = await worker.recognize(
+                    imageBuffer,
+                    {},
+                    "text"
+                  );
+                  this.log(
+                    `✅ \x1b[34m[Worker ${
+                      workerIndex + 1
+                    }]\x1b[0m completed pg${pageNum}`
+                  );
+                  results.push({
+                    pageContent: data.text,
+                    metadata: {
+                      ...metadata,
+                      loc: { pageNumber: pageNum },
+                    },
+                  });
+                }
+              }
+            );
+
+            await Promise.all(workerPromises);
+            // Do not touch `documents` once the job was aborted; the caller
+            // may already hold the returned array.
+            if (aborted) break;
+            documents.push(
+              ...results.sort(
+                (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
+              )
+            );
+          }
+          return documents;
+        };
+
+        await Promise.race([timeoutPromise, processPages()]);
+      } catch (e) {
+        this.log(`Error: ${e.message}`);
+      }
+    } finally {
+      // Stop background page loops, release the timer so it cannot keep the
+      // process waiting, and tear down global state and workers.
+      aborted = true;
+      clearTimeout(timeoutHandle);
+      global.Image = undefined;
+      await Promise.allSettled(workerPool.map((worker) => worker.terminate()));
+    }
+
+    this.log(`Completed OCR of ${documentTitle}!`, {
+      documentsParsed: documents.length,
+      totalPages: totalPages,
+      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
+    });
+    return documents;
+  }
+}
+
+module.exports = OCRLoader;