这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['agent-builder'] # put your current branch to create a build. Core team only.
branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
4 changes: 3 additions & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"@xenova/transformers": "^2.11.0",
"bcrypt": "^5.1.0",
"body-parser": "^1.20.2",
"canvas": "^2.11.2",
"cors": "^2.8.5",
"dotenv": "^16.0.3",
"epub2": "^3.0.2",
Expand All @@ -40,6 +41,7 @@
"puppeteer": "~21.5.2",
"sharp": "^0.33.5",
"slugify": "^1.6.6",
"tesseract.js": "^6.0.0",
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0",
Expand All @@ -50,4 +52,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
12 changes: 10 additions & 2 deletions collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const {
const { tokenizeString } = require("../../../utils/tokenizer");
const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");
const OCRLoader = require("../../../utils/OCRLoader");

async function asPdf({ fullFilePath = "", filename = "" }) {
const pdfLoader = new PDFLoader(fullFilePath, {
Expand All @@ -15,7 +16,14 @@ async function asPdf({ fullFilePath = "", filename = "" }) {

console.log(`-- Working ${filename} --`);
const pageContent = [];
const docs = await pdfLoader.load();
let docs = await pdfLoader.load();

if (docs.length === 0) {
console.log(
`[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
);
docs = await new OCRLoader().ocrPDF(fullFilePath);
}

for (const doc of docs) {
console.log(
Expand All @@ -28,7 +36,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
}

if (!pageContent.length) {
console.error(`Resulting text content was empty for ${filename}.`);
console.error(`[asPDF] Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return {
success: false,
Expand Down
52 changes: 52 additions & 0 deletions collector/utils/OCRLoader/CanvasFactory.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
 * Canvas factory for Node.js, backed by the `canvas` package.
 * It is handed to pdfjs so PDF pages can be rendered to an image, which is
 * later used to extract text from the PDF.
 *
 * `init()` must be awaited before `create()` is called, because the `canvas`
 * module is only loaded lazily there.
 */
class NodeCanvasFactory {
  constructor() {
    // Filled in by init() with the dynamically imported `canvas` module.
    this.CanvasModule = null;
  }

  /**
   * Dynamically imports the `canvas` module and exposes its `Image` class
   * on the factory as `this.Image`.
   * @returns {Promise<void>}
   */
  async init() {
    const loadedModule = await import("canvas");
    this.CanvasModule = loadedModule;
    this.Image = loadedModule.Image;
  }

  /**
   * Builds a new canvas together with its 2d drawing context.
   * @param {number} width - The width of the canvas
   * @param {number} height - The height of the canvas
   * @param {boolean} transparent - Whether the canvas should have an alpha channel
   * @returns {{canvas: HTMLCanvasElement, context: CanvasRenderingContext2D}} - The canvas and context
   */
  create(width, height, transparent = false) {
    const surface = this.CanvasModule.createCanvas(width, height);
    const drawingContext = surface.getContext("2d", { alpha: transparent });

    // A transparent canvas is explicitly cleared before it is handed out.
    if (transparent) {
      drawingContext.clearRect(0, 0, width, height);
    }

    return { canvas: surface, context: drawingContext };
  }

  /**
   * Resizes an existing canvas.
   * Required for the pdfjs integration - do not remove or use directly.
   */
  reset(canvasAndContext, width, height) {
    const surface = canvasAndContext.canvas;
    surface.width = width;
    surface.height = height;
  }

  /**
   * Shrinks the canvas to 0x0 and drops both references from the pair.
   * Required for the pdfjs integration - do not remove or use directly.
   */
  destroy(canvasAndContext) {
    const surface = canvasAndContext.canvas;
    surface.width = 0;
    surface.height = 0;
    canvasAndContext.canvas = null;
    canvasAndContext.context = null;
  }
}

module.exports = NodeCanvasFactory;
190 changes: 190 additions & 0 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
const fs = require("fs");
const os = require("os");
const path = require("path");
const NodeCanvasFactory = require("./CanvasFactory");

/**
 * OCR-based loader for PDFs that have no extractable text layer.
 * Every page is rendered to an image through pdfjs + NodeCanvasFactory and
 * the image is then recognized by a pool of tesseract.js workers.
 */
class OCRLoader {
  constructor() {
    // Directory passed to tesseract.js as `cachePath`. Lives under
    // STORAGE_DIR when that env var is set, otherwise under the server's
    // storage folder relative to this file.
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
    );
  }

  /**
   * Writes a message to stdout with a colored `[OCRLoader]` prefix.
   * @param {string} text - Message to print
   * @param {...any} args - Extra values forwarded to console.log
   */
  log(text, ...args) {
    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
  }

  /**
   * Loads a PDF file and returns an array of documents.
   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
   *
   * Pages are processed in batches; within a batch the workers pull pages
   * from a shared queue. If the time budget is exceeded, the documents of
   * the batches completed so far are returned. Pages that fail to render
   * are logged and skipped.
   *
   * @param {string} filePath - Path of the PDF on disk
   * @param {Object} [options]
   * @param {number} [options.maxExecutionTime=300000] - Time budget for the recognition phase in ms
   * @param {number} [options.batchSize=10] - Pages per batch; non-positive or non-integer values fall back to 10
   * @param {number|null} [options.maxWorkers=null] - Upper bound for concurrent workers; defaults to min(CPU count, 4)
   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
   */
  async ocrPDF(
    filePath,
    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
  ) {
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return [];
    }

    const documentTitle = path.basename(filePath);
    this.log(`Starting OCR of ${documentTitle}`);
    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
    let buffer = fs.readFileSync(filePath);
    const canvasFactory = new NodeCanvasFactory();
    await canvasFactory.init();

    const documents = [];
    let workerPool = [];
    let timeoutHandle = null;
    // Flipped on timeout/cleanup so in-flight loops stop picking up work and
    // never touch `documents` after it has been returned to the caller.
    let aborted = false;
    let totalPages = 0;
    let startTime = Date.now();

    // pdfjs looks up `Image` on the global object while rendering.
    global.Image = canvasFactory.Image;
    try {
      const pdfDocument = await pdfjs.getDocument({
        data: new Uint8Array(buffer),
        canvasFactory,
      }).promise;
      buffer = null;
      totalPages = pdfDocument.numPages;

      const meta = await pdfDocument.getMetadata().catch(() => null);
      const metadata = {
        source: filePath,
        pdf: {
          version: "v2.0.550",
          info: meta?.info,
          metadata: meta?.metadata,
          totalPages: pdfDocument.numPages,
        },
      };

      // Arrow function on purpose: the catch branch needs the loader's
      // `this` for logging, which a plain nested function does not have.
      const getPageAsBuffer = async (pageNumber, scale = 1) => {
        let canvasAndContext = null;
        try {
          const page = await pdfDocument.getPage(pageNumber);
          const viewport = page.getViewport(scale);
          canvasAndContext = canvasFactory.create(
            viewport.width,
            viewport.height
          );
          await page.render({
            canvasFactory,
            canvasContext: canvasAndContext.context,
            viewport,
          }).promise;
          return canvasAndContext.canvas.toBuffer();
        } catch (e) {
          this.log(`Error getting page as buffer: ${e.message}`);
          return null;
        } finally {
          // The encoded buffer is already computed at this point, so the
          // (large, upscaled) canvas can be released right away.
          if (canvasAndContext) canvasFactory.destroy(canvasAndContext);
        }
      };

      const { createWorker, OEM } = require("tesseract.js");
      // A non-positive batch size would never advance the page loop.
      const BATCH_SIZE =
        Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 10;
      const MAX_EXECUTION_TIME = maxExecutionTime;
      const requestedWorkers =
        Number.isInteger(maxWorkers) && maxWorkers > 0
          ? maxWorkers
          : Math.min(os.cpus().length, 4);
      // Always at least one worker, never more than can be kept busy.
      const NUM_WORKERS = Math.max(
        1,
        Math.min(requestedWorkers, BATCH_SIZE, totalPages)
      );

      // allSettled so workers that did start are still tracked (and later
      // terminated) when another one fails to start.
      const startedWorkers = await Promise.allSettled(
        Array.from({ length: NUM_WORKERS }, () =>
          createWorker("eng", OEM.LSTM_ONLY, {
            cachePath: this.cacheDir,
          })
        )
      );
      workerPool = startedWorkers
        .filter((outcome) => outcome.status === "fulfilled")
        .map((outcome) => outcome.value);
      const failedStart = startedWorkers.find(
        (outcome) => outcome.status === "rejected"
      );
      if (failedStart) throw failedStart.reason;

      startTime = Date.now();
      try {
        this.log("Bootstrapping OCR completed successfully!", {
          MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
          BATCH_SIZE,
          MAX_CONCURRENT_WORKERS: NUM_WORKERS,
          TOTAL_PAGES: totalPages,
        });
        const timeoutPromise = new Promise((_, reject) => {
          timeoutHandle = setTimeout(() => {
            aborted = true;
            reject(
              new Error(
                `OCR job took too long to complete (${
                  MAX_EXECUTION_TIME / 1000
                } seconds)`
              )
            );
          }, MAX_EXECUTION_TIME);
        });

        const processPages = async () => {
          for (
            let startPage = 1;
            startPage <= totalPages && !aborted;
            startPage += BATCH_SIZE
          ) {
            const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
            const pageNumbers = Array.from(
              { length: endPage - startPage + 1 },
              (_, i) => startPage + i
            );
            this.log(`Working on pages ${startPage} - ${endPage}`);

            const pageQueue = [...pageNumbers];
            const results = [];
            const workerPromises = workerPool.map(
              async (worker, workerIndex) => {
                while (!aborted && pageQueue.length > 0) {
                  const pageNum = pageQueue.shift();
                  this.log(
                    `\x1b[34m[Worker ${
                      workerIndex + 1
                    }]\x1b[0m assigned pg${pageNum}`
                  );
                  const imageBuffer = await getPageAsBuffer(pageNum, 5);
                  if (aborted) break;
                  if (!imageBuffer) {
                    // Rendering failed (already logged) - nothing to OCR.
                    this.log(
                      `\x1b[34m[Worker ${
                        workerIndex + 1
                      }]\x1b[0m skipped pg${pageNum} - page could not be rendered`
                    );
                    continue;
                  }
                  const { data } = await worker.recognize(
                    imageBuffer,
                    {},
                    "text"
                  );
                  this.log(
                    `βœ… \x1b[34m[Worker ${
                      workerIndex + 1
                    }]\x1b[0m completed pg${pageNum}`
                  );
                  results.push({
                    pageContent: data.text,
                    metadata: {
                      ...metadata,
                      loc: { pageNumber: pageNum },
                    },
                  });
                }
              }
            );

            await Promise.all(workerPromises);
            if (aborted) break;
            // Workers finish out of order; restore page order per batch.
            documents.push(
              ...results.sort(
                (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
              )
            );
          }
          return documents;
        };

        await Promise.race([timeoutPromise, processPages()]);
      } catch (e) {
        this.log(`Error: ${e.message}`);
      }
    } finally {
      aborted = true;
      // Without this the pending timer keeps the process alive for the
      // remainder of the time budget.
      clearTimeout(timeoutHandle);
      global.Image = undefined;
      await Promise.all(workerPool.map((worker) => worker.terminate()));
    }

    this.log(`Completed OCR of ${documentTitle}!`, {
      documentsParsed: documents.length,
      totalPages: totalPages,
      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
    });
    return documents;
  }
}

module.exports = OCRLoader;
Loading