这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
branches: ['sharp-pdf-image-converter'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
3 changes: 1 addition & 2 deletions collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"@xenova/transformers": "^2.11.0",
"bcrypt": "^5.1.0",
"body-parser": "^1.20.2",
"canvas": "^2.11.2",
"cors": "^2.8.5",
"dotenv": "^16.0.3",
"epub2": "^3.0.2",
Expand Down Expand Up @@ -52,4 +51,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
52 changes: 0 additions & 52 deletions collector/utils/OCRLoader/CanvasFactory.js

This file was deleted.

126 changes: 91 additions & 35 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const fs = require("fs");
const os = require("os");
const path = require("path");
const NodeCanvasFactory = require("./CanvasFactory");

class OCRLoader {
constructor() {
Expand Down Expand Up @@ -38,15 +37,8 @@ class OCRLoader {
this.log(`Starting OCR of ${documentTitle}`);
const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
let buffer = fs.readFileSync(filePath);
const canvasFactory = new NodeCanvasFactory();
await canvasFactory.init();
global.Image = canvasFactory.Image;

const pdfDocument = await pdfjs.getDocument({
data: new Uint8Array(buffer),
canvasFactory,
}).promise;
buffer = null;
const pdfDocument = await pdfjs.getDocument({ data: buffer });

const documents = [];
const meta = await pdfDocument.getMetadata().catch(() => null);
Expand All @@ -60,30 +52,14 @@ class OCRLoader {
},
};

async function getPageAsBuffer(pageNumber, scale = 1) {
let canvas = null;
let context = null;
try {
const page = await pdfDocument.getPage(pageNumber);
const viewport = page.getViewport(scale);
({ canvas, context } = canvasFactory.create(
viewport.width,
viewport.height
));
await page.render({
canvasFactory,
canvasContext: context,
viewport,
}).promise;
return canvas.toBuffer();
} catch (e) {
this.log(`Error getting page as buffer: ${e.message}`);
return null;
} finally {
canvas = null;
context = null;
}
}
const pdfSharp = new PDFSharp({
validOps: [
pdfjs.OPS.paintJpegXObject,
pdfjs.OPS.paintImageXObject,
pdfjs.OPS.paintInlineImageXObject,
],
});
await pdfSharp.init();

const { createWorker, OEM } = require("tesseract.js");
const BATCH_SIZE = batchSize;
Expand Down Expand Up @@ -143,7 +119,9 @@ class OCRLoader {
workerIndex + 1
}]\x1b[0m assigned pg${pageNum}`
);
const imageBuffer = await getPageAsBuffer(pageNum, 5);
const page = await pdfDocument.getPage(pageNum);
const imageBuffer = await pdfSharp.pageToBuffer({ page });
if (!imageBuffer) continue;
const { data } = await worker.recognize(imageBuffer, {}, "text");
this.log(
`βœ… \x1b[34m[Worker ${
Expand Down Expand Up @@ -172,7 +150,7 @@ class OCRLoader {

await Promise.race([timeoutPromise, processPages()]);
} catch (e) {
this.log(`Error: ${e.message}`);
this.log(`Error: ${e.message}`, e.stack);
} finally {
global.Image = undefined;
await Promise.all(workerPool.map((worker) => worker.terminate()));
Expand Down Expand Up @@ -248,4 +226,82 @@ class OCRLoader {
}
}

/**
 * Pulls embedded raster images out of a PDF.js page and re-encodes them as PNG
 * buffers with Sharp.
 *
 * The page is never rendered: its operator list is scanned for the configured
 * image paint operators and the first image that converts successfully wins.
 */
class PDFSharp {
  /**
   * @param {Object} [options]
   * @param {number[]} [options.validOps=[]] - PDF.js operator codes (`pdfjs.OPS.*`)
   *   that should be treated as image paint operations.
   * @param {number} [options.targetDPI=70] - Density written into the output image;
   *   the image is also scaled by `targetDPI / 72`.
   */
  constructor({ validOps = [], targetDPI = 70 } = {}) {
    this.sharp = null;
    this.validOps = validOps;
    this.targetDPI = targetDPI;
  }

  /**
   * Writes a message to stdout with a `[PDFSharp]` prefix.
   * @param {string} text - The message.
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args);
  }

  /**
   * Lazily loads the `sharp` module and stores its default export on the instance.
   * @returns {Promise<void>}
   */
  async init() {
    this.sharp = (await import("sharp")).default;
  }

  /**
   * Converts the first usable embedded image on a PDF page to a PNG buffer.
   * @param {Object} options - The options for the Sharp PDF page object.
   * @param {Object} options.page - The PDFJS page proxy object.
   * @returns {Promise<Buffer|null>} The PNG buffer, or `null` when the page has
   *   no convertible image or an error occurred (errors are logged, not thrown).
   */
  async pageToBuffer({ page }) {
    if (!this.sharp) await this.init();
    try {
      this.log(`Converting page ${page.pageNumber} to image...`);
      const ops = await page.getOperatorList();
      const opCount = ops.fnArray.length;
      const scale = this.targetDPI / 72;

      for (let i = 0; i < opCount; i++) {
        try {
          if (!this.validOps.includes(ops.fnArray[i])) continue;

          // Named image operators carry an object id that must be looked up in
          // the page's object store; inline image operators carry the image
          // data object itself as their first argument.
          const ref = ops.argsArray[i][0];
          const img = typeof ref === "string" ? await page.objs.get(ref) : ref;
          if (!img || !img.data) {
            this.log(`Skipping image without pixel data on page ${page.pageNumber}`);
            continue;
          }

          const { width, height } = img;
          const channels = img.data.length / width / height;
          // Sharp's raw input needs an integral channel count between 1 and 4.
          // Packed formats (e.g. 1 bit per pixel) or zero-sized images do not
          // satisfy that and cannot be fed in as-is.
          if (!Number.isInteger(channels) || channels < 1 || channels > 4) {
            this.log(
              `Skipping image with unsupported pixel layout on page ${page.pageNumber}`
            );
            continue;
          }

          // Clamp to 1px so very small images never produce a zero-sized resize.
          const targetWidth = Math.max(1, Math.floor(width * scale));
          const targetHeight = Math.max(1, Math.floor(height * scale));

          const image = this.sharp(img.data, {
            raw: { width, height, channels },
            density: this.targetDPI,
          })
            .resize({
              width: targetWidth,
              height: targetHeight,
              fit: "fill",
            })
            .withMetadata({
              density: this.targetDPI,
              resolution: this.targetDPI,
            })
            .png();

          // For debugging purposes
          // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
          return await image.toBuffer();
        } catch (error) {
          // A single bad image must not abort the page; try the next operator.
          this.log(`Iteration error: ${error.message}`, error.stack);
          continue;
        }
      }
      this.log(`No valid images found on page ${page.pageNumber}`);
      return null;
    } catch (error) {
      this.log(`Error: ${error.message}`, error.stack);
      return null;
    }
  }
}

module.exports = OCRLoader;
37 changes: 1 addition & 36 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@
"@langchain/core" "~0.1"
js-tiktoken "^1.0.11"

"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
"@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
Expand Down Expand Up @@ -793,15 +793,6 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==

canvas@^2.11.2:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"

chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
Expand Down Expand Up @@ -1057,13 +1048,6 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==

decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"

decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
Expand Down Expand Up @@ -2307,11 +2291,6 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==

mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==

mimic-response@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
Expand Down Expand Up @@ -2425,11 +2404,6 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==

nan@^2.17.0:
version "2.22.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3"
integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==

napi-build-utils@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
Expand Down Expand Up @@ -3255,15 +3229,6 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==

simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"

simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
Expand Down
Loading