这是indexloc提供的服务，不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
48 changes: 48 additions & 0 deletions collector/processSingleFile/convert/asImage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
const { v4 } = require("uuid");
const { tokenizeString } = require("../../utils/tokenizer");
const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");

/**
 * Converts an uploaded image into a server document by running OCR on it.
 *
 * The source file is removed with `trashFile` whether or not any text was
 * recovered.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path of the image file on disk.
 * @param {string} params.filename - Name of the file, used for the title and log messages.
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>}
 *   `success: false` with an empty `documents` array when OCR yields no text,
 *   otherwise `success: true` with the single written document.
 */
async function asImage({ fullFilePath = "", filename = "" }) {
  const content = await new OCRLoader().ocrImage(fullFilePath);

  // ocrImage resolves to null on failure, and OCR of a blank image can yield
  // only whitespace/newlines - neither is usable document content.
  const hasText = typeof content === "string" && content.trim().length > 0;
  if (!hasText) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  console.log(`-- Working ${filename} --`);
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    docSource: "an image file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    // OCR output is mostly newline-separated, so count words across any
    // whitespace rather than single spaces only.
    wordCount: content.trim().split(/\s+/).length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  const document = writeToServerDocuments(
    data,
    `${slugify(filename)}-${data.id}`
  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}

module.exports = asImage;
61 changes: 61 additions & 0 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,67 @@ class OCRLoader {
});
return documents;
}

/**
* Loads an image file and returns the OCRed text.
* @param {string} filePath - The path to the image file.
* @param {Object} options - The options for the OCR.
* @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
* @returns {Promise<string>} The OCRed text.
*/
async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
let content = "";
let worker = null;
if (
!filePath ||
!fs.existsSync(filePath) ||
!fs.statSync(filePath).isFile()
) {
this.log(`File ${filePath} does not exist. Skipping OCR.`);
return null;
}

const documentTitle = path.basename(filePath);
try {
this.log(`Starting OCR of ${documentTitle}`);
const startTime = Date.now();
const { createWorker, OEM } = require("tesseract.js");
worker = await createWorker("eng", OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
});

// Race the timeout with the OCR
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => {
reject(
new Error(
`OCR job took too long to complete (${
maxExecutionTime / 1000
} seconds)`
)
);
}, maxExecutionTime);
});

const processImage = async () => {
const { data } = await worker.recognize(filePath, {}, "text");
content = data.text;
};

await Promise.race([timeoutPromise, processImage()]);
this.log(`Completed OCR of ${documentTitle}!`, {
executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
});

return content;
} catch (e) {
this.log(`Error: ${e.message}`);
return null;
} finally {
if (!worker) return;
await worker.terminate();
}
}
}

module.exports = OCRLoader;
7 changes: 7 additions & 0 deletions collector/utils/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
"video/mp4": [".mp4"],
"video/mpeg": [".mpeg"],
"application/epub+zip": [".epub"],
"image/png": [".png"],
"image/jpeg": [".jpg"],
"image/jpg": [".jpg"],
};

const SUPPORTED_FILETYPE_CONVERTERS = {
Expand Down Expand Up @@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".wav": "./convert/asAudio.js",
".mp4": "./convert/asAudio.js",
".mpeg": "./convert/asAudio.js",

".png": "./convert/asImage.js",
".jpg": "./convert/asImage.js",
".jpeg": "./convert/asImage.js",
};

module.exports = {
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/files/mime.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
const MimeLib = require("mime");
class MimeDetector {
nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
nonTextTypes = ["multipart", "model", "audio", "video", "font"];
badMimes = [
"application/octet-stream",
"application/zip",
Expand Down