这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions collector/processSingleFile/convert/asImage.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ const {
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");

async function asImage({ fullFilePath = "", filename = "" }) {
let content = await new OCRLoader().ocrImage(fullFilePath);
async function asImage({ fullFilePath = "", filename = "", options = {} }) {
let content = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrImage(fullFilePath);

if (!content?.length) {
console.error(`Resulting text content was empty for ${filename}.`);
Expand Down
6 changes: 4 additions & 2 deletions collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");
const OCRLoader = require("../../../utils/OCRLoader");

async function asPdf({ fullFilePath = "", filename = "" }) {
async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});
Expand All @@ -22,7 +22,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
console.log(
`[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
);
docs = await new OCRLoader().ocrPDF(fullFilePath);
docs = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrPDF(fullFilePath);
}

for (const doc of docs) {
Expand Down
53 changes: 50 additions & 3 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,61 @@
const fs = require("fs");
const os = require("os");
const path = require("path");
const { VALID_LANGUAGE_CODES } = require("./validLangs");

class OCRLoader {
constructor() {
/**
* The language code(s) to use for the OCR.
* @type {string[]}
*/
language;
/**
* The cache directory for the OCR.
* @type {string}
*/
cacheDir;

/**
* The constructor for the OCRLoader.
* @param {Object} options - The options for the OCRLoader.
* @param {string} options.targetLanguages - The target languages to use for the OCR as a comma separated string. eg: "eng,deu,..."
*/
constructor({ targetLanguages = "eng" } = {}) {
this.language = this.parseLanguages(targetLanguages);
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
: path.resolve(__dirname, `../../../server/storage/models/tesseract`)
);

// Ensure the cache directory exists or else Tesseract will persist the cache in the default location.
if (!fs.existsSync(this.cacheDir))
fs.mkdirSync(this.cacheDir, { recursive: true });
this.log(
`OCRLoader initialized with language support for:`,
this.language.map((lang) => VALID_LANGUAGE_CODES[lang]).join(", ")
);
}

/**
* Parses the language code from a provided comma separated string of language codes.
* @param {string} language - The language code to parse.
* @returns {string[]} The parsed language code.
*/
parseLanguages(language = null) {
try {
if (!language || typeof language !== "string") return ["eng"];
const langList = language
.split(",")
.map((lang) => (lang.trim() !== "" ? lang.trim() : null))
.filter(Boolean)
.filter((lang) => VALID_LANGUAGE_CODES.hasOwnProperty(lang));
if (langList.length === 0) return ["eng"];
return langList;
} catch (e) {
this.log(`Error parsing languages: ${e.message}`, e.stack);
return ["eng"];
}
}

log(text, ...args) {
Expand Down Expand Up @@ -70,7 +117,7 @@ class OCRLoader {
Array(NUM_WORKERS)
.fill(0)
.map(() =>
createWorker("eng", OEM.LSTM_ONLY, {
createWorker(this.language, OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
})
)
Expand Down Expand Up @@ -188,7 +235,7 @@ class OCRLoader {
this.log(`Starting OCR of ${documentTitle}`);
const startTime = Date.now();
const { createWorker, OEM } = require("tesseract.js");
worker = await createWorker("eng", OEM.LSTM_ONLY, {
worker = await createWorker(this.language, OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
});

Expand Down
155 changes: 155 additions & 0 deletions collector/utils/OCRLoader/validLangs.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*

To get the list of valid language codes - do the following:
Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html

Via the console, check that this element is the table's tbody containing all the codes:
document.getElementsByTagName('table').item(0).children.item(1)

Now, copy the following code and paste it into the console:
function parseLangs() {
let langs = {};
Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
const [codeEl, languageEl, ...rest] = el.children
const code = codeEl.innerText.trim()
const language = languageEl.innerText.trim()
if (!!code && !!language) langs[code] = language
})
return langs;
}

Now, run the function:
copy(parseLangs())
*/

/**
 * Lookup table of OCR language codes (keys) to human-readable language names (values).
 * Frozen so this shared table cannot be modified at runtime.
 * @type {Readonly<Record<string, string>>}
 */
const VALID_LANGUAGE_CODES = Object.freeze({
  afr: "Afrikaans",
  amh: "Amharic",
  ara: "Arabic",
  asm: "Assamese",
  aze: "Azerbaijani",
  aze_cyrl: "Azerbaijani - Cyrillic",
  bel: "Belarusian",
  ben: "Bengali",
  bod: "Tibetan",
  bos: "Bosnian",
  bre: "Breton",
  bul: "Bulgarian",
  cat: "Catalan; Valencian",
  ceb: "Cebuano",
  ces: "Czech",
  chi_sim: "Chinese - Simplified",
  chi_tra: "Chinese - Traditional",
  chr: "Cherokee",
  cos: "Corsican",
  cym: "Welsh",
  dan: "Danish",
  dan_frak: "Danish - Fraktur (contrib)",
  deu: "German",
  deu_frak: "German - Fraktur (contrib)",
  deu_latf: "German (Fraktur Latin)",
  dzo: "Dzongkha",
  ell: "Greek, Modern (1453-)",
  eng: "English",
  enm: "English, Middle (1100-1500)",
  epo: "Esperanto",
  equ: "Math / equation detection module",
  est: "Estonian",
  eus: "Basque",
  fao: "Faroese",
  fas: "Persian",
  fil: "Filipino (old - Tagalog)",
  fin: "Finnish",
  fra: "French",
  frk: "German - Fraktur (now deu_latf)",
  frm: "French, Middle (ca.1400-1600)",
  fry: "Western Frisian",
  gla: "Scottish Gaelic",
  gle: "Irish",
  glg: "Galician",
  grc: "Greek, Ancient (to 1453) (contrib)",
  guj: "Gujarati",
  hat: "Haitian; Haitian Creole",
  heb: "Hebrew",
  hin: "Hindi",
  hrv: "Croatian",
  hun: "Hungarian",
  hye: "Armenian",
  iku: "Inuktitut",
  ind: "Indonesian",
  isl: "Icelandic",
  ita: "Italian",
  ita_old: "Italian - Old",
  jav: "Javanese",
  jpn: "Japanese",
  kan: "Kannada",
  kat: "Georgian",
  kat_old: "Georgian - Old",
  kaz: "Kazakh",
  khm: "Central Khmer",
  kir: "Kirghiz; Kyrgyz",
  kmr: "Kurmanji (Kurdish - Latin Script)",
  kor: "Korean",
  kor_vert: "Korean (vertical)",
  kur: "Kurdish (Arabic Script)",
  lao: "Lao",
  lat: "Latin",
  lav: "Latvian",
  lit: "Lithuanian",
  ltz: "Luxembourgish",
  mal: "Malayalam",
  mar: "Marathi",
  mkd: "Macedonian",
  mlt: "Maltese",
  mon: "Mongolian",
  mri: "Maori",
  msa: "Malay",
  mya: "Burmese",
  nep: "Nepali",
  nld: "Dutch; Flemish",
  nor: "Norwegian",
  oci: "Occitan (post 1500)",
  ori: "Oriya",
  osd: "Orientation and script detection module",
  pan: "Panjabi; Punjabi",
  pol: "Polish",
  por: "Portuguese",
  pus: "Pushto; Pashto",
  que: "Quechua",
  ron: "Romanian; Moldavian; Moldovan",
  rus: "Russian",
  san: "Sanskrit",
  sin: "Sinhala; Sinhalese",
  slk: "Slovak",
  slk_frak: "Slovak - Fraktur (contrib)",
  slv: "Slovenian",
  snd: "Sindhi",
  spa: "Spanish; Castilian",
  spa_old: "Spanish; Castilian - Old",
  sqi: "Albanian",
  srp: "Serbian",
  srp_latn: "Serbian - Latin",
  sun: "Sundanese",
  swa: "Swahili",
  swe: "Swedish",
  syr: "Syriac",
  tam: "Tamil",
  tat: "Tatar",
  tel: "Telugu",
  tgk: "Tajik",
  tgl: "Tagalog (new - Filipino)",
  tha: "Thai",
  tir: "Tigrinya",
  ton: "Tonga",
  tur: "Turkish",
  uig: "Uighur; Uyghur",
  ukr: "Ukrainian",
  urd: "Urdu",
  uzb: "Uzbek",
  uzb_cyrl: "Uzbek - Cyrillic",
  vie: "Vietnamese",
  yid: "Yiddish",
  yor: "Yoruba",
});

module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;
5 changes: 5 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,8 @@ GID='1000'
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Specify the target languages to use when OCR is used to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
5 changes: 5 additions & 0 deletions server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,8 @@ TTS_PROVIDER="native"
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Specify the target languages to use when OCR is used to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
3 changes: 3 additions & 0 deletions server/utils/collectorApi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class CollectorApi {
whisperProvider: process.env.WHISPER_PROVIDER || "local",
WhisperModelPref: process.env.WHISPER_MODEL_PREF,
openAiKey: process.env.OPEN_AI_KEY || null,
ocr: {
langList: process.env.TARGET_OCR_LANG || "eng",
},
};
}

Expand Down
3 changes: 3 additions & 0 deletions server/utils/helpers/updateENV.js
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,9 @@ function dumpENV() {

// Nvidia NIM Keys that are automatically managed
"NVIDIA_NIM_LLM_MODEL_TOKEN_LIMIT",

// OCR Language Support
"TARGET_OCR_LANG",
];

// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.
Expand Down