From 3d06edc8d79bc8e290f3869ac9d8915521ecb6e5 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Tue, 12 Dec 2023 17:27:58 -0800 Subject: [PATCH] patch: remove unidecode as it was transliterating non-latin chars resolves #298 --- collector/scripts/watch/convert/as_pdf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index 8fc1d1f3d2c..7199d6c5e29 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -3,7 +3,6 @@ from slugify import slugify from ..utils import guid, file_creation_time, write_to_server_documents, move_source from ...utils import tokenize -from unidecode import unidecode # Process all PDF-related documents. def as_pdf(**kwargs): @@ -29,7 +28,7 @@ def as_pdf(**kwargs): page_content = '' for page in fitz.open(fullpath): print(f"-- Parsing content from pg {page.number} --") - page_content += unidecode(page.get_text('text')) + page_content += str(page.get_text('text')) if len(page_content) == 0: print(f"Resulting page content was empty - no text could be extracted from the document.")