From 06ffdd69d3b560f136afb99be8b74ae1d783a48f Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 1 Nov 2023 16:43:18 -0700 Subject: [PATCH] normalize parser struct for all file types --- collector/scripts/watch/convert/as_docx.py | 2 +- collector/scripts/watch/convert/as_mbox.py | 5 +---- collector/scripts/watch/convert/as_text.py | 1 + 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py index 33aaaaaeb0b..b3778617932 100644 --- a/collector/scripts/watch/convert/as_docx.py +++ b/collector/scripts/watch/convert/as_docx.py @@ -61,7 +61,7 @@ def as_odt(**kwargs): 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': f"{filename}{ext}", - 'author': 'Unknown', # TODO: Find a better author + 'docAuthor': 'Unknown', # TODO: Find a better author 'description': 'Unknown', # TODO: Find a better bescription 'docSource': 'ODT Text file uploaded by the user.', 'chunkSource': f"{filename}{ext}", diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py index f5a645eaa70..2d7c08e637c 100644 --- a/collector/scripts/watch/convert/as_mbox.py +++ b/collector/scripts/watch/convert/as_mbox.py @@ -110,11 +110,8 @@ def as_mbox(**kwargs): "docAuthor": message["From"], "description": f"email from {message['From']} to {message['To']}", "docSource": "mbox file uploaded by the user.", + "chunkSource": subject, "published": file_creation_time(fullpath), - "sender": message["From"], - "recipient": message["To"], - "subject": subject, - "date_sent": date_sent, "wordCount": len(content), "pageContent": content, "token_count_estimate": len(tokenize(content)), diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py index e6ad85140d8..1b897874b4f 100644 --- a/collector/scripts/watch/convert/as_text.py +++ b/collector/scripts/watch/convert/as_text.py @@ -23,6 +23,7 @@ def as_text(**kwargs): 'title': f"{filename}{ext}", 'docAuthor': 'Unknown', # TODO: Find a better author 'description': 'Unknown', # TODO: Find a better description + 'docSource': 'a text file uploaded by the user.', 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content),