这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions collector/scripts/watch/convert/as_mbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import datetime
import email.utils
from mailbox import mbox
from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize
from bs4 import BeautifulSoup

def _mbox_part_text(part):
    """Return the decoded text of one non-multipart MIME part."""
    # decode=True undoes the Content-Transfer-Encoding (base64,
    # quoted-printable); without it the stored text is the raw encoded form.
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The message declared a charset Python does not know about.
        return raw.decode('utf-8', errors='replace')


def _mbox_message_text(message):
    """Return the readable body of a message, preferring text/plain parts."""
    if not message.is_multipart():
        body = _mbox_part_text(message)
        if message.get_content_type() == 'text/html':
            return BeautifulSoup(body, 'html.parser').get_text()
        return body

    plain_parts = []
    html_parts = []
    # walk() also reaches parts nested inside multipart/alternative,
    # multipart/related, etc., which a flat get_payload() loop never sees.
    for part in message.walk():
        if part.is_multipart():
            continue
        if part.get_content_disposition() == 'attachment':
            # A text attachment must not replace the message body.
            continue
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            plain_parts.append(_mbox_part_text(part))
        elif content_type == 'text/html':
            markup = _mbox_part_text(part)
            html_parts.append(BeautifulSoup(markup, 'html.parser').get_text())

    # Use the HTML rendering only when no plain-text part exists.
    return "\n".join(plain_parts or html_parts)


def _mbox_date_sent(message):
    """Return the Date header formatted in local time, or None if unusable."""
    raw_date = message['Date']
    if raw_date is None:
        return None
    date_tuple = email.utils.parsedate_tz(str(raw_date))
    if not date_tuple:
        return None
    try:
        local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
    except (OverflowError, ValueError, OSError):
        # Parsable but out-of-range date: treat it like a missing header
        # instead of aborting the whole mailbox.
        return None
    return local_date.strftime("%a, %d %b %Y %H:%M:%S")


# Process all mbox-related documents.
def as_mbox(**kwargs):
    """Convert every message of an mbox file into its own document.

    Keyword Args:
        directory: Folder that contains the mbox file (default 'hotdir').
        filename: Name of the mbox file without its extension.
        ext: Extension including the leading dot (default '.mbox').
        remove_on_complete: Passed to move_source as ``remove``
            (default False).

    Each message is written with write_to_server_documents; once all
    messages are processed the source file is passed to move_source.
    """
    parent_dir = kwargs.get('directory', 'hotdir')
    filename = kwargs.get('filename')
    ext = kwargs.get('ext', '.mbox')
    remove = kwargs.get('remove_on_complete', False)
    fullpath = f"{parent_dir}/{filename}{ext}"

    print(f"-- Working {fullpath} --")
    box = mbox(fullpath)

    # Identical for every message in this file, so computed once.
    slug = slugify(filename)
    published = file_creation_time(fullpath)

    try:
        for message in box:
            content = _mbox_message_text(message)

            data = {
                'id': guid(),
                'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slug}-{guid()}{ext}"),
                'title': f"{filename}{ext}",
                'description': "a custom file uploaded by the user.",
                'published': published,
                'sender': message['From'],
                'recipient': message['To'],
                'subject': message['Subject'],
                'date_sent': _mbox_date_sent(message),
                # NOTE(review): this is a character count, not a word count;
                # left unchanged because consumers of this field are not
                # visible here.
                'wordCount': len(content),
                'pageContent': content,
                'token_count_estimate': len(tokenize(content))
            }

            write_to_server_documents(data, f"{slug}-{data.get('id')}")
    finally:
        # Release the file handle before the source file is moved or removed.
        box.close()

    move_source(parent_dir, f"{filename}{ext}", remove=remove)
    print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
5 changes: 4 additions & 1 deletion collector/scripts/watch/filetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@
from .convert.as_markdown import as_markdown
from .convert.as_pdf import as_pdf
from .convert.as_docx import as_docx, as_odt
from .convert.as_mbox import as_mbox

# Maps a file extension (including the leading dot) to the convert
# function that handles files of that type.
FILETYPES = {
'.txt': as_text,
'.md': as_markdown,
'.pdf': as_pdf,
'.docx': as_docx,
'.odt': as_odt,
'.mbox': as_mbox,
}

# Maps each accepted MIME type to the file extensions allowed for it.
ACCEPTED_MIMES = {
    'text/plain': ['.txt', '.md'],
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
    'application/vnd.oasis.opendocument.text': ['.odt'],
    'application/pdf': ['.pdf'],
    'application/mbox': ['.mbox'],
}