这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions collector/scripts/watch/convert/as_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os, re
from slugify import slugify
from langchain.document_loaders import BSHTMLLoader
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize

# Process all html-related documents.
#
# Keyword arguments read from kwargs:
#   directory          - folder containing the source file (default 'hotdir')
#   filename           - file name without its extension (None when not supplied)
#   ext                - file extension including the leading dot (default '.html')
#   remove_on_complete - passed to move_source as `remove` (default False)
#
# Returns a (success, reason) tuple: (False, message) when the extracted text
# is empty, otherwise (True, None).
def as_html(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir')
filename = kwargs.get('filename')
ext = kwargs.get('ext', '.html')
remove = kwargs.get('remove_on_complete', False)
# Location of the source file: <directory>/<filename><ext>
fullpath = f"{parent_dir}/{filename}{ext}"
Comment on lines +8 to +13

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

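Consider declaring the optional arguments directly in the function signature, with their default values, instead of unpacking them from `**kwargs`. This makes the function more readable and maintainable, because the arguments it expects are visible at a glance. Note that the rest of the function body still refers to `parent_dir` and `remove`, so those references must be renamed to `directory` and `remove_on_complete` for this suggestion to work.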

Suggested change
def as_html(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir')
filename = kwargs.get('filename')
ext = kwargs.get('ext', '.html')
remove = kwargs.get('remove_on_complete', False)
fullpath = f"{parent_dir}/{filename}{ext}"
def as_html(directory='hotdir', filename=None, ext='.html', remove_on_complete=False):
fullpath = f"{directory}/{filename}{ext}"
...


# Load the file with BSHTMLLoader and keep only the first returned document.
# NOTE(review): indexing [0] assumes load() returns at least one document — confirm.
loader = BSHTMLLoader(fullpath)
document = loader.load()[0]
# Collapse every run of consecutive newlines into a single newline.
content = re.sub(r"\n+", "\n", document.page_content)

# Stop early and report failure when the extracted text is empty.
if len(content) == 0:
print(f"Resulting text content was empty for {filename}{ext}.")
return(False, f"No text content found in {filename}{ext}")

print(f"-- Working {fullpath} --")
# Record that is handed to write_to_server_documents below.
data = {
'id': guid(),
# Absolute file:// URL of <directory>/processed/<filename><ext>.
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
# Falls back to the file name when the loader metadata has no 'title' entry.
'title': document.metadata.get('title', f"{filename}{ext}"),
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'an HTML file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
# NOTE(review): len(content) is the number of characters, not a word count.
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

# The output name combines the slugified file name with the generated id.
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
# NOTE(review): presumably moves the source out of the watched directory, or
# deletes it when `remove` is true — confirm against move_source in ..utils.
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
3 changes: 3 additions & 0 deletions collector/scripts/watch/filetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .convert.as_pdf import as_pdf
from .convert.as_docx import as_docx, as_odt
from .convert.as_mbox import as_mbox
from .convert.as_html import as_html

FILETYPES = {
'.txt': as_text,
Expand All @@ -11,10 +12,12 @@
'.docx': as_docx,
'.odt': as_odt,
'.mbox': as_mbox,
'.html': as_html,
}

ACCEPTED_MIMES = {
'text/plain': ['.txt', '.md'],
'text/html': ['.html'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'application/pdf': ['.pdf'],
Expand Down