这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions collector/scripts/watch/convert/as_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def as_docx(**kwargs):
data = loader.load()[0]
content = data.page_content

if len(content) == 0:
print(f"Resulting text content was empty for {filename}{ext}.")
return(False, f"No text content found in {filename}{ext}")

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
Expand All @@ -33,7 +37,9 @@ def as_docx(**kwargs):

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)

def as_odt(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir')
Expand All @@ -46,6 +52,10 @@ def as_odt(**kwargs):
data = loader.load()[0]
content = data.page_content

if len(content) == 0:
print(f"Resulting text content was empty for {filename}{ext}.")
return(False, f"No text content found in {filename}{ext}")

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
Expand All @@ -63,4 +73,6 @@ def as_odt(**kwargs):

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
6 changes: 6 additions & 0 deletions collector/scripts/watch/convert/as_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def as_markdown(**kwargs):
data = loader.load()[0]
content = data.page_content

if len(content) == 0:
print(f"Resulting page content was empty - no text could be extracted from {filename}{ext}.")
return(False, f"No text could be extracted from {filename}{ext}.")

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
Expand All @@ -33,4 +37,6 @@ def as_markdown(**kwargs):

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
2 changes: 2 additions & 0 deletions collector/scripts/watch/convert/as_mbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,7 @@ def as_mbox(**kwargs):
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")

move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
8 changes: 7 additions & 1 deletion collector/scripts/watch/convert/as_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def as_pdf(**kwargs):

if len(pages) == 0:
print(f"{fullpath} parsing resulted in no pages - nothing to do.")
return False
return(False, f"No pages found for {filename}{ext}!")

# Set doc to the first page so we can still get the metadata from PyMuPDF but without all the unicode issues.
doc = pages[0]
Expand All @@ -31,6 +31,10 @@ def as_pdf(**kwargs):
print(f"-- Parsing content from pg {page.number} --")
page_content += unidecode(page.get_text('text'))

if len(page_content) == 0:
print(f"Resulting page content was empty - no text could be extracted from the document.")
return(False, f"No text content could be extracted from {filename}{ext}!")

title = doc.metadata.get('title')
author = doc.metadata.get('author')
subject = doc.metadata.get('subject')
Expand All @@ -50,4 +54,6 @@ def as_pdf(**kwargs):

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
6 changes: 6 additions & 0 deletions collector/scripts/watch/convert/as_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ def as_text(**kwargs):
fullpath = f"{parent_dir}/{filename}{ext}"
content = open(fullpath).read()

if len(content) == 0:
print(f"Resulting text content was empty for {filename}{ext}.")
return(False, f"No text content found in {filename}{ext}")

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
Expand All @@ -28,4 +32,6 @@ def as_text(**kwargs):

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)

print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
return(True, None)
8 changes: 4 additions & 4 deletions collector/scripts/watch/process_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ def process_single(directory, target_doc):
move_source(new_destination_filename=target_doc, failed=True, remove=True)
return (False, f"{fileext} not a supported file type for conversion. It will not be processed.")

FILETYPES[fileext](
# Returns Tuple of (Boolean, String|None) of success status and possible error message.
# Error message will display to user.
return FILETYPES[fileext](
directory=directory,
filename=filename,
ext=fileext,
remove_on_complete=True # remove source document to save disk space.
)

return (True, None)
)
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ function FileUploadProgressComponent({
onUploadError,
}) {
const [timerMs, setTimerMs] = useState(10);
const [status, setStatus] = useState(file?.rejected ? "uploading" : "failed");
const [status, setStatus] = useState("pending");
const [error, setError] = useState("");

useEffect(() => {
async function uploadFile() {
Expand All @@ -31,6 +32,7 @@ function FileUploadProgressComponent({
setStatus("failed");
clearInterval(timer);
onUploadError(data.error);
setError(data.error);
} else {
setStatus("complete");
clearInterval(timer);
Expand Down Expand Up @@ -58,6 +60,24 @@ function FileUploadProgressComponent({
);
}

if (status === "failed") {
return (
<div className="w-fit px-2 py-2 flex items-center gap-x-4 rounded-lg bg-blue-100 border-blue-600 dark:bg-stone-800 bg-opacity-50 border dark:border-stone-600">
<div className="w-6 h-6">
<XCircle className="w-6 h-6 stroke-white bg-red-500 rounded-full p-1 w-full h-full" />
</div>
<div className="flex flex-col">
<p className="text-black dark:text-stone-200 text-sm font-mono overflow-x-scroll">
{truncate(file.name, 30)}
</p>
<p className="text-red-700 dark:text-red-400 text-xs font-mono">
{error}
</p>
</div>
</div>
);
}

return (
<div className="w-fit px-2 py-2 flex items-center gap-x-4 rounded-lg bg-blue-100 border-blue-600 dark:bg-stone-800 bg-opacity-50 border dark:border-stone-600">
<div className="w-6 h-6">
Expand All @@ -77,6 +97,8 @@ function FileUploadProgressComponent({
</div>
</div>
);

return null;
}

export default memo(FileUploadProgressComponent);