这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
7 changes: 5 additions & 2 deletions collector/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ mypy-extensions==1.0.0
nltk==3.8.1
numexpr==2.8.4
numpy==1.23.5
oauthlib==3.2.2
olefile==0.46
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
Expand All @@ -68,8 +69,8 @@ pycparser==2.21
pydantic==1.10.8
pyee==8.2.2
Pygments==2.15.1
PyMuPDF==1.22.5
pypandoc==1.4
pypdf==3.9.0
pyppeteer==1.0.2
pyquery==2.0.0
python-dateutil==2.8.2
Expand All @@ -83,6 +84,7 @@ PyYAML==6.0
regex==2023.5.5
requests==2.31.0
requests-html==0.10.0
requests-oauthlib==1.3.1
rfc3986==1.5.0
rich==13.0.1
six==1.16.0
Expand All @@ -94,9 +96,11 @@ tenacity==8.2.2
text-unidecode==1.3
tiktoken==0.4.0
tqdm==4.65.0
tweepy==4.14.0
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.6.3
Unidecode==1.3.6
unstructured==0.7.1
urllib3==1.26.16
uuid==1.30
Expand All @@ -110,4 +114,3 @@ XlsxWriter==3.1.2
yarl==1.9.2
youtube-transcript-api==0.6.0
zipp==3.15.0
tweepy==4.14.0
8 changes: 4 additions & 4 deletions collector/scripts/gitbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ def gitbook():
data = {
'id': str(uuid4()),
'url': metadata.get('source'),
"title": metadata.get('title'),
"description": metadata.get('title'),
"published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
"wordCount": len(content),
'title': metadata.get('title'),
'description': metadata.get('title'),
'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}
Expand Down
20 changes: 13 additions & 7 deletions collector/scripts/watch/convert/as_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ def as_docx(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'Docx Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
Expand All @@ -45,16 +48,19 @@ def as_odt(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'author': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'ODT Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
13 changes: 8 additions & 5 deletions collector/scripts/watch/convert/as_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ def as_markdown(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'title': f"{filename}", # TODO: find a better metadata
'docAuthor': 'Unknown', # TODO: find a better metadata
'description': 'Unknown', # TODO: find a better metadata
'docSource': 'markdown file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
12 changes: 7 additions & 5 deletions collector/scripts/watch/convert/as_mbox.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import datetime
import datetime
import email.utils
from mailbox import mbox
from slugify import slugify
Expand Down Expand Up @@ -36,12 +36,14 @@ def as_mbox(**kwargs):
date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
else:
date_sent = None

data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'title': message['Subject'],
'docAuthor': message['From'],
'description': f"email {message['From']} to {message['To']}",
'docSource': "mbox file uploaded by the user.",
'published': file_creation_time(fullpath),
'sender': message['From'],
'recipient': message['To'],
Expand Down
61 changes: 38 additions & 23 deletions collector/scripts/watch/convert/as_pdf.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,53 @@
import os, time
from langchain.document_loaders import PyPDFLoader
import os, fitz
from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize
from unidecode import unidecode

# Process all text-related documents.
# Process all PDF-related documents.
def as_pdf(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir')
filename = kwargs.get('filename')
ext = kwargs.get('ext', '.txt')
remove = kwargs.get('remove_on_complete', False)
fullpath = f"{parent_dir}/{filename}{ext}"
destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"

loader = PyPDFLoader(fullpath)
pages = loader.load_and_split()

print(f"-- Working {fullpath} --")
for page in pages:
pg_num = page.metadata.get('page')
print(f"-- Working page {pg_num} --")
loader = PyMuPDFLoader(fullpath)
pages = loader.load()

if len(pages) == 0:
print(f"{fullpath} parsing resulted in no pages - nothing to do.")
return False

# Set doc to the first page so we can still get the metadata from PyMuPDF but without all the unicode issues.
doc = pages[0]
del loader
del pages

page_content = ''
for page in fitz.open(fullpath):
print(f"-- Parsing content from pg {page.number} --")
page_content += unidecode(page.get_text('text'))

content = page.page_content
data = {
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}_pg{pg_num}{ext}",
'description': "a custom file uploaded by the user.",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}
write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)
title = doc.metadata.get('title')
author = doc.metadata.get('author')
subject = doc.metadata.get('subject')
data = {
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': title if title else f"{filename}{ext}",
'docAuthor': author if author else 'No author found',
'description': subject if subject else 'No description found.',
'docSource': 'pdf file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(page_content), # Technically a letter count :p
'pageContent': page_content,
'token_count_estimate': len(tokenize(page_content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
10 changes: 6 additions & 4 deletions collector/scripts/watch/convert/as_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,18 @@ def as_text(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,9 @@ export default function DocumentSettings({ workspace }) {
</div>
</div>
<div
className={`flex items-center ${canDelete ? "justify-between" : "justify-end"
} p-4 md:p-6 space-x-2 border-t border-gray-200 rounded-b dark:border-gray-600`}
className={`flex items-center ${
canDelete ? "justify-between" : "justify-end"
} p-4 md:p-6 space-x-2 border-t border-gray-200 rounded-b dark:border-gray-600`}
>
<button
hidden={!canDelete}
Expand Down
12 changes: 5 additions & 7 deletions server/endpoints/system.js
Original file line number Diff line number Diff line change
Expand Up @@ -448,13 +448,11 @@ function systemEndpoints(app) {
response.status(200).json({ canDelete });
} catch (error) {
console.error("Error fetching can delete workspaces:", error);
response
.status(500)
.json({
success: false,
message: "Internal server error",
canDelete: false,
});
response.status(500).json({
success: false,
message: "Internal server error",
canDelete: false,
});
}
}
);
Expand Down