这是indexloc提供的服务,不要输入任何密码
Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
Empty file modified clean.sh
100644 → 100755
Empty file.
6 changes: 4 additions & 2 deletions collector/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ mypy-extensions==1.0.0
nltk==3.8.1
numexpr==2.8.4
numpy==1.23.5
oauthlib==3.2.2
olefile==0.46
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
Expand All @@ -68,8 +69,8 @@ pycparser==2.21
pydantic==1.10.8
pyee==8.2.2
Pygments==2.15.1
PyMuPDF==1.22.5
pypandoc==1.4
pypdf==3.9.0
pyppeteer==1.0.2
pyquery==2.0.0
python-dateutil==2.8.2
Expand All @@ -83,6 +84,7 @@ PyYAML==6.0
regex==2023.5.5
requests==2.31.0
requests-html==0.10.0
requests-oauthlib==1.3.1
rfc3986==1.5.0
rich==13.0.1
six==1.16.0
Expand All @@ -94,6 +96,7 @@ tenacity==8.2.2
text-unidecode==1.3
tiktoken==0.4.0
tqdm==4.65.0
tweepy==4.14.0
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.6.3
Expand All @@ -110,4 +113,3 @@ XlsxWriter==3.1.2
yarl==1.9.2
youtube-transcript-api==0.6.0
zipp==3.15.0
tweepy==4.14.0
8 changes: 4 additions & 4 deletions collector/scripts/gitbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ def gitbook():
data = {
'id': str(uuid4()),
'url': metadata.get('source'),
"title": metadata.get('title'),
"description": metadata.get('title'),
"published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
"wordCount": len(content),
'title': metadata.get('title'),
'description': metadata.get('title'),
'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}
Expand Down
20 changes: 13 additions & 7 deletions collector/scripts/watch/convert/as_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ def as_docx(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'Docx Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
Expand All @@ -45,16 +48,19 @@ def as_odt(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'author': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'ODT Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
13 changes: 8 additions & 5 deletions collector/scripts/watch/convert/as_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ def as_markdown(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'title': f"{filename}", # TODO: find a better metadata
'docAuthor': 'Unknown', # TODO: find a better metadata
'description': 'Unknown', # TODO: find a better metadata
'docSource': 'markdown file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
12 changes: 7 additions & 5 deletions collector/scripts/watch/convert/as_mbox.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import datetime
import datetime
import email.utils
from mailbox import mbox
from slugify import slugify
Expand Down Expand Up @@ -36,12 +36,14 @@ def as_mbox(**kwargs):
date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
else:
date_sent = None

data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'title': message['Subject'],
'docAuthor': message['From'],
'description': f"email {message['From']} to {message['To']}",
'docSource': "mbox file uploaded by the user.",
'published': file_creation_time(fullpath),
'sender': message['From'],
'recipient': message['To'],
Expand Down
44 changes: 36 additions & 8 deletions collector/scripts/watch/convert/as_pdf.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os, time
from langchain.document_loaders import PyPDFLoader
import os, time, fitz
from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize



# Process all text-related documents.
def as_pdf(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir')
Expand All @@ -13,20 +16,45 @@ def as_pdf(**kwargs):
fullpath = f"{parent_dir}/{filename}{ext}"
destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"

loader = PyPDFLoader(fullpath)
pages = loader.load_and_split()
loader = PyMuPDFLoader(fullpath)

# Custom flags for PyMuPDFLoader. https://pymupdf.readthedocs.io/en/latest/app1.html#text-extraction-flags-defaults
mu_flags = (fitz.TEXT_PRESERVE_WHITESPACE
| fitz.TEXT_PRESERVE_LIGATURES
| fitz.TEXT_MEDIABOX_CLIP
| fitz.TEXT_DEHYPHENATE) & ~fitz.TEXT_PRESERVE_SPANS & ~fitz.TEXT_PRESERVE_IMAGES

pages = loader.load(flags=mu_flags)

# The one thing PyMuPDFLoader does not have a flag for is removing all line breaks.
# To achieve the same result as PyPDF, we need to put a single space wherever there is '\n\s' and remove double spaces.

# Best approach so far: str.replace does not understand '\n\s', so we need to do it in two steps.
for page in pages:
page.page_content = page.page_content.replace("\n ", " ").replace("\uFFFD", " ").replace(" ", " ")

text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
pages = text_splitter.split_documents(pages)


print(f"-- Working {fullpath} --")
for page in pages:
pg_num = page.metadata.get('page')
print(f"-- Working page {pg_num} --")

content = page.page_content
title = page.metadata.get('title')
author = page.metadata.get('author')
subject = page.metadata.get('subject')

data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}_pg{pg_num}{ext}",
'description': "a custom file uploaded by the user.",
'title': title if title else 'Untitled',
'docAuthor': author if author else 'Unknown',
'description': subject if subject else 'Unknown',
'docSource': 'pdf file uploaded by the user.',
'chunkSource': f"{filename}_pg{pg_num}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
Expand All @@ -35,4 +63,4 @@ def as_pdf(**kwargs):
write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)

move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
26 changes: 21 additions & 5 deletions collector/scripts/watch/convert/as_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,32 @@ def as_text(**kwargs):

print(f"-- Working {fullpath} --")
data = {
'id': guid(),
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.",
'title': slugify(filename), # TODO: Find a better title
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}

data = {
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
'title': slugify(filename), # TODO: Find a better title
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': "plain text file uploaded by the user.",
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
'pageContent': content,
'token_count_estimate': len(tokenize(content))
}


write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
55 changes: 33 additions & 22 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,42 @@ FROM ubuntu:jammy-20230522 AS base
ARG ARG_UID
ARG ARG_GID

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install system dependencies
# hadolint ignore=DL3008,DL3013
RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
curl libgfortran5 python3 python3-pip tzdata netcat \
libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
xdg-utils && \
curl libgfortran5 python3 python3-pip tzdata netcat \
libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
xdg-utils && \
curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
apt-get install -yq --no-install-recommends nodejs && \
curl -LO https://github.com/yarnpkg/yarn/releases/download/v1.22.19/yarn_1.22.19_all.deb \
&& dpkg -i yarn_1.22.19_all.deb \
&& rm yarn_1.22.19_all.deb && \
&& dpkg -i yarn_1.22.19_all.deb \
&& rm yarn_1.22.19_all.deb && \
curl -LO https://github.com/jgm/pandoc/releases/download/3.1.3/pandoc-3.1.3-1-amd64.deb \
&& dpkg -i pandoc-3.1.3-1-amd64.deb \
&& rm pandoc-3.1.3-1-amd64.deb && \
&& dpkg -i pandoc-3.1.3-1-amd64.deb \
&& rm pandoc-3.1.3-1-amd64.deb && \
rm -rf /var/lib/apt/lists/* /usr/share/icons && \
dpkg-reconfigure -f noninteractive tzdata && \
python3 -m pip install --no-cache-dir virtualenv

# Create a group and user with specific UID and GID
RUN groupadd -g $ARG_GID anythingllm && \
useradd -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
useradd -l -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
mkdir -p /app/frontend/ /app/server/ /app/collector/ && chown -R anythingllm:anythingllm /app

# Copy docker helper scripts
COPY ./docker/docker-entrypoint.sh /usr/local/bin/
COPY ./docker/docker-healthcheck.sh /usr/local/bin/

# Ensure the scripts are executable
RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \
chmod +x /usr/local/bin/docker-healthcheck.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh \
&& chmod +x /usr/local/bin/docker-healthcheck.sh

USER anythingllm

Expand All @@ -47,19 +50,25 @@ WORKDIR /app
FROM base as frontend-deps

COPY ./frontend/package.json ./frontend/yarn.lock ./frontend/
RUN cd ./frontend/ && yarn install && yarn cache clean
WORKDIR /app/frontend
RUN yarn install && yarn cache clean
WORKDIR /app

# Install server dependencies
FROM base as server-deps
COPY ./server/package.json ./server/yarn.lock ./server/
RUN cd ./server/ && yarn install --production && yarn cache clean && \
WORKDIR /app/server
RUN yarn install --production && yarn cache clean && \
rm /app/server/node_modules/vectordb/x86_64-apple-darwin.node && \
rm /app/server/node_modules/vectordb/aarch64-apple-darwin.node
WORKDIR /app

# Build the frontend
FROM frontend-deps as build-stage
COPY ./frontend/ ./frontend/
RUN cd ./frontend/ && yarn build && yarn cache clean
WORKDIR /app/frontend
RUN yarn build && yarn cache clean
WORKDIR /app

# Setup the server
FROM server-deps as production-stage
Expand All @@ -72,10 +81,12 @@ COPY --from=build-stage /app/frontend/dist ./server/public
COPY --chown=anythingllm:anythingllm ./collector/ ./collector/

# Install collector dependencies
RUN cd /app/collector && \
python3 -m virtualenv v-env && \
. v-env/bin/activate && \
pip install --no-cache-dir -r requirements.txt
WORKDIR /app/collector
# hadolint ignore=SC1091
RUN python3 -m virtualenv v-env \
&& source /app/collector/v-env/bin/activate \
&& pip install --no-cache-dir -r requirements.txt
WORKDIR /app

# Setup the environment
ENV NODE_ENV=production
Expand All @@ -86,7 +97,7 @@ EXPOSE 3001

# Setup the healthcheck
HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \
CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1

# Run the server
ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"generate::gcp_deployment": "node cloud-deployments/gcp/deployment/generate.mjs"
},
"private": false
}
}