Mintplex-Labs · franzbischoff · Aug 10, 2023 · Aug 10, 2023 · Aug 10, 2023 · Aug 10, 2023
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/clean.sh b/clean.sh
diff --git a/collector/requirements.txt b/collector/requirements.txt
@@ -54,6 +54,7 @@ mypy-extensions==1.0.0
 nltk==3.8.1
 numexpr==2.8.4
 numpy==1.23.5
+oauthlib==3.2.2
 olefile==0.46
 openapi-schema-pydantic==1.2.4
 openpyxl==3.1.2
@@ -68,8 +69,8 @@ pycparser==2.21
 pydantic==1.10.8
 pyee==8.2.2
 Pygments==2.15.1
+PyMuPDF==1.22.5
 pypandoc==1.4
-pypdf==3.9.0
 pyppeteer==1.0.2
 pyquery==2.0.0
 python-dateutil==2.8.2
@@ -83,6 +84,7 @@ PyYAML==6.0
 regex==2023.5.5
 requests==2.31.0
 requests-html==0.10.0
+requests-oauthlib==1.3.1
 rfc3986==1.5.0
 rich==13.0.1
 six==1.16.0
@@ -94,6 +96,7 @@ tenacity==8.2.2
 text-unidecode==1.3
 tiktoken==0.4.0
 tqdm==4.65.0
+tweepy==4.14.0
 typer==0.9.0
 typing-inspect==0.9.0
 typing_extensions==4.6.3
@@ -110,4 +113,3 @@ XlsxWriter==3.1.2
 yarl==1.9.2
 youtube-transcript-api==0.6.0
 zipp==3.15.0
-tweepy==4.14.0 
diff --git a/collector/scripts/gitbook.py b/collector/scripts/gitbook.py
@@ -29,10 +29,10 @@ def gitbook():
     data = {
       'id': str(uuid4()),
       'url': metadata.get('source'),
-      "title": metadata.get('title'),
-      "description": metadata.get('title'),
-      "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
-      "wordCount": len(content),
+      'title': metadata.get('title'),
+      'description': metadata.get('title'),
+      'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
+      'wordCount': len(content),
       'pageContent': content,
       'token_count_estimate': len(tokenize(content))
     }

diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py
@@ -18,16 +18,19 @@ def as_docx(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'Docx Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
   print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
@@ -45,16 +48,19 @@ def as_odt(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'author': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'ODT Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py
@@ -18,16 +18,19 @@ def as_markdown(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-    'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'title':  f"{filename}", # TODO: find a better metadata
+    'docAuthor': 'Unknown', # TODO: find a better metadata
+    'description': 'Unknown', # TODO: find a better metadata
+    'docSource': 'markdown file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py
@@ -1,5 +1,5 @@
 import os
-import datetime  
+import datetime
 import email.utils
 from mailbox import mbox
 from slugify import slugify
@@ -36,12 +36,14 @@ def as_mbox(**kwargs):
             date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
         else:
             date_sent = None
-            
+
         data = {
-            'id': guid(), 
+            'id': guid(),
             'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
-            'title': f"{filename}{ext}",
-            'description': "a custom file uploaded by the user.",
+            'title': message['Subject'],
+            'docAuthor': message['From'],
+            'description': f"email {message['From']} to {message['To']}",
+            'docSource': "mbox file uploaded by the user.",
             'published': file_creation_time(fullpath),
             'sender': message['From'],
             'recipient': message['To'],

diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py
@@ -1,9 +1,12 @@
-import os, time
-from langchain.document_loaders import PyPDFLoader
+import os, time, fitz
+from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
+from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
 from slugify import slugify
 from ..utils import guid, file_creation_time, write_to_server_documents, move_source
 from ...utils import tokenize
 
+
+
 # Process all text-related documents.
 def as_pdf(**kwargs):
   parent_dir = kwargs.get('directory', 'hotdir')
@@ -13,20 +16,45 @@ def as_pdf(**kwargs):
   fullpath = f"{parent_dir}/{filename}{ext}"
   destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"
 
-  loader = PyPDFLoader(fullpath)
-  pages = loader.load_and_split()
+  loader = PyMuPDFLoader(fullpath)
+
+  # Custom flags for PyMuPDFLoader. https://pymupdf.readthedocs.io/en/latest/app1.html#text-extraction-flags-defaults
+  mu_flags = (fitz.TEXT_PRESERVE_WHITESPACE
+              | fitz.TEXT_PRESERVE_LIGATURES
+              | fitz.TEXT_MEDIABOX_CLIP
+              | fitz.TEXT_DEHYPHENATE) & ~fitz.TEXT_PRESERVE_SPANS & ~fitz.TEXT_PRESERVE_IMAGES
+
+  pages = loader.load(flags=mu_flags)
+
+  # The only thing PyMuPDFLoader does not have a flag for is removing all line breaks.
+  # Compared with PyPDF, to achieve the same result we need to add a space where there is '\n\s' and remove double spaces.
+
+  # Best so far: replace() does not understand '\n\s', so we need to do it in two steps.
+  for page in pages:
+    page.page_content = page.page_content.replace("\n ", " ").replace("\uFFFD", " ").replace("  ", " ")
+
+  text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
+  pages = text_splitter.split_documents(pages)
+
 
   print(f"-- Working {fullpath} --")
   for page in pages:
     pg_num = page.metadata.get('page')
     print(f"-- Working page {pg_num} --")
 
     content = page.page_content
+    title = page.metadata.get('title')
+    author = page.metadata.get('author')
+    subject = page.metadata.get('subject')
+
     data = {
-      'id': guid(), 
+      'id': guid(),
       'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-      'title': f"{filename}_pg{pg_num}{ext}",
-      'description': "a custom file uploaded by the user.",
+      'title': title if title else 'Untitled',
+      'docAuthor': author if author else 'Unknown',
+      'description': subject if subject else 'Unknown',
+      'docSource': 'pdf file uploaded by the user.',
+      'chunkSource': f"{filename}_pg{pg_num}{ext}",
       'published': file_creation_time(fullpath),
       'wordCount': len(content),
       'pageContent': content,
@@ -35,4 +63,4 @@ def as_pdf(**kwargs):
     write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)
 
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py
@@ -14,16 +14,32 @@ def as_text(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-    'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'title': slugify(filename), # TODO: Find a better title
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-
+  data = {
+    'id': guid(),
+    'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
+    'title': slugify(filename), # TODO: Find a better title
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': "plain text file uploaded by the user.",
+    'chunkSource': f"{filename}{ext}",
+    'published': file_creation_time(fullpath),
+    'wordCount': len(content),
+    'pageContent': content,
+    'token_count_estimate': len(tokenize(content))
+  }
+
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -5,39 +5,42 @@ FROM ubuntu:jammy-20230522 AS base
 ARG ARG_UID
 ARG ARG_GID
 
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
 # Install system dependencies
+# hadolint ignore=DL3008,DL3013
 RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
     DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
-        curl libgfortran5 python3 python3-pip tzdata netcat \
-        libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
-        libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
-        libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
-        libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
-        xdg-utils && \
+    curl libgfortran5 python3 python3-pip tzdata netcat \
+    libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
+    libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
+    libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
+    libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
+    xdg-utils && \
     curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
     apt-get install -yq --no-install-recommends nodejs && \
     curl -LO https://github.com/yarnpkg/yarn/releases/download/v1.22.19/yarn_1.22.19_all.deb \
-        && dpkg -i yarn_1.22.19_all.deb \
-        && rm yarn_1.22.19_all.deb && \
+    && dpkg -i yarn_1.22.19_all.deb \
+    && rm yarn_1.22.19_all.deb && \
     curl -LO https://github.com/jgm/pandoc/releases/download/3.1.3/pandoc-3.1.3-1-amd64.deb \
-        && dpkg -i pandoc-3.1.3-1-amd64.deb \
-        && rm pandoc-3.1.3-1-amd64.deb && \
+    && dpkg -i pandoc-3.1.3-1-amd64.deb \
+    && rm pandoc-3.1.3-1-amd64.deb && \
     rm -rf /var/lib/apt/lists/* /usr/share/icons && \
     dpkg-reconfigure -f noninteractive tzdata && \
     python3 -m pip install --no-cache-dir virtualenv
 
 # Create a group and user with specific UID and GID
 RUN groupadd -g $ARG_GID anythingllm && \
-    useradd -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
+    useradd -l -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
     mkdir -p /app/frontend/ /app/server/ /app/collector/ && chown -R anythingllm:anythingllm /app
 
 # Copy docker helper scripts
 COPY ./docker/docker-entrypoint.sh /usr/local/bin/
 COPY ./docker/docker-healthcheck.sh /usr/local/bin/
 
 # Ensure the scripts are executable
-RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \
-    chmod +x /usr/local/bin/docker-healthcheck.sh
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh  \
+    && chmod +x /usr/local/bin/docker-healthcheck.sh
 
 USER anythingllm
 
@@ -47,19 +50,25 @@ WORKDIR /app
 FROM base as frontend-deps
 
 COPY ./frontend/package.json ./frontend/yarn.lock ./frontend/
-RUN cd ./frontend/ && yarn install && yarn cache clean
+WORKDIR /app/frontend
+RUN yarn install && yarn cache clean
+WORKDIR /app
 
 # Install server dependencies
 FROM base as server-deps
 COPY ./server/package.json ./server/yarn.lock ./server/
-RUN cd ./server/ && yarn install --production && yarn cache clean && \
+WORKDIR /app/server
+RUN yarn install --production && yarn cache clean && \
     rm /app/server/node_modules/vectordb/x86_64-apple-darwin.node && \
     rm /app/server/node_modules/vectordb/aarch64-apple-darwin.node
+WORKDIR /app
 
 # Build the frontend
 FROM frontend-deps as build-stage
 COPY ./frontend/ ./frontend/
-RUN cd ./frontend/ && yarn build && yarn cache clean
+WORKDIR /app/frontend
+RUN yarn build && yarn cache clean
+WORKDIR /app
 
 # Setup the server
 FROM server-deps as production-stage
@@ -72,10 +81,12 @@ COPY --from=build-stage /app/frontend/dist ./server/public
 COPY --chown=anythingllm:anythingllm ./collector/ ./collector/
 
 # Install collector dependencies
-RUN cd /app/collector && \
-    python3 -m virtualenv v-env && \
-    . v-env/bin/activate && \
-    pip install --no-cache-dir -r requirements.txt
+WORKDIR /app/collector
+# hadolint ignore=SC1091
+RUN python3 -m virtualenv v-env \
+    && source /app/collector/v-env/bin/activate \
+    && pip install --no-cache-dir -r requirements.txt
+WORKDIR /app
 
 # Setup the environment
 ENV NODE_ENV=production
@@ -86,7 +97,7 @@ EXPOSE 3001
 
 # Setup the healthcheck
 HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \
-  CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
+    CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
 
 # Run the server
-ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
+ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
diff --git a/package.json b/package.json
@@ -20,4 +20,4 @@
     "generate::gcp_deployment": "node cloud-deployments/gcp/deployment/generate.mjs"
   },
   "private": false
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,4 +20,4 @@ @@
         "generate::gcp_deployment": "node cloud-deployments/gcp/deployment/generate.mjs"
       },
       "private": false
-    }
+    }