From 61cbae52b079568eb4848debebcdd270a4cf5514 Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Thu, 10 Aug 2023 03:32:54 +0100
Subject: [PATCH 1/6] cosmetic changes to be compatible with hadolint

---
 docker/Dockerfile | 55 ++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1625263ddb3..d9013fefb1a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,30 +5,33 @@ FROM ubuntu:jammy-20230522 AS base
 ARG ARG_UID
 ARG ARG_GID
 
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
 # Install system dependencies
+# hadolint ignore=DL3008,DL3013
 RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
     DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
-        curl libgfortran5 python3 python3-pip tzdata netcat \
-        libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
-        libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
-        libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
-        libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
-        xdg-utils && \
+    curl libgfortran5 python3 python3-pip tzdata netcat \
+    libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \
+    libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
+    libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
+    libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \
+    xdg-utils && \
     curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
     apt-get install -yq --no-install-recommends nodejs && \
     curl -LO https://github.com/yarnpkg/yarn/releases/download/v1.22.19/yarn_1.22.19_all.deb \
-        && dpkg -i yarn_1.22.19_all.deb \
-        && rm yarn_1.22.19_all.deb && \
+    && dpkg -i yarn_1.22.19_all.deb \
+    && rm yarn_1.22.19_all.deb && \
     curl -LO https://github.com/jgm/pandoc/releases/download/3.1.3/pandoc-3.1.3-1-amd64.deb \
-        && dpkg -i pandoc-3.1.3-1-amd64.deb \
-        && rm pandoc-3.1.3-1-amd64.deb && \
+    && dpkg -i pandoc-3.1.3-1-amd64.deb \
+    && rm pandoc-3.1.3-1-amd64.deb && \
     rm -rf /var/lib/apt/lists/* /usr/share/icons && \
     dpkg-reconfigure -f noninteractive tzdata && \
     python3 -m pip install --no-cache-dir virtualenv
 
 # Create a group and user with specific UID and GID
 RUN groupadd -g $ARG_GID anythingllm && \
-    useradd -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
+    useradd -l -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \
     mkdir -p /app/frontend/ /app/server/ /app/collector/ && chown -R anythingllm:anythingllm /app
 
 # Copy docker helper scripts
@@ -36,8 +39,8 @@ COPY ./docker/docker-entrypoint.sh /usr/local/bin/
 COPY ./docker/docker-healthcheck.sh /usr/local/bin/
 
 # Ensure the scripts are executable
-RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \
-    chmod +x /usr/local/bin/docker-healthcheck.sh
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh  \
+    && chmod +x /usr/local/bin/docker-healthcheck.sh
 
 USER anythingllm
 
@@ -47,19 +50,25 @@ WORKDIR /app
 FROM base as frontend-deps
 
 COPY ./frontend/package.json ./frontend/yarn.lock ./frontend/
-RUN cd ./frontend/ && yarn install && yarn cache clean
+WORKDIR /app/frontend
+RUN yarn install && yarn cache clean
+WORKDIR /app
 
 # Install server dependencies
 FROM base as server-deps
 COPY ./server/package.json ./server/yarn.lock ./server/
-RUN cd ./server/ && yarn install --production && yarn cache clean && \
+WORKDIR /app/server
+RUN yarn install --production && yarn cache clean && \
     rm /app/server/node_modules/vectordb/x86_64-apple-darwin.node && \
     rm /app/server/node_modules/vectordb/aarch64-apple-darwin.node
+WORKDIR /app
 
 # Build the frontend
 FROM frontend-deps as build-stage
 COPY ./frontend/ ./frontend/
-RUN cd ./frontend/ && yarn build && yarn cache clean
+WORKDIR /app/frontend
+RUN yarn build && yarn cache clean
+WORKDIR /app
 
 # Setup the server
 FROM server-deps as production-stage
@@ -72,10 +81,12 @@ COPY --from=build-stage /app/frontend/dist ./server/public
 COPY ./collector/ ./collector/
 
 # Install collector dependencies
-RUN cd /app/collector && \
-    python3 -m virtualenv v-env && \
-    . v-env/bin/activate && \
-    pip install --no-cache-dir -r requirements.txt
+WORKDIR /app/collector
+# hadolint ignore=SC1091
+RUN python3 -m virtualenv v-env \
+    && source /app/collector/v-env/bin/activate \
+    && pip install --no-cache-dir -r requirements.txt
+WORKDIR /app
 
 # Setup the environment
 ENV NODE_ENV=production
@@ -86,7 +97,7 @@ EXPOSE 3001
 
 # Setup the healthcheck
 HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \
-  CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
+    CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
 
 # Run the server
-ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]

From 0bd1cfe8b306469f6764b26202191b2cfab8a8bc Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Thu, 10 Aug 2023 03:34:06 +0100
Subject: [PATCH 2/6] common configuration for most editors until better
 plugins come up

---
 .editorconfig | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 .editorconfig

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000000..5d47c21c4e1
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true

From f767989eb75688b4990475a6233db3e985560f19 Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Thu, 10 Aug 2023 03:36:14 +0100
Subject: [PATCH 3/6] Changes on PDF metadata, using PyMuPDF (faster and more
 compatible)

---
 collector/requirements.txt                |  6 ++--
 collector/scripts/watch/convert/as_pdf.py | 44 ++++++++++++++++++-----
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/collector/requirements.txt b/collector/requirements.txt
index 7fc4f05d390..2bb3b71d128 100644
--- a/collector/requirements.txt
+++ b/collector/requirements.txt
@@ -54,6 +54,7 @@ mypy-extensions==1.0.0
 nltk==3.8.1
 numexpr==2.8.4
 numpy==1.23.5
+oauthlib==3.2.2
 olefile==0.46
 openapi-schema-pydantic==1.2.4
 openpyxl==3.1.2
@@ -68,8 +69,8 @@ pycparser==2.21
 pydantic==1.10.8
 pyee==8.2.2
 Pygments==2.15.1
+PyMuPDF==1.22.5
 pypandoc==1.4
-pypdf==3.9.0
 pyppeteer==1.0.2
 pyquery==2.0.0
 python-dateutil==2.8.2
@@ -83,6 +84,7 @@ PyYAML==6.0
 regex==2023.5.5
 requests==2.31.0
 requests-html==0.10.0
+requests-oauthlib==1.3.1
 rfc3986==1.5.0
 rich==13.0.1
 six==1.16.0
@@ -94,6 +96,7 @@ tenacity==8.2.2
 text-unidecode==1.3
 tiktoken==0.4.0
 tqdm==4.65.0
+tweepy==4.14.0
 typer==0.9.0
 typing-inspect==0.9.0
 typing_extensions==4.6.3
@@ -110,4 +113,3 @@ XlsxWriter==3.1.2
 yarl==1.9.2
 youtube-transcript-api==0.6.0
 zipp==3.15.0
-tweepy==4.14.0 
diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py
index 12163cf2619..066e6f51c61 100644
--- a/collector/scripts/watch/convert/as_pdf.py
+++ b/collector/scripts/watch/convert/as_pdf.py
@@ -1,9 +1,12 @@
-import os, time
-from langchain.document_loaders import PyPDFLoader
+import os, time, fitz
+from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
+from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
 from slugify import slugify
 from ..utils import guid, file_creation_time, write_to_server_documents, move_source
 from ...utils import tokenize
 
+
+
 # Process all text-related documents.
 def as_pdf(**kwargs):
   parent_dir = kwargs.get('directory', 'hotdir')
@@ -13,8 +16,26 @@ def as_pdf(**kwargs):
   fullpath = f"{parent_dir}/{filename}{ext}"
   destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"
 
-  loader = PyPDFLoader(fullpath)
-  pages = loader.load_and_split()
+  loader = PyMuPDFLoader(fullpath)
+
+  # Custom flags for PyMuPDFLoader. https://pymupdf.readthedocs.io/en/latest/app1.html#text-extraction-flags-defaults
+  mu_flags = (fitz.TEXT_PRESERVE_WHITESPACE
+              | fitz.TEXT_PRESERVE_LIGATURES
+              | fitz.TEXT_MEDIABOX_CLIP
+              | fitz.TEXT_DEHYPHENATE) & ~fitz.TEXT_PRESERVE_SPANS & ~fitz.TEXT_PRESERVE_IMAGES
+
+  pages = loader.load(flags=mu_flags)
+
+  # The only thing PyMuPDFLoader does not have a flag for is removing all line breaks.
+  # Compared with PyPDF, to achieve the same result we need to replace '\n\s' with a space and remove double spaces.
+
+  # Best so fot, replace didn't understood '\n\s' so we need to do it in two steps
+  for page in pages:
+    page.page_content = page.page_content.replace("\n ", " ").replace("  ", " ")
+
+  text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
+  pages = text_splitter.split_documents(pages)
+
 
   print(f"-- Working {fullpath} --")
   for page in pages:
@@ -22,11 +43,18 @@ def as_pdf(**kwargs):
     print(f"-- Working page {pg_num} --")
 
     content = page.page_content
+    title = page.metadata.get('title')
+    author = page.metadata.get('author')
+    subject = page.metadata.get('subject')
+
     data = {
-      'id': guid(), 
+      'id': guid(),
       'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-      'title': f"{filename}_pg{pg_num}{ext}",
-      'description': "a custom file uploaded by the user.",
+      'title': title if title else 'Untitled',
+      'author': author if author else 'Unknown',
+      'description': subject if subject else 'Unknown',
+      'document_source': 'pdf file uploaded by the user.',
+      'chunk_source': f"{filename}_pg{pg_num}{ext}",
       'published': file_creation_time(fullpath),
       'wordCount': len(content),
       'pageContent': content,
@@ -35,4 +63,4 @@ def as_pdf(**kwargs):
     write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)
 
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")

From b7d268cbd2908372ffe529044e7ae1e8bfa4f64d Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Thu, 10 Aug 2023 03:37:01 +0100
Subject: [PATCH 4/6] small changes on other file ingestions in order to try to
 keep the fields equal

---
 .../scripts/watch/convert/as_markdown.py      | 13 ++++++++-----
 collector/scripts/watch/convert/as_mbox.py    | 12 +++++++-----
 collector/scripts/watch/convert/as_text.py    | 19 ++++++++++++++++---
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py
index 49cf538c0de..3bd7270c62c 100644
--- a/collector/scripts/watch/convert/as_markdown.py
+++ b/collector/scripts/watch/convert/as_markdown.py
@@ -18,16 +18,19 @@ def as_markdown(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-    'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'title':  f"{filename}", # TODO: find a better metadata
+    'author': "Unknown", # TODO: find a better metadata
+    'description': "Unknown", # TODO: find a better metadata
+    'document_source': "markdown file uploaded by the user.",
+    'chuck_source': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py
index 0fa17985475..4e5ed91470b 100644
--- a/collector/scripts/watch/convert/as_mbox.py
+++ b/collector/scripts/watch/convert/as_mbox.py
@@ -1,5 +1,5 @@
 import os
-import datetime  
+import datetime
 import email.utils
 from mailbox import mbox
 from slugify import slugify
@@ -36,12 +36,14 @@ def as_mbox(**kwargs):
             date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
         else:
             date_sent = None
-            
+
         data = {
-            'id': guid(), 
+            'id': guid(),
             'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
-            'title': f"{filename}{ext}",
-            'description': "a custom file uploaded by the user.",
+            'title': message['Subject'],
+            'author': message['From'],
+            'description': f"email {message['From']} to {message['To']}",
+            'document_source': "mbox file uploaded by the user.",
             'published': file_creation_time(fullpath),
             'sender': message['From'],
             'recipient': message['To'],
diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py
index a9935b48ae9..c96f0579b64 100644
--- a/collector/scripts/watch/convert/as_text.py
+++ b/collector/scripts/watch/convert/as_text.py
@@ -14,7 +14,7 @@ def as_text(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
     'description': "a custom file uploaded by the user.",
@@ -23,7 +23,20 @@ def as_text(**kwargs):
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+  data = {
+    'id': guid(),
+    'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
+    'title': slugify(filename), # TODO: Find a better title
+    'author': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'document_source': "plain text file uploaded by the user.",
+    'published': file_creation_time(fullpath),
+    'wordCount': len(content),
+    'pageContent': content,
+    'token_count_estimate': len(tokenize(content))
+  }
+
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")

From 292bb563bf09f07cee78e0e93b9b5560b004c8f5 Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Thu, 10 Aug 2023 11:36:32 +0100
Subject: [PATCH 5/6] Lint, review, and review

---
 clean.sh                                      |  0
 collector/scripts/gitbook.py                  |  8 ++++----
 collector/scripts/watch/convert/as_docx.py    | 20 ++++++++++++-------
 .../scripts/watch/convert/as_markdown.py      |  8 ++++----
 collector/scripts/watch/convert/as_mbox.py    |  4 ++--
 collector/scripts/watch/convert/as_pdf.py     |  6 +++---
 collector/scripts/watch/convert/as_text.py    | 11 ++++++----
 package.json                                  |  2 +-
 8 files changed, 34 insertions(+), 25 deletions(-)
 mode change 100644 => 100755 clean.sh

diff --git a/clean.sh b/clean.sh
old mode 100644
new mode 100755
diff --git a/collector/scripts/gitbook.py b/collector/scripts/gitbook.py
index 76da8050553..98625bf8983 100644
--- a/collector/scripts/gitbook.py
+++ b/collector/scripts/gitbook.py
@@ -29,10 +29,10 @@ def gitbook():
     data = {
       'id': str(uuid4()),
       'url': metadata.get('source'),
-      "title": metadata.get('title'),
-      "description": metadata.get('title'),
-      "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
-      "wordCount": len(content),
+      'title': metadata.get('title'),
+      'description': metadata.get('title'),
+      'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
+      'wordCount': len(content),
       'pageContent': content,
       'token_count_estimate': len(tokenize(content))
     }
diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py
index ade70e579f5..6d16650e3e5 100644
--- a/collector/scripts/watch/convert/as_docx.py
+++ b/collector/scripts/watch/convert/as_docx.py
@@ -18,16 +18,19 @@ def as_docx(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'Docx Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
   print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
@@ -45,16 +48,19 @@ def as_odt(**kwargs):
 
   print(f"-- Working {fullpath} --")
   data = {
-    'id': guid(), 
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'author': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'ODT Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-  
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py
index 3bd7270c62c..3e1a3dba1a7 100644
--- a/collector/scripts/watch/convert/as_markdown.py
+++ b/collector/scripts/watch/convert/as_markdown.py
@@ -21,10 +21,10 @@ def as_markdown(**kwargs):
     'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title':  f"{filename}", # TODO: find a better metadata
-    'author': "Unknown", # TODO: find a better metadata
-    'description': "Unknown", # TODO: find a better metadata
-    'document_source': "markdown file uploaded by the user.",
-    'chuck_source': f"{filename}{ext}",
+    'docAuthor': 'Unknown', # TODO: find a better metadata
+    'description': 'Unknown', # TODO: find a better metadata
+    'docSource': 'markdown file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py
index 4e5ed91470b..96c9b25bb55 100644
--- a/collector/scripts/watch/convert/as_mbox.py
+++ b/collector/scripts/watch/convert/as_mbox.py
@@ -41,9 +41,9 @@ def as_mbox(**kwargs):
             'id': guid(),
             'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
             'title': message['Subject'],
-            'author': message['From'],
+            'docAuthor': message['From'],
             'description': f"email {message['From']} to {message['To']}",
-            'document_source': "mbox file uploaded by the user.",
+            'docSource': "mbox file uploaded by the user.",
             'published': file_creation_time(fullpath),
             'sender': message['From'],
             'recipient': message['To'],
diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py
index 066e6f51c61..ec2df55607c 100644
--- a/collector/scripts/watch/convert/as_pdf.py
+++ b/collector/scripts/watch/convert/as_pdf.py
@@ -51,10 +51,10 @@ def as_pdf(**kwargs):
       'id': guid(),
       'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
       'title': title if title else 'Untitled',
-      'author': author if author else 'Unknown',
+      'docAuthor': author if author else 'Unknown',
       'description': subject if subject else 'Unknown',
-      'document_source': 'pdf file uploaded by the user.',
-      'chunk_source': f"{filename}_pg{pg_num}{ext}",
+      'docSource': 'pdf file uploaded by the user.',
+      'chunkSource': f"{filename}_pg{pg_num}{ext}",
       'published': file_creation_time(fullpath),
       'wordCount': len(content),
       'pageContent': content,
diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py
index c96f0579b64..e0c05430689 100644
--- a/collector/scripts/watch/convert/as_text.py
+++ b/collector/scripts/watch/convert/as_text.py
@@ -16,8 +16,10 @@ def as_text(**kwargs):
   data = {
     'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-    'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'title': slugify(filename), # TODO: Find a better title
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
@@ -27,9 +29,10 @@ def as_text(**kwargs):
     'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
     'title': slugify(filename), # TODO: Find a better title
-    'author': 'Unknown', # TODO: Find a better author
+    'docAuthor': 'Unknown', # TODO: Find a better author
     'description': 'Unknown', # TODO: Find a better description
-    'document_source': "plain text file uploaded by the user.",
+    'docSource': "plain text file uploaded by the user.",
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
diff --git a/package.json b/package.json
index 12d46fe3e74..0133ef84e35 100644
--- a/package.json
+++ b/package.json
@@ -20,4 +20,4 @@
     "generate::gcp_deployment": "node cloud-deployments/gcp/deployment/generate.mjs"
   },
   "private": false
-}
\ No newline at end of file
+}

From 9cf196f80f247ff785c7a4fb2439a1d391723b56 Mon Sep 17 00:00:00 2001
From: Francisco Bischoff <franzbischoff@gmail.com>
Date: Mon, 11 Sep 2023 05:34:19 +0100
Subject: [PATCH 6/6] fixed unknown chars

---
 collector/scripts/watch/convert/as_pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py
index ec2df55607c..19aff4e4802 100644
--- a/collector/scripts/watch/convert/as_pdf.py
+++ b/collector/scripts/watch/convert/as_pdf.py
@@ -31,7 +31,7 @@ def as_pdf(**kwargs):
 
   # Best so fot, replace didn't understood '\n\s' so we need to do it in two steps
   for page in pages:
-    page.page_content = page.page_content.replace("\n ", " ").replace("  ", " ")
+    page.page_content = page.page_content.replace("\n ", " ").replace("\uFFFD", " ").replace("  ", " ")
 
   text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
   pages = text_splitter.split_documents(pages)