From 61cbae52b079568eb4848debebcdd270a4cf5514 Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Thu, 10 Aug 2023 03:32:54 +0100 Subject: [PATCH 1/6] cosmetic changes to be compatible to hadolint --- docker/Dockerfile | 55 ++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1625263ddb3..d9013fefb1a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -5,30 +5,33 @@ FROM ubuntu:jammy-20230522 AS base ARG ARG_UID ARG ARG_GID +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + # Install system dependencies +# hadolint ignore=DL3008,DL3013 RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \ - curl libgfortran5 python3 python3-pip tzdata netcat \ - libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \ - libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \ - libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \ - libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \ - xdg-utils && \ + curl libgfortran5 python3 python3-pip tzdata netcat \ + libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 \ + libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libx11-6 libx11-xcb1 libxcb1 \ + libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \ + libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release \ + xdg-utils && \ curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \ apt-get install -yq --no-install-recommends nodejs && \ curl -LO https://github.com/yarnpkg/yarn/releases/download/v1.22.19/yarn_1.22.19_all.deb \ - && dpkg -i yarn_1.22.19_all.deb \ - && rm yarn_1.22.19_all.deb && \ + && dpkg -i yarn_1.22.19_all.deb \ + && rm yarn_1.22.19_all.deb && 
\ curl -LO https://github.com/jgm/pandoc/releases/download/3.1.3/pandoc-3.1.3-1-amd64.deb \ - && dpkg -i pandoc-3.1.3-1-amd64.deb \ - && rm pandoc-3.1.3-1-amd64.deb && \ + && dpkg -i pandoc-3.1.3-1-amd64.deb \ + && rm pandoc-3.1.3-1-amd64.deb && \ rm -rf /var/lib/apt/lists/* /usr/share/icons && \ dpkg-reconfigure -f noninteractive tzdata && \ python3 -m pip install --no-cache-dir virtualenv # Create a group and user with specific UID and GID RUN groupadd -g $ARG_GID anythingllm && \ - useradd -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \ + useradd -l -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \ mkdir -p /app/frontend/ /app/server/ /app/collector/ && chown -R anythingllm:anythingllm /app # Copy docker helper scripts @@ -36,8 +39,8 @@ COPY ./docker/docker-entrypoint.sh /usr/local/bin/ COPY ./docker/docker-healthcheck.sh /usr/local/bin/ # Ensure the scripts are executable -RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \ - chmod +x /usr/local/bin/docker-healthcheck.sh +RUN chmod +x /usr/local/bin/docker-entrypoint.sh \ + && chmod +x /usr/local/bin/docker-healthcheck.sh USER anythingllm @@ -47,19 +50,25 @@ WORKDIR /app FROM base as frontend-deps COPY ./frontend/package.json ./frontend/yarn.lock ./frontend/ -RUN cd ./frontend/ && yarn install && yarn cache clean +WORKDIR /app/frontend +RUN yarn install && yarn cache clean +WORKDIR /app # Install server dependencies FROM base as server-deps COPY ./server/package.json ./server/yarn.lock ./server/ -RUN cd ./server/ && yarn install --production && yarn cache clean && \ +WORKDIR /app/server +RUN yarn install --production && yarn cache clean && \ rm /app/server/node_modules/vectordb/x86_64-apple-darwin.node && \ rm /app/server/node_modules/vectordb/aarch64-apple-darwin.node +WORKDIR /app # Build the frontend FROM frontend-deps as build-stage COPY ./frontend/ ./frontend/ -RUN cd ./frontend/ && yarn build && yarn cache clean +WORKDIR /app/frontend +RUN yarn build && yarn cache 
clean +WORKDIR /app # Setup the server FROM server-deps as production-stage @@ -72,10 +81,12 @@ COPY --from=build-stage /app/frontend/dist ./server/public COPY ./collector/ ./collector/ # Install collector dependencies -RUN cd /app/collector && \ - python3 -m virtualenv v-env && \ - . v-env/bin/activate && \ - pip install --no-cache-dir -r requirements.txt +WORKDIR /app/collector +# hadolint ignore=SC1091 +RUN python3 -m virtualenv v-env \ + && source /app/collector/v-env/bin/activate \ + && pip install --no-cache-dir -r requirements.txt +WORKDIR /app # Setup the environment ENV NODE_ENV=production @@ -86,7 +97,7 @@ EXPOSE 3001 # Setup the healthcheck HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \ - CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1 + CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1 # Run the server -ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"] From 0bd1cfe8b306469f6764b26202191b2cfab8a8bc Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Thu, 10 Aug 2023 03:34:06 +0100 Subject: [PATCH 2/6] common configuration for most editors until better plugins comes up --- .editorconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000..5d47c21c4e1 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true From f767989eb75688b4990475a6233db3e985560f19 Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Thu, 10 Aug 2023 03:36:14 +0100 Subject: [PATCH 3/6] Changes on PDF metadata, using PyMuPDF (faster and more compatible) --- collector/requirements.txt | 6 ++-- 
collector/scripts/watch/convert/as_pdf.py | 44 ++++++++++++++++++----- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/collector/requirements.txt b/collector/requirements.txt index 7fc4f05d390..2bb3b71d128 100644 --- a/collector/requirements.txt +++ b/collector/requirements.txt @@ -54,6 +54,7 @@ mypy-extensions==1.0.0 nltk==3.8.1 numexpr==2.8.4 numpy==1.23.5 +oauthlib==3.2.2 olefile==0.46 openapi-schema-pydantic==1.2.4 openpyxl==3.1.2 @@ -68,8 +69,8 @@ pycparser==2.21 pydantic==1.10.8 pyee==8.2.2 Pygments==2.15.1 +PyMuPDF==1.22.5 pypandoc==1.4 -pypdf==3.9.0 pyppeteer==1.0.2 pyquery==2.0.0 python-dateutil==2.8.2 @@ -83,6 +84,7 @@ PyYAML==6.0 regex==2023.5.5 requests==2.31.0 requests-html==0.10.0 +requests-oauthlib==1.3.1 rfc3986==1.5.0 rich==13.0.1 six==1.16.0 @@ -94,6 +96,7 @@ tenacity==8.2.2 text-unidecode==1.3 tiktoken==0.4.0 tqdm==4.65.0 +tweepy==4.14.0 typer==0.9.0 typing-inspect==0.9.0 typing_extensions==4.6.3 @@ -110,4 +113,3 @@ XlsxWriter==3.1.2 yarl==1.9.2 youtube-transcript-api==0.6.0 zipp==3.15.0 -tweepy==4.14.0 diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index 12163cf2619..066e6f51c61 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -1,9 +1,12 @@ -import os, time -from langchain.document_loaders import PyPDFLoader +import os, time, fitz +from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata +from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter from slugify import slugify from ..utils import guid, file_creation_time, write_to_server_documents, move_source from ...utils import tokenize + + # Process all text-related documents. 
def as_pdf(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') @@ -13,8 +16,26 @@ def as_pdf(**kwargs): fullpath = f"{parent_dir}/{filename}{ext}" destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}" - loader = PyPDFLoader(fullpath) - pages = loader.load_and_split() + loader = PyMuPDFLoader(fullpath) + + # Custom flags for PyMuPDFLoader. https://pymupdf.readthedocs.io/en/latest/app1.html#text-extraction-flags-defaults + mu_flags = (fitz.TEXT_PRESERVE_WHITESPACE + | fitz.TEXT_PRESERVE_LIGATURES + | fitz.TEXT_MEDIABOX_CLIP + | fitz.TEXT_DEHYPHENATE) & ~fitz.TEXT_PRESERVE_SPANS & ~fitz.TEXT_PRESERVE_IMAGES + + pages = loader.load(flags=mu_flags) + + # The only thing PyMuPDFLoader does not have a flag for is removing all line breaks. + # Compared with PyPDF, to achieve the same result we need to add a space where there is '\n\s' and remove double spaces + + # Best so fot, replace didn't understood '\n\s' so we need to do it in two steps + for page in pages: + page.page_content = page.page_content.replace("\n ", " ").replace(" ", " ") + + text_splitter: TextSplitter = RecursiveCharacterTextSplitter() + pages = text_splitter.split_documents(pages) + print(f"-- Working {fullpath} --") for page in pages: @@ -22,11 +43,18 @@ def as_pdf(**kwargs): print(f"-- Working page {pg_num} --") content = page.page_content + title = page.metadata.get('title') + author = page.metadata.get('author') + subject = page.metadata.get('subject') + data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), - 'title': f"{filename}_pg{pg_num}{ext}", - 'description': "a custom file uploaded by the user.", + 'title': title if title else 'Untitled', + 'author': author if author else 'Unknown', + 'description': subject if subject else 'Unknown', + 'document_source': 'pdf file uploaded by the user.', + 'chunk_source': f"{filename}_pg{pg_num}{ext}", 'published': file_creation_time(fullpath), 'wordCount': 
len(content), 'pageContent': content, @@ -35,4 +63,4 @@ def as_pdf(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination) move_source(parent_dir, f"{filename}{ext}", remove=remove) - print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") From b7d268cbd2908372ffe529044e7ae1e8bfa4f64d Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Thu, 10 Aug 2023 03:37:01 +0100 Subject: [PATCH 4/6] small changes on other file ingestions in order to try to keep the fields equal --- .../scripts/watch/convert/as_markdown.py | 13 ++++++++----- collector/scripts/watch/convert/as_mbox.py | 12 +++++++----- collector/scripts/watch/convert/as_text.py | 19 ++++++++++++++++--- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py index 49cf538c0de..3bd7270c62c 100644 --- a/collector/scripts/watch/convert/as_markdown.py +++ b/collector/scripts/watch/convert/as_markdown.py @@ -18,16 +18,19 @@ def as_markdown(**kwargs): print(f"-- Working {fullpath} --") data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), - 'title': f"{filename}{ext}", - 'description': "a custom file uploaded by the user.", + 'title': f"{filename}", # TODO: find a better metadata + 'author': "Unknown", # TODO: find a better metadata + 'description': "Unknown", # TODO: find a better metadata + 'document_source': "markdown file uploaded by the user.", + 'chuck_source': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, 'token_count_estimate': len(tokenize(content)) } - + write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) - print(f"[SUCCESS]: 
{filename}{ext} converted & ready for embedding.\n") \ No newline at end of file + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py index 0fa17985475..4e5ed91470b 100644 --- a/collector/scripts/watch/convert/as_mbox.py +++ b/collector/scripts/watch/convert/as_mbox.py @@ -1,5 +1,5 @@ import os -import datetime +import datetime import email.utils from mailbox import mbox from slugify import slugify @@ -36,12 +36,14 @@ def as_mbox(**kwargs): date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S") else: date_sent = None - + data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"), - 'title': f"{filename}{ext}", - 'description': "a custom file uploaded by the user.", + 'title': message['Subject'], + 'author': message['From'], + 'description': f"email {message['From']} to {message['To']}", + 'document_source': "mbox file uploaded by the user.", 'published': file_creation_time(fullpath), 'sender': message['From'], 'recipient': message['To'], diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py index a9935b48ae9..c96f0579b64 100644 --- a/collector/scripts/watch/convert/as_text.py +++ b/collector/scripts/watch/convert/as_text.py @@ -14,7 +14,7 @@ def as_text(**kwargs): print(f"-- Working {fullpath} --") data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': f"{filename}{ext}", 'description': "a custom file uploaded by the user.", @@ -23,7 +23,20 @@ def as_text(**kwargs): 'pageContent': content, 'token_count_estimate': len(tokenize(content)) } - + data = { + 'id': guid(), + 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"), + 'title': slugify(filename), # TODO: Find a better title + 'author': 'Unknown', # TODO: 
Find a better author + 'description': 'Unknown', # TODO: Find a better description + 'document_source': "plain text file uploaded by the user.", + 'published': file_creation_time(fullpath), + 'wordCount': len(content), + 'pageContent': content, + 'token_count_estimate': len(tokenize(content)) + } + + write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) - print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") From 292bb563bf09f07cee78e0e93b9b5560b004c8f5 Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Thu, 10 Aug 2023 11:36:32 +0100 Subject: [PATCH 5/6] Lint, review, and review --- clean.sh | 0 collector/scripts/gitbook.py | 8 ++++---- collector/scripts/watch/convert/as_docx.py | 20 ++++++++++++------- .../scripts/watch/convert/as_markdown.py | 8 ++++---- collector/scripts/watch/convert/as_mbox.py | 4 ++-- collector/scripts/watch/convert/as_pdf.py | 6 +++--- collector/scripts/watch/convert/as_text.py | 11 ++++++---- package.json | 2 +- 8 files changed, 34 insertions(+), 25 deletions(-) mode change 100644 => 100755 clean.sh diff --git a/clean.sh b/clean.sh old mode 100644 new mode 100755 diff --git a/collector/scripts/gitbook.py b/collector/scripts/gitbook.py index 76da8050553..98625bf8983 100644 --- a/collector/scripts/gitbook.py +++ b/collector/scripts/gitbook.py @@ -29,10 +29,10 @@ def gitbook(): data = { 'id': str(uuid4()), 'url': metadata.get('source'), - "title": metadata.get('title'), - "description": metadata.get('title'), - "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'), - "wordCount": len(content), + 'title': metadata.get('title'), + 'description': metadata.get('title'), + 'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'), + 'wordCount': len(content), 'pageContent': content, 'token_count_estimate': len(tokenize(content)) } diff 
--git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py index ade70e579f5..6d16650e3e5 100644 --- a/collector/scripts/watch/convert/as_docx.py +++ b/collector/scripts/watch/convert/as_docx.py @@ -18,16 +18,19 @@ def as_docx(**kwargs): print(f"-- Working {fullpath} --") data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': f"{filename}{ext}", - 'description': "a custom file uploaded by the user.", + 'docAuthor': 'Unknown', # TODO: Find a better author + 'description': 'Unknown', # TODO: Find a better description + 'docSource': 'Docx Text file uploaded by the user.', + 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, 'token_count_estimate': len(tokenize(content)) } - + write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") @@ -45,16 +48,19 @@ def as_odt(**kwargs): print(f"-- Working {fullpath} --") data = { - 'id': guid(), + 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': f"{filename}{ext}", - 'description': "a custom file uploaded by the user.", + 'author': 'Unknown', # TODO: Find a better author + 'description': 'Unknown', # TODO: Find a better description + 'docSource': 'ODT Text file uploaded by the user.', + 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, 'token_count_estimate': len(tokenize(content)) } - + write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) - print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file + print(f"[SUCCESS]: {filename}{ext} converted & ready for 
embedding.\n") diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py index 3bd7270c62c..3e1a3dba1a7 100644 --- a/collector/scripts/watch/convert/as_markdown.py +++ b/collector/scripts/watch/convert/as_markdown.py @@ -21,10 +21,10 @@ def as_markdown(**kwargs): 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': f"{filename}", # TODO: find a better metadata - 'author': "Unknown", # TODO: find a better metadata - 'description': "Unknown", # TODO: find a better metadata - 'document_source': "markdown file uploaded by the user.", - 'chuck_source': f"{filename}{ext}", + 'docAuthor': 'Unknown', # TODO: find a better metadata + 'description': 'Unknown', # TODO: find a better metadata + 'docSource': 'markdown file uploaded by the user.', + 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py index 4e5ed91470b..96c9b25bb55 100644 --- a/collector/scripts/watch/convert/as_mbox.py +++ b/collector/scripts/watch/convert/as_mbox.py @@ -41,9 +41,9 @@ def as_mbox(**kwargs): 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"), 'title': message['Subject'], - 'author': message['From'], + 'docAuthor': message['From'], 'description': f"email {message['From']} to {message['To']}", - 'document_source': "mbox file uploaded by the user.", + 'docSource': "mbox file uploaded by the user.", 'published': file_creation_time(fullpath), 'sender': message['From'], 'recipient': message['To'], diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index 066e6f51c61..ec2df55607c 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -51,10 +51,10 @@ def as_pdf(**kwargs): 'id': 
guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'title': title if title else 'Untitled', - 'author': author if author else 'Unknown', + 'docAuthor': author if author else 'Unknown', 'description': subject if subject else 'Unknown', - 'document_source': 'pdf file uploaded by the user.', - 'chunk_source': f"{filename}_pg{pg_num}{ext}", + 'docSource': 'pdf file uploaded by the user.', + 'chunkSource': f"{filename}_pg{pg_num}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py index c96f0579b64..e0c05430689 100644 --- a/collector/scripts/watch/convert/as_text.py +++ b/collector/scripts/watch/convert/as_text.py @@ -16,8 +16,10 @@ def as_text(**kwargs): data = { 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), - 'title': f"{filename}{ext}", - 'description': "a custom file uploaded by the user.", + 'title': slugify(filename), # TODO: Find a better title + 'docAuthor': 'Unknown', # TODO: Find a better author + 'description': 'Unknown', # TODO: Find a better description + 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, @@ -27,9 +29,10 @@ def as_text(**kwargs): 'id': guid(), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"), 'title': slugify(filename), # TODO: Find a better title - 'author': 'Unknown', # TODO: Find a better author + 'docAuthor': 'Unknown', # TODO: Find a better author 'description': 'Unknown', # TODO: Find a better description - 'document_source': "plain text file uploaded by the user.", + 'docSource': "plain text file uploaded by the user.", + 'chunkSource': f"{filename}{ext}", 'published': file_creation_time(fullpath), 'wordCount': len(content), 'pageContent': content, diff --git a/package.json b/package.json 
index 12d46fe3e74..0133ef84e35 100644 --- a/package.json +++ b/package.json @@ -20,4 +20,4 @@ "generate::gcp_deployment": "node cloud-deployments/gcp/deployment/generate.mjs" }, "private": false -} \ No newline at end of file +} From 9cf196f80f247ff785c7a4fb2439a1d391723b56 Mon Sep 17 00:00:00 2001 From: Francisco Bischoff Date: Mon, 11 Sep 2023 05:34:19 +0100 Subject: [PATCH 6/6] fixed unknown chars --- collector/scripts/watch/convert/as_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index ec2df55607c..19aff4e4802 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -31,7 +31,7 @@ def as_pdf(**kwargs): # Best so fot, replace didn't understood '\n\s' so we need to do it in two steps for page in pages: - page.page_content = page.page_content.replace("\n ", " ").replace(" ", " ") + page.page_content = page.page_content.replace("\n ", " ").replace("\uFFFD", " ").replace(" ", " ") text_splitter: TextSplitter = RecursiveCharacterTextSplitter() pages = text_splitter.split_documents(pages)