From ccf87e951f171e31a66d175df93d800e73ba13cd Mon Sep 17 00:00:00 2001 From: Ahren Stevens-Taylor Date: Thu, 11 Jan 2024 14:46:39 +0000 Subject: [PATCH 1/7] [FEAT]: Docker Tags specific to a build version #572 --- .github/workflows/build-and-push-image.yaml | 31 +++++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-and-push-image.yaml b/.github/workflows/build-and-push-image.yaml index 17ca5a9755e..70040fc6876 100644 --- a/.github/workflows/build-and-push-image.yaml +++ b/.github/workflows/build-and-push-image.yaml @@ -36,6 +36,19 @@ jobs: shell: bash run: echo "repo=${GITHUB_REPOSITORY,,}" >> $GITHUB_OUTPUT id: lowercase_repo + + - name: Check if DockerHub build needed + shell: bash + run: | + # Check if the secret for USERNAME is set (don't even check for the password) + if [[ -z "${{ secrets.DOCKER_USERNAME }}" ]]; then + echo "DockerHub build not needed" + echo "enabled=false" >> $GITHUB_OUTPUT + else + echo "DockerHub build needed" + echo "enabled=true" >> $GITHUB_OUTPUT + fi + id: dockerhub - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -45,6 +58,8 @@ jobs: - name: Log in to Docker Hub uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a + # Only log in to Docker Hub when the DOCKER_USERNAME secret is set (see the dockerhub step), so forks without Docker Hub credentials can still build and push to GHCR + if: steps.dockerhub.outputs.enabled == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} @@ -61,9 +76,16 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: | - mintplexlabs/anythingllm + ${{ steps.dockerhub.outputs.enabled == 'true' && steps.lowercase_repo.outputs.repo || '' }} ghcr.io/${{ github.repository }} - + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=sha + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + + - name: Build and push multi-platform Docker image uses: docker/build-push-action@v5 with: @@ -71,8 +93,5 
@@ jobs: file: ./docker/Dockerfile push: true platforms: linux/amd64,linux/arm64 - tags: | - ${{ steps.meta.outputs.tags }} - ${{ github.ref_name == 'master' && 'mintplexlabs/anythingllm:latest' || '' }} - ${{ github.ref_name == 'master' && format('ghcr.io/{0}:{1}', steps.lowercase_repo.outputs.repo, 'latest') || '' }} + tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file From 74190ae8367426cf940d220a865e73298c82ab0e Mon Sep 17 00:00:00 2001 From: Ahren Stevens-Taylor Date: Thu, 11 Jan 2024 14:46:39 +0000 Subject: [PATCH 2/7] fix: dockerhub repo name --- .github/workflows/build-and-push-image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-image.yaml b/.github/workflows/build-and-push-image.yaml index 70040fc6876..dbfd74f3512 100644 --- a/.github/workflows/build-and-push-image.yaml +++ b/.github/workflows/build-and-push-image.yaml @@ -76,7 +76,7 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: | - ${{ steps.dockerhub.outputs.enabled == 'true' && steps.lowercase_repo.outputs.repo || '' }} + ${{ steps.dockerhub.outputs.enabled == 'true' && 'mintplex/anythingllm' || '' }} ghcr.io/${{ github.repository }} tags: | type=raw,value=latest,enable={{is_default_branch}} From 8e56dd7dd54c1641210285294265922a09c433bb Mon Sep 17 00:00:00 2001 From: Ahren Stevens-Taylor Date: Thu, 11 Jan 2024 14:53:50 +0000 Subject: [PATCH 3/7] feat: add Docker build caches --- .github/workflows/build-and-push-image.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-image.yaml b/.github/workflows/build-and-push-image.yaml index dbfd74f3512..5eab8e07c02 100644 --- a/.github/workflows/build-and-push-image.yaml +++ b/.github/workflows/build-and-push-image.yaml @@ -94,4 +94,6 @@ jobs: push: true platforms: linux/amd64,linux/arm64 tags: ${{ steps.meta.outputs.tags }} - labels: ${{ 
steps.meta.outputs.labels }} \ No newline at end of file + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max \ No newline at end of file From 6dd683aa3137e5cc8917dd924836f023af213517 Mon Sep 17 00:00:00 2001 From: Ahren Stevens-Taylor Date: Thu, 11 Jan 2024 16:00:49 +0000 Subject: [PATCH 4/7] fix: docker username Fix the DockerHub repository owner name --- .github/workflows/build-and-push-image.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-push-image.yaml b/.github/workflows/build-and-push-image.yaml index 5eab8e07c02..03318320d2c 100644 --- a/.github/workflows/build-and-push-image.yaml +++ b/.github/workflows/build-and-push-image.yaml @@ -76,7 +76,7 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: | - ${{ steps.dockerhub.outputs.enabled == 'true' && 'mintplex/anythingllm' || '' }} + ${{ steps.dockerhub.outputs.enabled == 'true' && 'mintplexlabs/anythingllm' || '' }} ghcr.io/${{ github.repository }} tags: | type=raw,value=latest,enable={{is_default_branch}} @@ -96,4 +96,4 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha - cache-to: type=gha,mode=max \ No newline at end of file + cache-to: type=gha,mode=max From dbc28ee1515a234cec7edfbae5c5caf4dea0961f Mon Sep 17 00:00:00 2001 From: Matthew Moore Date: Thu, 11 Jan 2024 16:26:11 +0000 Subject: [PATCH 5/7] FEAT: Add doc return data --- collector/index.js | 4 ++-- .../processSingleFile/convert/asAudio.js | 4 +++- collector/processSingleFile/convert/asDocx.js | 12 ++++++---- collector/processSingleFile/convert/asMbox.js | 4 +++- .../processSingleFile/convert/asOfficeMime.js | 4 +++- collector/processSingleFile/convert/asPDF.js | 13 +++++++---- collector/processSingleFile/convert/asTxt.js | 5 +++- server/endpoints/api/document/index.js | 23 ++++++++++--------- 8 files changed, 42 insertions(+), 27 deletions(-) diff --git 
a/collector/index.js b/collector/index.js index 5070ae72f81..a8040d54a85 100644 --- a/collector/index.js +++ b/collector/index.js @@ -29,8 +29,8 @@ app.post("/process", async function (request, response) { const targetFilename = path .normalize(filename) .replace(/^(\.\.(\/|\\|$))+/, ""); - const { success, reason } = await processSingleFile(targetFilename); - response.status(200).json({ filename: targetFilename, success, reason }); + const {document, success, reason } = await processSingleFile(targetFilename); + response.status(200).json({ document: document, success, reason }); } catch (e) { console.error(e); response.status(200).json({ diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index a15207fba44..9a4f05c615f 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -60,12 +60,14 @@ async function asAudio({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; + const { pageContent, token_count_estimate, ...responseData } = data; + writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n` ); - return { success: true, reason: null }; + return { success: true, reason: null, document: data }; } async function convertToWavAudioData(sourcePath) { diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 7a64a042d17..87f0bf8fe00 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -12,22 +12,22 @@ async function asDocX({ fullFilePath = "", filename = "" }) { const loader = new DocxLoader(fullFilePath); console.log(`-- Working ${filename} --`); - let pageContent = []; + let docxPageContent = []; const docs = await loader.load(); for (const doc of docs) { 
console.log(doc.metadata); console.log(`-- Parsing content from docx page --`); if (!doc.pageContent.length) continue; - pageContent.push(doc.pageContent); + docxPageContent.push(doc.pageContent); } - if (!pageContent.length) { + if (!docxPageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); return { success: false, reason: `No text content found in ${filename}.` }; } - const content = pageContent.join(""); + const content = docxPageContent.join(""); const data = { id: v4(), url: "file://" + fullFilePath, @@ -42,10 +42,12 @@ async function asDocX({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; + const { pageContent, token_count_estimate, ...responseData } = data; + writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, document: data }; } module.exports = asDocX; diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 30883f21b12..260687a3072 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -55,11 +55,13 @@ async function asMbox({ fullFilePath = "", filename = "" }) { writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`); } + const { pageContent, token_count_estimate, ...responseData } = data; + trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} messages converted & ready for embedding.\n` ); - return { success: true, reason: null }; + return { success: true, reason: null, document: data }; } module.exports = asMbox; diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index a6eb0351a78..09ed21c72a7 100644 --- 
a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -37,10 +37,12 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; + const { pageContent, token_count_estimate, ...responseData } = data; + writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, document: data }; } module.exports = asOfficeMime; diff --git a/collector/processSingleFile/convert/asPDF.js b/collector/processSingleFile/convert/asPDF.js index f6d869d5c48..a2cacb228df 100644 --- a/collector/processSingleFile/convert/asPDF.js +++ b/collector/processSingleFile/convert/asPDF.js @@ -14,7 +14,7 @@ async function asPDF({ fullFilePath = "", filename = "" }) { }); console.log(`-- Working ${filename} --`); - const pageContent = []; + const pdfPageContent = []; const docs = await pdfLoader.load(); for (const doc of docs) { console.log( @@ -23,16 +23,16 @@ async function asPDF({ fullFilePath = "", filename = "" }) { } --` ); if (!doc.pageContent.length) continue; - pageContent.push(doc.pageContent); + pdfPageContent.push(doc.pageContent); } - if (!pageContent.length) { + if (!pdfPageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); return { success: false, reason: `No text content found in ${filename}.` }; } - const content = pageContent.join(""); + const content = pdfPageContent.join(""); const data = { id: v4(), url: "file://" + fullFilePath, @@ -47,10 +47,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; + const { pageContent, token_count_estimate, ...responseData } = data; + writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); 
trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + + return { success: true, reason: null, document: responseData }; } module.exports = asPDF; diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index ad35e54762c..53df6b47969 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -37,10 +37,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; + const { pageContent, token_count_estimate, ...responseData } = data; + writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + + return { success: true, reason: null, document: responseData}; } module.exports = asTxt; diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index a813e2df6d0..54dde363695 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -17,7 +17,7 @@ function apiDocumentEndpoints(app) { [validApiKey], handleUploads.single("file"), async (request, response) => { - /* + /* #swagger.tags = ['Documents'] #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.' 
@@ -49,9 +49,9 @@ function apiDocumentEndpoints(app) { error: null, } } - } + } } - } + } #swagger.responses[403] = { schema: { "$ref": "#/definitions/InvalidAPIKey" @@ -72,16 +72,17 @@ function apiDocumentEndpoints(app) { .end(); } - const { success, reason } = await processDocument(originalname); + const { document, success, reason } = await processDocument(originalname); if (!success) { response.status(500).json({ success: false, error: reason }).end(); } + console.log( `Document ${originalname} uploaded processed and successfully. It is now available in documents.` ); await Telemetry.sendTelemetry("document_uploaded"); - response.status(200).json({ success: true, error: null }); + response.status(200).json({ success: success, error: null, document: document}); } catch (e) { console.log(e.message, e); response.sendStatus(500).end(); @@ -90,7 +91,7 @@ function apiDocumentEndpoints(app) { ); app.get("/v1/documents", [validApiKey], async (_, response) => { - /* + /* #swagger.tags = ['Documents'] #swagger.description = 'List of all locally-stored documents in instance' #swagger.responses[200] = { @@ -115,9 +116,9 @@ function apiDocumentEndpoints(app) { } } } - } + } } - } + } #swagger.responses[403] = { schema: { "$ref": "#/definitions/InvalidAPIKey" @@ -137,7 +138,7 @@ function apiDocumentEndpoints(app) { "/v1/document/accepted-file-types", [validApiKey], async (_, response) => { - /* + /* #swagger.tags = ['Documents'] #swagger.description = 'Check available filetypes and MIMEs that can be uploaded.' 
#swagger.responses[200] = { @@ -166,9 +167,9 @@ function apiDocumentEndpoints(app) { } } } - } + } } - } + } #swagger.responses[403] = { schema: { "$ref": "#/definitions/InvalidAPIKey" From c0980db94b585c121f556a6a6004028b45c00334 Mon Sep 17 00:00:00 2001 From: Matthew Moore Date: Thu, 11 Jan 2024 16:32:55 +0000 Subject: [PATCH 6/7] FEAT: Added swagger doc updates --- server/swagger/openapi.json | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 184723ed7e5..04819f22689 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -845,7 +845,18 @@ "type": "object", "example": { "success": true, - "error": null + "error": null, + "document": { + "id": "115f2bab-957b-42e7-b5d0-16cac2379bce", + "url": "file:///home/user/Workspace/anything-llm/collector/hotdir/file.txt", + "title": "file.txt", + "docAuthor": "Unknown", + "description": "Unknown", + "docSource": "a text file uploaded by the user.", + "chunkSource": "Bfile.txt", + "published": "11/01/2024, 16:25:09", + "wordCount": 17653 + } } } } From 4c70d0de2733fcd0ee43a8659437e644b924c33b Mon Sep 17 00:00:00 2001 From: Matthew Moore Date: Thu, 11 Jan 2024 16:34:31 +0000 Subject: [PATCH 7/7] FEAT: Document endpoint swagger doc updates --- server/endpoints/api/document/index.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index 54dde363695..94e80896b10 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -47,6 +47,17 @@ function apiDocumentEndpoints(app) { example: { success: true, error: null, + "document": { + "id": "115f2bab-957b-42e7-b5d0-16cac2379bce", + "url": "file:///home/user/Workspace/anything-llm/collector/hotdir/file.txt", + "title": "file.txt", + "docAuthor": "Unknown", + "description": "Unknown", + "docSource": "a text file uploaded by the user.", + 
"chunkSource": "Bfile.txt", + "published": "11/01/2024, 16:25:09", + "wordCount": 17653 + } } } }