diff --git a/collector/extensions/index.js b/collector/extensions/index.js
index a88b38eee59..9cf062d7a13 100644
--- a/collector/extensions/index.js
+++ b/collector/extensions/index.js
@@ -54,6 +54,34 @@ function extensions(app) {
     }
   );
 
+  // loads all files of a GitLab repo into server document storage
+  app.post(
+    "/ext/gitlab-repo",
+    [verifyPayloadIntegrity, setDataSigner],
+    async function (request, response) {
+      try {
+        const { loadGitlabRepo } = require("../utils/extensions/GitlabRepo");
+        const { success, reason, data } = await loadGitlabRepo(
+          reqBody(request),
+          response,
+        );
+        response.status(200).json({
+          success,
+          reason,
+          data,
+        });
+      } catch (e) {
+        console.error(e);
+        response.status(200).json({
+          success: false,
+          reason: e.message || "A processing error occurred.",
+          data: {},
+        });
+      }
+      return;
+    }
+  );
+
   // gets all branches for a specific repo
   app.post(
     "/ext/github-repo/branches",
@@ -85,6 +113,37 @@ function extensions(app) {
     }
   );
 
+  // gets all branches for a specific GitLab repo
+  app.post(
+    "/ext/gitlab-repo/branches",
+    [verifyPayloadIntegrity],
+    async function (request, response) {
+      try {
+        const GitlabRepoLoader = require("../utils/extensions/GitlabRepo/RepoLoader");
+        const allBranches = await new GitlabRepoLoader(
+          reqBody(request)
+        ).getRepoBranches();
+        response.status(200).json({
+          success: true,
+          reason: null,
+          data: {
+            branches: allBranches,
+          },
+        });
+      } catch (e) {
+        console.error(e);
+        response.status(400).json({
+          success: false,
+          reason: e.message,
+          data: {
+            branches: [],
+          },
+        });
+      }
+      return;
+    }
+  );
+
   app.post(
     "/ext/youtube-transcript",
     [verifyPayloadIntegrity],
diff --git a/collector/utils/extensions/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/GitlabRepo/RepoLoader/index.js
new file mode 100644
index 00000000000..35b9b9998e1
--- /dev/null
+++ b/collector/utils/extensions/GitlabRepo/RepoLoader/index.js
@@ -0,0 +1,229 @@
+const https = require("https");
+const UrlPattern = require("url-pattern");
+
+/**
+ * Talks to the gitlab.com REST API (v4) to list a project's branches, walk
+ * its file tree and download raw file contents.
+ */
+class RepoLoader {
+  /**
+   * @param {object} args - forwarded request body params
+   * @param {string} args.repo - project URL (https://gitlab.com/group/project)
+   * @param {string} [args.branch] - branch to read from
+   * @param {string|null} [args.accessToken] - GitLab personal access token
+   * @param {string[]} [args.ignorePaths] - skip files whose path contains an entry
+   */
+  constructor(args = {}) {
+    this.ready = false;
+    this.repo = args?.repo;
+    this.branch = args?.branch;
+    this.accessToken = args?.accessToken || null;
+    this.ignorePaths = args?.ignorePaths || [];
+
+    this.projectId = null;
+    this.branches = [];
+  }
+
+  // Extracts the project path from this.repo and stores it URL-encoded; that
+  // encoded path is what every API URL below uses as the project id.
+  #validGitlabUrl() {
+    const pattern = new UrlPattern("https\\://gitlab.com/(:projectId(*))", {
+      segmentValueCharset: "a-zA-Z0-9-._~%/+",
+    });
+    const match = pattern.match(this.repo);
+    // The project segment is optional in the pattern, so a bare
+    // https://gitlab.com/ would match without a projectId - reject that too.
+    if (!match?.projectId) return false;
+
+    this.projectId = encodeURIComponent(match.projectId);
+    return true;
+  }
+
+  // Keeps the requested branch if the remote has it; otherwise falls back to
+  // main, then master, then the first branch the API reported.
+  async #validBranch() {
+    await this.getRepoBranches();
+    if (!!this.branch && this.branches.includes(this.branch)) return;
+
+    console.log(
+      "[Gitlab Loader]: Branch not set! Auto-assigning to a default branch."
+    );
+    this.branch =
+      ["main", "master"].find((name) => this.branches.includes(name)) ??
+      this.branches[0] ??
+      "master";
+    console.log(`[Gitlab Loader]: Branch auto-assigned to ${this.branch}.`);
+    return;
+  }
+
+  // Probes /user with the token; a failing token is dropped so that later
+  // requests are sent without it.
+  async #validateAccessToken() {
+    if (!this.accessToken) return;
+    try {
+      await this.#makeRequest("https://gitlab.com/api/v4/user");
+    } catch (e) {
+      console.error(
+        "Invalid Gitlab Access Token provided! Access token will not be used",
+        e.message
+      );
+      this.accessToken = null;
+    }
+  }
+
+  /**
+   * Validates the URL, resolves the branch and checks the token.
+   * @returns {Promise} resolves to this, or to undefined for an invalid URL
+   */
+  async init() {
+    if (!this.#validGitlabUrl()) return;
+    await this.#validBranch();
+    await this.#validateAccessToken();
+    this.ready = true;
+    return this;
+  }
+
+  /**
+   * Downloads every non-ignored file of the selected branch.
+   * @returns {Promise} resolves to an array of { pageContent, metadata: { source } }
+   * @throws {Error} when init() has not completed successfully
+   */
+  async recursiveLoader() {
+    if (!this.ready) throw new Error("[Gitlab Loader]: not in ready state!");
+
+    if (this.accessToken)
+      console.log(
+        `[Gitlab Loader]: Access token set! Recursive loading enabled!`
+      );
+
+    const files = await this.getRepositoryTree();
+    const docs = [];
+
+    for (const file of files) {
+      if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
+      const content = await this.fetchSingleFile(file.path);
+      if (content) {
+        docs.push({
+          pageContent: content,
+          metadata: { source: file.path },
+        });
+      }
+    }
+
+    return docs;
+  }
+
+  // Lists main and master first (in that order), then the rest as received.
+  #branchPrefSort(branches = []) {
+    const preferredSort = ["main", "master"];
+    return [
+      ...preferredSort.filter((branch) => branches.includes(branch)),
+      ...branches.filter((branch) => !preferredSort.includes(branch)),
+    ];
+  }
+
+  /**
+   * Fetches all branch names of the project and caches them on this.branches.
+   * @returns {Promise} resolves to branch names, [] on a bad URL or API error
+   */
+  async getRepoBranches() {
+    if (!this.#validGitlabUrl() || !this.projectId) return [];
+    await this.#validateAccessToken();
+
+    try {
+      const branches = await this.#requestAllPages(
+        `https://gitlab.com/api/v4/projects/${this.projectId}/repository/branches`
+      );
+      this.branches = branches.map((branch) => branch.name);
+      return this.#branchPrefSort(this.branches);
+    } catch (err) {
+      console.log(`RepoLoader.branches`, err);
+      return [];
+    }
+  }
+
+  /**
+   * Lists every file (blob) of the selected branch, recursing into folders.
+   * @returns {Promise} resolves to tree entries of type "blob", [] on error
+   */
+  async getRepositoryTree() {
+    try {
+      const ref = encodeURIComponent(this.branch);
+      const entries = await this.#requestAllPages(
+        `https://gitlab.com/api/v4/projects/${this.projectId}/repository/tree?ref=${ref}&recursive=true`
+      );
+      return entries.filter((item) => item.type === "blob");
+    } catch (e) {
+      console.error(`RepoLoader.getRepositoryTree`, e);
+      return [];
+    }
+  }
+
+  /**
+   * Downloads the raw content of one file from the selected branch.
+   * @param {string} sourceFilePath - repo-relative file path
+   * @returns {Promise} resolves to the file content, or null when the request fails
+   */
+  async fetchSingleFile(sourceFilePath) {
+    try {
+      const data = await this.#makeRequest(
+        `https://gitlab.com/api/v4/projects/${
+          this.projectId
+        }/repository/files/${encodeURIComponent(
+          sourceFilePath
+        )}/raw?ref=${encodeURIComponent(this.branch)}`
+      );
+      return data;
+    } catch (e) {
+      console.error(`RepoLoader.fetchSingleFile`, e);
+      return null;
+    }
+  }
+
+  // List endpoints return at most 100 items per page, so keep asking for the
+  // next page until one comes back short.
+  async #requestAllPages(url, perPage = 100) {
+    const separator = url.includes("?") ? "&" : "?";
+    const items = [];
+    for (let page = 1; ; page++) {
+      const data = await this.#makeRequest(
+        `${url}${separator}per_page=${perPage}&page=${page}`
+      );
+      const pageItems = JSON.parse(data);
+      if (!Array.isArray(pageItems)) break;
+      items.push(...pageItems);
+      if (pageItems.length < perPage) break;
+    }
+    return items;
+  }
+
+  // GETs a URL (sending PRIVATE-TOKEN when a token is set) and resolves with
+  // the body text; rejects on request errors and non-2xx statuses.
+  #makeRequest(url) {
+    return new Promise((resolve, reject) => {
+      const options = {
+        headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
+      };
+
+      https
+        .get(url, options, (res) => {
+          // Decode once at the end: stringifying each chunk separately mangles
+          // multi-byte UTF-8 characters that straddle a chunk boundary.
+          const chunks = [];
+          res.on("data", (chunk) => chunks.push(chunk));
+          res.on("end", () => {
+            if (res.statusCode >= 200 && res.statusCode < 300) {
+              resolve(Buffer.concat(chunks).toString("utf8"));
+            } else {
+              reject(
+                new Error(`Request failed with status code ${res.statusCode}`)
+              );
+            }
+          });
+        })
+        .on("error", reject);
+    });
+  }
+}
+
+module.exports = RepoLoader;
diff --git a/collector/utils/extensions/GitlabRepo/index.js b/collector/utils/extensions/GitlabRepo/index.js
new file mode 100644
index 00000000000..2ba9a24d4bb
--- /dev/null
+++ b/collector/utils/extensions/GitlabRepo/index.js
@@ -0,0 +1,163 @@
+const RepoLoader = require("./RepoLoader");
+const fs = require("fs");
+const path = require("path");
+const { default: slugify } = require("slugify");
+const { v4 } = require("uuid");
+const { writeToServerDocuments } = require("../../files");
+const { tokenizeString } = require("../../tokenizer");
+
+/**
+ * Load every file of a Gitlab repo branch and save each one as a server document
+ * @param {object} args - forwarded request body params
+ * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
+ * @returns {Promise} resolves to { success, reason, data } - data is only set on success
+ */
+async function loadGitlabRepo(args, response) {
+  const repo = new RepoLoader(args);
+  await repo.init();
+
+  if (!repo.ready)
+    return {
+      success: false,
+      reason: "Could not prepare Gitlab repo for loading! Check URL",
+    };
+  const repoName = repo.repo.split("/").pop();
+  const repoAuthor = repo.repo.split("/").slice(-2)[0];
+  console.log(`-- Working GitLab ${repoName}:${repo.branch} --`);
+  const docs = await repo.recursiveLoader();
+  if (!docs.length) {
+    return {
+      success: false,
+      reason: "No files were found for those settings.",
+    };
+  }
+
+  console.log(`[GitLab Loader]: Found ${docs.length} source files. Saving...`);
+  const outFolder = slugify(
+    `${repoAuthor}-${repoName}-${repo.branch}-${v4().slice(0, 4)}`
+  ).toLowerCase();
+
+  const outFolderPath =
+    process.env.NODE_ENV === "development"
+      ? path.resolve(
+          __dirname,
+          `../../../../server/storage/documents/${outFolder}`
+        )
+      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
+
+  if (!fs.existsSync(outFolderPath))
+    fs.mkdirSync(outFolderPath, { recursive: true });
+
+  for (const doc of docs) {
+    if (!doc.pageContent) continue;
+    const data = {
+      id: v4(),
+      url: "gitlab://" + doc.metadata.source,
+      title: doc.metadata.source,
+      docAuthor: repoName,
+      description: "No description found.",
+      docSource: doc.metadata.source,
+      chunkSource: generateChunkSource(
+        repo,
+        doc,
+        response.locals.encryptionWorker
+      ),
+      published: new Date().toLocaleString(),
+      wordCount: doc.pageContent.split(" ").length,
+      pageContent: doc.pageContent,
+      token_count_estimate: tokenizeString(doc.pageContent).length,
+    };
+    console.log(
+      `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
+    );
+    writeToServerDocuments(
+      data,
+      `${slugify(doc.metadata.source)}-${data.id}`,
+      outFolderPath
+    );
+  }
+
+  return {
+    success: true,
+    reason: null,
+    data: {
+      // author and repo are what the connector UI prints in its success toast
+      author: repoAuthor,
+      repo: repoName,
+      projectId: repo.projectId,
+      branch: repo.branch,
+      files: docs.length,
+      destination: outFolder,
+    },
+  };
+}
+
+/**
+ * Fetch the content of a single file from a Gitlab repo
+ * @param {object} params
+ * @param {string} params.repoUrl - full Gitlab project URL
+ * @param {string} params.branch - branch to read the file from
+ * @param {string|null} [params.accessToken] - Gitlab personal access token
+ * @param {string} params.sourceFilePath - repo-relative path of the file
+ * @returns {Promise} resolves to { success, reason, content } - content is null on failure
+ */
+async function fetchGitlabFile({
+  repoUrl,
+  branch,
+  accessToken = null,
+  sourceFilePath,
+}) {
+  const repo = new RepoLoader({
+    repo: repoUrl,
+    branch,
+    accessToken,
+  });
+  await repo.init();
+
+  if (!repo.ready)
+    return {
+      success: false,
+      content: null,
+      reason: "Could not prepare GitLab repo for loading! Check URL or PAT.",
+    };
+  const repoName = repo.repo.split("/").pop();
+  console.log(
+    `-- Working GitLab ${repoName}:${repo.branch} file:${sourceFilePath} --`
+  );
+  const fileContent = await repo.fetchSingleFile(sourceFilePath);
+  if (!fileContent) {
+    return {
+      success: false,
+      reason: "Target file returned a null content response.",
+      content: null,
+    };
+  }
+
+  return {
+    success: true,
+    reason: null,
+    content: fileContent,
+  };
+}
+
+/**
+ * Build the chunkSource of a document: the repo URL followed by an encrypted
+ * payload holding the project id, branch, file path and access token
+ * @param {RepoLoader} repo - initialized loader the document came from
+ * @param {object} doc - loaded document; only doc.metadata.source is read
+ * @param {object} encryptionWorker - must expose encrypt(string)
+ * @returns {string}
+ */
+function generateChunkSource(repo, doc, encryptionWorker) {
+  const payload = {
+    projectId: repo.projectId,
+    branch: repo.branch,
+    path: doc.metadata.source,
+    pat: !!repo.accessToken ? repo.accessToken : null,
+  };
+  return `gitlab://${repo.repo}?payload=${encryptionWorker.encrypt(
+    JSON.stringify(payload)
+  )}`;
+}
+
+module.exports = { loadGitlabRepo, fetchGitlabFile };
diff --git a/frontend/src/components/DataConnectorOption/media/gitlab.svg b/frontend/src/components/DataConnectorOption/media/gitlab.svg
new file mode 100644
index 00000000000..0d48a00cbb0
--- /dev/null
+++ b/frontend/src/components/DataConnectorOption/media/gitlab.svg
@@ -0,0 +1,7 @@
+
diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js
index dee46a12b5e..cbc80b642db 100644
--- a/frontend/src/components/DataConnectorOption/media/index.js
+++ b/frontend/src/components/DataConnectorOption/media/index.js
@@ -1,10 +1,12 @@
 import Github from "./github.svg";
+import Gitlab from "./gitlab.svg";
 import YouTube from "./youtube.svg";
 import Link from "./link.svg";
 import Confluence from "./confluence.jpeg";
 
 const ConnectorImages = {
   github: Github,
+  gitlab: Gitlab,
youtube: YouTube, websiteDepth: Link, confluence: Confluence, diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx new file mode 100644 index 00000000000..76528b71642 --- /dev/null +++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx @@ -0,0 +1,316 @@ +import React, { useEffect, useState } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import pluralize from "pluralize"; +import { TagsInput } from "react-tag-input-component"; +import { Info, Warning } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; + +const DEFAULT_BRANCHES = ["main", "master"]; +export default function GitlabOptions() { + const [loading, setLoading] = useState(false); + const [repo, setRepo] = useState(null); + const [accessToken, setAccessToken] = useState(null); + const [ignores, setIgnores] = useState([]); + const [settings, setSettings] = useState({ + repos: null, + accessToken: null, + }); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = new FormData(e.target); + + try { + setLoading(true); + const repos = form.get("repos").split("\n").filter(Boolean); + + for (const repo of repos) { + showToast( + `Fetching all files for repo ${repo} - this may take a while.`, + "info", + { clear: true, autoClose: false } + ); + const { data, error } = await System.dataConnectors.gitlab.collect({ + repo: repo.trim(), + accessToken: form.get("accessToken"), + branch: form.get("branch"), + ignorePaths: ignores, + }); + + if (!!error) { + showToast(`Error for ${repo}: ${error}`, "error", { clear: true }); + continue; + } + + showToast( + `${data.files} ${pluralize("file", data.files)} collected from ${ + data.author + }/${data.repo}:${data.branch}. 
Output folder is ${data.destination}.`, + "success", + { clear: true } + ); + } + + e.target.reset(); + setLoading(false); + } catch (e) { + console.error(e); + showToast(e.message, "error", { clear: true }); + setLoading(false); + } + }; + + return ( +
+ Branch you wish to collect files from. +
++ Branch you wish to collect files from. +
+
+ Without filling out the GitLab Access Token this data connector
+ will only be able to collect the top-level files of the repo
+ due to GitLab's public API rate-limits.
+
+
+ e.stopPropagation()}
+ >
+ {" "}
+ Get a free Personal Access Token with a GitLab account here.
+
+
+ Without a{" "} + e.stopPropagation()} + > + Personal Access Token + + , the GitLab API may limit the number of files that can be collected + due to rate limits. You can{" "} + e.stopPropagation()} + > + create a temporary Access Token + {" "} + to avoid this issue. +
+