From 4e4fc08f268b89a9f54f3cf1e42c342f854c3bfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C5=82a=C5=BCej=20Owczarczyk?=
Date: Fri, 20 Sep 2024 18:56:55 +0200
Subject: [PATCH 1/6] Added an option to fetch issues from gitlab. Made the
file fetching asynchronous to improve performance. #2334
---
.../RepoLoader/GitlabRepo/RepoLoader/index.js | 265 +++++++++++-------
.../extensions/RepoLoader/GitlabRepo/index.js | 34 ++-
.../Connectors/Gitlab/index.jsx | 22 ++
frontend/src/models/dataConnector.js | 16 +-
4 files changed, 226 insertions(+), 111 deletions(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
index 75583229850..e58fc539abf 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
@@ -6,6 +6,7 @@ const minimatch = require("minimatch");
* @property {string} [branch] - The branch to load from (optional).
* @property {string} [accessToken] - GitLab access token for authentication (optional).
* @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional).
+ * @property {boolean} [fetchIssues] - Should issues be fetched (optional).
*/
/**
@@ -33,6 +34,7 @@ class GitLabRepoLoader {
this.branch = args?.branch;
this.accessToken = args?.accessToken || null;
this.ignorePaths = args?.ignorePaths || [];
+ this.withIssues = args?.fetchIssues || false;
this.projectId = null;
this.apiBase = "https://gitlab.com";
@@ -123,22 +125,44 @@ class GitLabRepoLoader {
if (this.accessToken)
console.log(
- `[Gitlab Loader]: Access token set! Recursive loading enabled!`
+ `[Gitlab Loader]: Access token set! Recursive loading enabled for ${this.repo}!`
);
- const files = await this.fetchFilesRecursive();
const docs = [];
+ console.log(`[Gitlab Loader]: Fetching files.`);
+
+ const files = await this.fetchFilesRecursive();
+
+ console.log(`[Gitlab Loader]: Fetched ${files.length} files.`);
+
for (const file of files) {
if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
- const content = await this.fetchSingleFileContents(file.path);
- if (content) {
- docs.push({
- pageContent: content,
- metadata: { source: file.path },
- });
- }
+ docs.push({
+ pageContent: file.content,
+ metadata: {
+ source: file.path,
+ url: `${this.repo}/-/blob/${this.branch}/${file.path}`,
+ },
+ });
+ }
+
+ if (this.withIssues) {
+ console.log(`[Gitlab Loader]: Fetching issues.`);
+ const issues = await this.fetchIssues();
+ console.log(
+ `[Gitlab Loader]: Fetched ${issues.length} issues with discussions.`
+ );
+ docs.push(
+ ...issues.map((issue) => ({
+ issue,
+ metadata: {
+ source: `issue-${this.repo}-${issue.iid}`,
+ url: issue.web_url,
+ },
+ }))
+ );
}
return docs;
@@ -160,51 +184,14 @@ class GitLabRepoLoader {
if (!this.#validGitlabUrl() || !this.projectId) return [];
await this.#validateAccessToken();
this.branches = [];
- let fetching = true;
- let page = 1;
- let perPage = 50;
-
- while (fetching) {
- try {
- const params = new URLSearchParams({
- per_page: perPage,
- page,
- });
- const response = await fetch(
- `${this.apiBase}/api/v4/projects/${
- this.projectId
- }/repository/branches?${params.toString()}`,
- {
- method: "GET",
- headers: {
- Accepts: "application/json",
- ...(this.accessToken
- ? { "PRIVATE-TOKEN": this.accessToken }
- : {}),
- },
- }
- )
- .then((res) => res.json())
- .then((branches) => {
- if (!Array.isArray(branches) || branches.length === 0) {
- fetching = false;
- return [];
- }
- return branches.map((b) => b.name);
- })
- .catch((e) => {
- console.error(e);
- fetching = false;
- return [];
- });
-
- this.branches.push(...response);
- page++;
- } catch (err) {
- console.log(`RepoLoader.getRepoBranches`, err);
- fetching = false;
- return [];
- }
+
+ const branchesRequestData = {
+ endpoint: `/api/v4/projects/${this.projectId}/repository/branches`,
+ };
+
+ let branchesPage = [];
+ while ((branchesPage = await this.fetchNextPage(branchesRequestData))) {
+ this.branches.push(...branchesPage.map((branch) => branch.name));
}
return this.#branchPrefSort(this.branches);
}
@@ -215,62 +202,95 @@ class GitLabRepoLoader {
*/
async fetchFilesRecursive() {
const files = [];
- let perPage = 100;
- let fetching = true;
- let page = 1;
-
- while (fetching) {
- try {
- const params = new URLSearchParams({
- ref: this.branch,
- recursive: true,
- per_page: perPage,
- page,
- });
- const queryUrl = `${this.apiBase}/api/v4/projects/${
- this.projectId
- }/repository/tree?${params.toString()}`;
- const response = await fetch(queryUrl, {
- method: "GET",
- headers: this.accessToken
- ? { "PRIVATE-TOKEN": this.accessToken }
- : {},
- });
- const totalPages = Number(response.headers.get("x-total-pages"));
- const nextPage = Number(response.headers.get("x-next-page"));
- const data = await response.json();
-
- /** @type {FileTreeObject[]} */
- const objects = Array.isArray(data)
- ? data.filter((item) => item.type === "blob")
- : []; // only get files, not paths or submodules
-
- // Apply ignore path rules to found objects. If any rules match it is an invalid file path.
- console.log(
- `Found ${objects.length} blobs from repo from pg ${page}/${totalPages}`
- );
- for (const file of objects) {
+ const filesRequestData = {
+ endpoint: `/api/v4/projects/${this.projectId}/repository/tree`,
+ queryParams: {
+ ref: this.branch,
+ recursive: true,
+ },
+ };
+
+ let filesPage = null;
+ let pagePromises = [];
+ while ((filesPage = await this.fetchNextPage(filesRequestData))) {
+ // Fetch all the files that are not ignored in parallel.
+ pagePromises = filesPage
+ .filter((file) => {
+ if (file.type !== "blob") {
+ return false;
+ }
const isIgnored = this.ignorePaths.some((ignorePattern) =>
minimatch(file.path, ignorePattern, { matchBase: true })
);
- if (!isIgnored) files.push(file);
- }
+ return !isIgnored;
+ })
+ .map(async (file) => {
+ const content = await this.fetchSingleFileContents(file.path);
+ if (!content) return null;
+ return {
+ path: file.path,
+ content,
+ };
+ });
- if (page === totalPages) {
- fetching = false;
- break;
- }
+ const pageFiles = await Promise.all(pagePromises);
- page = Number(nextPage);
- } catch (e) {
- console.error(`RepoLoader.getRepositoryTree`, e);
- fetching = false;
- break;
- }
+ files.push(...pageFiles.filter((item) => item !== null));
+ console.log(`Fetched ${files.length} files.`);
}
+ console.log(`Total files fetched: ${files.length}`);
return files;
}
+ /**
+ * Fetches all issues from the repository.
+ * @returns {Promise<Array<Object>>} Resolves to an array of issue objects, each with its discussions attached.
+ */
+ async fetchIssues() {
+ const issues = [];
+ const issuesRequestData = {
+ endpoint: `/api/v4/projects/${this.projectId}/issues`,
+ };
+
+ let issuesPage = null;
+ let pagePromises = [];
+ while ((issuesPage = await this.fetchNextPage(issuesRequestData))) {
+ // Fetch all the issues in parallel.
+ pagePromises = issuesPage.map(async (issue) => {
+ const discussionsRequestData = {
+ endpoint: `/api/v4/projects/${this.projectId}/issues/${issue.iid}/discussions`,
+ };
+ let discussionPage = null;
+ const discussions = [];
+
+ while (
+ (discussionPage = await this.fetchNextPage(discussionsRequestData))
+ ) {
+ discussions.push(
+ ...discussionPage.map(({ notes }) =>
+ notes.map(
+ ({ body, author, created_at }) =>
+ `${author.username} at ${created_at}: ${body}`
+ )
+ )
+ );
+ }
+ const result = {
+ ...issue,
+ discussions,
+ };
+ return result;
+ });
+
+ const pageIssues = await Promise.all(pagePromises);
+
+ issues.push(...pageIssues);
+ console.log(`Fetched ${issues.length} issues.`);
+ }
+ console.log(`Total issues fetched: ${issues.length}`);
+ return issues;
+ }
+
/**
* Fetches the content of a single file from the repository.
* @param {string} sourceFilePath - The path to the file in the repository.
@@ -301,6 +321,49 @@ class GitLabRepoLoader {
return null;
}
}
+
+ /**
+ * Fetches the next page of data from the API.
+ * @param {Object} requestData - The request data.
+ * @returns {Promise<Array<Object>|null>} The next page of data, or null if there are no more pages or the request fails.
+ */
+ async fetchNextPage(requestData) {
+ try {
+ if (requestData.page === -1) return null;
+ if (!requestData.page) requestData.page = 1;
+
+ const { endpoint, perPage = 100, queryParams = {} } = requestData;
+ const params = new URLSearchParams({
+ ...queryParams,
+ per_page: perPage,
+ page: requestData.page,
+ });
+ const url = `${this.apiBase}${endpoint}?${params.toString()}`;
+
+ const response = await fetch(url, {
+ method: "GET",
+ headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
+ });
+
+ const totalPages = Number(response.headers.get("x-total-pages"));
+ const data = await response.json();
+
+ console.log(
+ `Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
+ );
+
+ if (totalPages === requestData.page) {
+ requestData.page = -1;
+ } else {
+ requestData.page = Number(response.headers.get("x-next-page"));
+ }
+
+ return data;
+ } catch (e) {
+ console.error(`RepoLoader.fetchNextPage`, e);
+ return null;
+ }
+ }
}
module.exports = GitLabRepoLoader;
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
index e756463c752..89f3ecb90bb 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -50,13 +50,12 @@ async function loadGitlabRepo(args, response) {
fs.mkdirSync(outFolderPath, { recursive: true });
for (const doc of docs) {
- if (!doc.pageContent) continue;
+ if (!doc.metadata || (!doc.pageContent && !doc.issue)) continue;
+ let pageContent = null;
+
const data = {
id: v4(),
url: "gitlab://" + doc.metadata.source,
- title: doc.metadata.source,
- docAuthor: repo.author,
- description: "No description found.",
docSource: doc.metadata.source,
chunkSource: generateChunkSource(
repo,
@@ -64,13 +63,32 @@ async function loadGitlabRepo(args, response) {
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
- wordCount: doc.pageContent.split(" ").length,
- pageContent: doc.pageContent,
- token_count_estimate: tokenizeString(doc.pageContent).length,
};
+
+ if (doc.pageContent) {
+ pageContent = doc.pageContent;
+
+ data.title = doc.metadata.source;
+ data.docAuthor = repo.author;
+ data.description = "No description found.";
+ } else if (doc.issue) {
+ pageContent = JSON.stringify(doc.issue);
+
+ data.title = `Issue ${doc.issue.iid}: ${doc.issue.title}`;
+ data.docAuthor = doc.issue.author.username;
+ data.description = doc.issue.description;
+ } else {
+ continue;
+ }
+
+ data.wordCount = pageContent.split(" ").length;
+ data.token_count_estimate = tokenizeString(pageContent).length;
+ data.pageContent = pageContent;
+
console.log(
- `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
+ `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
+
writeToServerDocuments(
data,
`${slugify(doc.metadata.source)}-${data.id}`,
diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
index f3c34dc8a3c..c426a8610c1 100644
--- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
+++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
@@ -34,6 +34,7 @@ export default function GitlabOptions() {
accessToken: form.get("accessToken"),
branch: form.get("branch"),
ignorePaths: ignores,
+ fetchIssues: form.get("fetchIssues"),
});
if (!!error) {
@@ -112,6 +113,27 @@ export default function GitlabOptions() {
onBlur={() => setSettings({ ...settings, accessToken })}
/>
+
+
+
+
+ Select additional entities to fetch from the GitLab API.
+
+
+
+
+
+
+
res.json())
.then((res) => {
From 0673d4183ae88e8ff77ca630b2e9e8ad9a21b9f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C5=82a=C5=BCej=20Owczarczyk?=
Date: Fri, 20 Sep 2024 19:19:31 +0200
Subject: [PATCH 2/6] Fixed a typo in loadGitlabRepo.
---
collector/utils/extensions/RepoLoader/GitlabRepo/index.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
index 89f3ecb90bb..3af2a3cf94d 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -86,7 +86,7 @@ async function loadGitlabRepo(args, response) {
data.pageContent = pageContent;
console.log(
- `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
+ `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
writeToServerDocuments(
From 3ac0cadc98f0214f4a88e9766fa499ad985bb234 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C5=82a=C5=BCej=20Owczarczyk?=
Date: Tue, 24 Sep 2024 01:03:02 +0200
Subject: [PATCH 3/6] Convert issues to markdown.
---
.../RepoLoader/GitlabRepo/RepoLoader/index.js | 3 +-
.../extensions/RepoLoader/GitlabRepo/index.js | 91 ++++++++++++++++++-
2 files changed, 92 insertions(+), 2 deletions(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
index e58fc539abf..ef3c61d1364 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
@@ -270,7 +270,8 @@ class GitLabRepoLoader {
...discussionPage.map(({ notes }) =>
notes.map(
({ body, author, created_at }) =>
- `${author.username} at ${created_at}: ${body}`
+ `${author.username} at ${created_at}:
+${body}`
)
)
);
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
index 3af2a3cf94d..60dcead26be 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -72,7 +72,7 @@ async function loadGitlabRepo(args, response) {
data.docAuthor = repo.author;
data.description = "No description found.";
} else if (doc.issue) {
- pageContent = JSON.stringify(doc.issue);
+ pageContent = issueToMarkdown(doc.issue);
data.title = `Issue ${doc.issue.iid}: ${doc.issue.title}`;
data.docAuthor = doc.issue.author.username;
@@ -160,4 +160,93 @@ function generateChunkSource(repo, doc, encryptionWorker) {
)}`;
}
+function issueToMarkdown(issue) {
+ const metadata = {};
+
+ const userFields = ["author", "assignees", "closed_by"];
+ const userToUsername = ({ username }) => username;
+ for (const userField of userFields) {
+ if (issue[userField]) {
+ if (Array.isArray(issue[userField])) {
+ metadata[userField] = issue[userField].map(userToUsername);
+ } else {
+ metadata[userField] = userToUsername(issue[userField]);
+ }
+ }
+ }
+
+ const singleValueFields = [
+ "web_url",
+ "state",
+ "created_at",
+ "updated_at",
+ "closed_at",
+ "due_date",
+ "type",
+ "merge_request_count",
+ "upvotes",
+ "downvotes",
+ "labels",
+ "has_tasks",
+ "task_status",
+ "confidential",
+ "severity",
+ ];
+
+ for (const singleValueField of singleValueFields) {
+ metadata[singleValueField] = issue[singleValueField];
+ }
+
+ if (issue.milestone) {
+ metadata.milestone = `${issue.milestone.title} (${issue.milestone.id})`;
+ }
+
+ if (issue.time_stats) {
+ const timeFields = ["human_time_estimate", "human_total_time_spent"];
+ for (const timeField of timeFields) {
+ const fieldName = `human_${timeField}`;
+ if (issue?.time_stats[fieldName]) {
+ metadata[timeField] = issue.time_stats[fieldName];
+ }
+ }
+ }
+
+ const metadataString = Object.entries(metadata)
+ .map(([name, value]) => {
+ if (!value || value?.length < 1) {
+ return null;
+ }
+ let result = `- ${name.replace("_", " ")}:`;
+
+ if (!Array.isArray(value)) {
+ result += ` ${value}`;
+ } else {
+ result += "\n" + value.map((s) => ` - ${s}`).join("\n");
+ }
+
+ return result;
+ })
+ .filter((item) => item != null)
+ .join("\n");
+
+ let markdown = `# ${issue.title} (${issue.iid})
+
+${issue.description}
+
+## Metadata
+
+${metadataString}`;
+
+ if (issue.discussions.length > 0) {
+ markdown += `
+
+## Activity
+
+${issue.discussions.join("\n\n")}
+`;
+ }
+
+ return markdown;
+}
+
module.exports = { loadGitlabRepo, fetchGitlabFile };
From 6d14e64a648c42b291ab1134d8fe03b4c16b7802 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C5=82a=C5=BCej=20Owczarczyk?=
Date: Tue, 24 Sep 2024 12:02:51 +0200
Subject: [PATCH 4/6] Fixed an issue with time estimate field names in
issueToMarkdown.
---
collector/utils/extensions/RepoLoader/GitlabRepo/index.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
index 60dcead26be..f1c528f1d9f 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -202,7 +202,7 @@ function issueToMarkdown(issue) {
}
if (issue.time_stats) {
- const timeFields = ["human_time_estimate", "human_total_time_spent"];
+ const timeFields = ["time_estimate", "total_time_spent"];
for (const timeField of timeFields) {
const fieldName = `human_${timeField}`;
if (issue?.time_stats[fieldName]) {
From 5eebbf2183323d6590b6f3af8c81867b114001cf Mon Sep 17 00:00:00 2001
From: shatfield4
Date: Wed, 25 Sep 2024 17:52:55 -0700
Subject: [PATCH 5/6] handle rate limits more gracefully + update checkbox to
toggle switch
---
.../RepoLoader/GitlabRepo/RepoLoader/index.js | 11 +++++++++++
.../Connectors/Gitlab/index.jsx | 19 +++++++++++--------
2 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
index ef3c61d1364..7c772ceb81e 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
@@ -346,8 +346,19 @@ ${body}`
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
});
+ // Rate limits get hit very often if no PAT is provided
+ if (response.status === 401) {
+ console.warn(`Rate limit hit for ${endpoint}. Skipping.`);
+ return null;
+ }
+
const totalPages = Number(response.headers.get("x-total-pages"));
const data = await response.json();
+u
+ if (!Array.isArray(data)) {
+ console.warn(`Unexpected response format for ${endpoint}:`, data);
+ return [];
+ }
console.log(
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
index c426a8610c1..265f2fe4bae 100644
--- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
+++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
@@ -123,14 +123,17 @@ export default function GitlabOptions() {
From e83b9892632e05cb533ba34473e29caac5dc307c Mon Sep 17 00:00:00 2001
From: shatfield4
Date: Wed, 25 Sep 2024 17:54:28 -0700
Subject: [PATCH 6/6] lint
---
.../utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js | 1 -
1 file changed, 1 deletion(-)
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
index 7c772ceb81e..9ebc3c0db71 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
@@ -354,7 +354,6 @@ ${body}`
const totalPages = Number(response.headers.get("x-total-pages"));
const data = await response.json();
-u
if (!Array.isArray(data)) {
console.warn(`Unexpected response format for ${endpoint}:`, data);
return [];