From 6d746c39b27e867d5bb5f817bda7788fefcb6df0 Mon Sep 17 00:00:00 2001 From: "Emil Rofors (aider)" Date: Thu, 25 Jul 2024 14:19:26 -0700 Subject: [PATCH 1/7] Updated the `GitHubRepoLoader` class to use the new import syntax and adjust the `recursiveLoader` method accordingly. --- .../RepoLoader/GithubRepo/RepoLoader/index.js | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js index 08121f44f28..a439d7ed14f 100644 --- a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js @@ -103,26 +103,23 @@ class GitHubRepoLoader { */ async recursiveLoader() { if (!this.ready) throw new Error("[Github Loader]: not in ready state!"); - const { - GithubRepoLoader: LCGithubLoader, - } = require("langchain/document_loaders/web/github"); + const { GithubRepoLoader } = require("@langchain/community/document_loaders/web/github"); if (this.accessToken) console.log( `[Github Loader]: Access token set! Recursive loading enabled!` ); - const loader = new LCGithubLoader(this.repo, { - accessToken: this.accessToken, + const loader = new GithubRepoLoader(this.repo, { branch: this.branch, recursive: !!this.accessToken, // Recursive will hit rate limits. maxConcurrency: 5, - unknown: "ignore", + unknown: "warn", + accessToken: this.accessToken, ignorePaths: this.ignorePaths, }); - const docs = []; - for await (const doc of loader.loadAsStream()) docs.push(doc); + const docs = await loader.load(); return docs; } From a69e63dc1a480aaf48057fed9bff41aaa17d61bc Mon Sep 17 00:00:00 2001 From: Emil Rofors Date: Sat, 3 Aug 2024 15:25:15 -0700 Subject: [PATCH 2/7] add @langchain/community to collector package.json --- collector/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collector/package.json b/collector/package.json index 217c5285757..3640fce7daa 100644 --- a/collector/package.json +++ b/collector/package.json @@ -15,6 +15,7 @@ "lint": "yarn prettier --ignore-path ../.prettierignore --write ./processSingleFile ./processLink ./utils index.js" }, "dependencies": { + "@langchain/community": "^0.2.23", "@xenova/transformers": "^2.11.0", "bcrypt": "^5.1.0", "body-parser": "^1.20.2", @@ -48,4 +49,4 @@ "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} \ No newline at end of file +} From 20e5af9f2843514e5f3c9ac1a507ca486a762158 Mon Sep 17 00:00:00 2001 From: "Emil Rofors (aider)" Date: Sun, 4 Aug 2024 13:36:00 -0700 Subject: [PATCH 3/7] fix: Improve handling of complex ignore patterns in GitLabRepoLoader --- .../RepoLoader/GitlabRepo/RepoLoader/index.js | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js index c909329862a..e9a8b028300 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js @@ -233,9 +233,21 @@ class GitLabRepoLoader { `Found ${objects.length} blobs from repo from pg ${page}/${totalPages}` ); for (const file of objects) { - const isIgnored = this.ignorePaths.some((ignorePattern) => - minimatch(file.path, ignorePattern, { matchBase: true }) - ); + let isIgnored = false; + for (const ignorePattern of this.ignorePaths) { + if (ignorePattern.startsWith('!')) { + // This is an "un-ignore" pattern + if (minimatch(file.path, ignorePattern.slice(1), { matchBase: true })) { + isIgnored = false; + break; + } + } else { + // This is an ignore pattern + if (minimatch(file.path, ignorePattern, { matchBase: true })) { + isIgnored = true; + } + } + } if (!isIgnored) files.push(file); } From 24a4d565b00793bd6bfe8076468c453063fd8535 Mon Sep 17 00:00:00 2001 From: "Emil Rofors (aider)" Date: Sun, 4 Aug 2024 13:46:39 -0700 Subject: [PATCH 4/7] refactor: use ignore package for simplified ignore logic --- .../RepoLoader/GitlabRepo/RepoLoader/index.js | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js index e9a8b028300..b0feae6a1c8 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js @@ -1,4 +1,4 @@ -const minimatch = require("minimatch"); +const ignore = require('ignore'); /** * @typedef {Object} RepoLoaderArgs @@ -39,6 +39,9 @@ class GitLabRepoLoader { this.author = null; this.project = null; this.branches = []; + + // Create an ignore instance + this.ignoreFilter = ignore().add(this.ignorePaths); } #validGitlabUrl() { @@ -233,22 +236,9 @@ class GitLabRepoLoader { `Found ${objects.length} blobs from repo from pg ${page}/${totalPages}` ); for (const file of objects) { - let isIgnored = false; - for (const ignorePattern of this.ignorePaths) { - if (ignorePattern.startsWith('!')) { - // This is an "un-ignore" pattern - if (minimatch(file.path, ignorePattern.slice(1), { matchBase: true })) { - isIgnored = false; - break; - } - } else { - // This is an ignore pattern - if (minimatch(file.path, ignorePattern, { matchBase: true })) { - isIgnored = true; - } - } + if (!this.ignoreFilter.ignores(file.path)) { + files.push(file); } - if (!isIgnored) files.push(file); } if (page === totalPages) { From 6fcf5b45d99d2c25b40d0c3f65275134af91e05d Mon Sep 17 00:00:00 2001 From: Emil Rofors Date: Sun, 4 Aug 2024 15:40:42 -0700 Subject: [PATCH 5/7] run yarn lint --- .../extensions/RepoLoader/GithubRepo/RepoLoader/index.js | 4 +++- .../extensions/RepoLoader/GitlabRepo/RepoLoader/index.js | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js index a439d7ed14f..8f37725d47a 100644 --- a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js @@ -103,7 +103,9 @@ class GitHubRepoLoader { */ async recursiveLoader() { if (!this.ready) throw new Error("[Github Loader]: not in ready state!"); - const { GithubRepoLoader } = require("@langchain/community/document_loaders/web/github"); + const { + GithubRepoLoader, + } = require("@langchain/community/document_loaders/web/github"); if (this.accessToken) console.log( diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js index b0feae6a1c8..e9512e198f0 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js @@ -1,4 +1,4 @@ -const ignore = require('ignore'); +const ignore = require("ignore"); /** * @typedef {Object} RepoLoaderArgs From b747a6494c0d1fe18e0db620d938d38c2284e5f5 Mon Sep 17 00:00:00 2001 From: Emil Rofors Date: Mon, 5 Aug 2024 09:21:12 -0700 Subject: [PATCH 6/7] add @langchain/community@^0.2.23 --- collector/yarn.lock | 109 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/collector/yarn.lock b/collector/yarn.lock index 6f8b49c74cf..3a16c981e05 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -64,6 +64,23 @@ resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627" integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== +"@langchain/community@^0.2.23": + version "0.2.23" + resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.2.23.tgz#20560e107bcc8432c42e499f1b9292d41a3732f2" + integrity sha512-p1n/zZ1F+O5l51RzeoUeJyhpzq6Wp11tkqKOj8oThKOQJgLhO7q6iFIvmKThzL7mZCNNuPM5r1OPnU4wO6iF/A== + dependencies: + "@langchain/core" ">=0.2.16 <0.3.0" + "@langchain/openai" ">=0.1.0 <0.3.0" + binary-extensions "^2.2.0" + expr-eval "^2.0.2" + flat "^5.0.2" + js-yaml "^4.1.0" + langchain "~0.2.3" + langsmith "~0.1.30" + uuid "^10.0.0" + zod "^3.22.3" + zod-to-json-schema "^3.22.5" + "@langchain/community@~0.0.47": version "0.0.53" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.53.tgz#a9aaedffa0ed2977e8d302d74e9f90a49a6da037" @@ -78,6 +95,23 @@ zod "^3.22.3" zod-to-json-schema "^3.22.5" +"@langchain/core@>=0.2.11 <0.3.0", "@langchain/core@>=0.2.16 <0.3.0": + version "0.2.20" + resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.2.20.tgz#5115781b0a86db3ce4b697e473405892c09621ca" + integrity sha512-WPBjrzOj79/yqjloDUIw1GDhuRQfHis07TyyDj+qS81nHh0svSasetKcqAZ3L5JoPcBmEL7rRBtM+OcyC3mLVg== + dependencies: + ansi-styles "^5.0.0" + camelcase "6" + decamelize "1.2.0" + js-tiktoken "^1.0.12" + langsmith "~0.1.39" + mustache "^4.2.0" + p-queue "^6.6.2" + p-retry "4" + uuid "^10.0.0" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/core@~0.1", "@langchain/core@~0.1.56", "@langchain/core@~0.1.60": version "0.1.61" resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736" @@ -96,6 +130,17 @@ zod "^3.22.4" zod-to-json-schema "^3.22.3" +"@langchain/openai@>=0.1.0 <0.3.0": + version "0.2.5" + resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.2.5.tgz#e85b983986a7415ea743d4c854bb0674134334d4" + integrity sha512-gQXS5VBFyAco0jgSnUVan6fYVSIxlffmDaeDGpXrAmz2nQPgiN/h24KYOt2NOZ1zRheRzRuO/CfRagMhyVUaFA== + dependencies: + "@langchain/core" ">=0.2.16 <0.3.0" + js-tiktoken "^1.0.12" + openai "^4.49.1" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/openai@~0.0.28": version "0.0.28" resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.28.tgz#afaeec61b44816935db9ae937496c964c81ab571" @@ -1769,6 +1814,13 @@ js-tiktoken@^1.0.11, js-tiktoken@^1.0.7, js-tiktoken@^1.0.8: dependencies: base64-js "^1.5.1" +js-tiktoken@^1.0.12: + version "1.0.12" + resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.12.tgz#af0f5cf58e5e7318240d050c8413234019424211" + integrity sha512-L7wURW1fH9Qaext0VzaUDpFGVQgjkdE3Dgsy9/+yXyGEpBKnylTd0mU0bfbNkKDlXRb6TEsZkwuflu1B8uQbJQ== + dependencies: + base64-js "^1.5.1" + js-tokens@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" @@ -1844,6 +1896,28 @@ langchain@0.1.36: zod "^3.22.4" zod-to-json-schema "^3.22.3" +langchain@~0.2.3: + version "0.2.12" + resolved "https://registry.yarnpkg.com/langchain/-/langchain-0.2.12.tgz#3fac0b9519a070689b6dd679d5854abc57824dcf" + integrity sha512-ZHtJrHUpridZ7IQu7N/wAQ6iMAAO7VLzkupHqKP79S6p+alrPbn1BjRnh+PeGm92YiY5DafTCuvchmujxx7bCQ== + dependencies: + "@langchain/core" ">=0.2.11 <0.3.0" + "@langchain/openai" ">=0.1.0 <0.3.0" + "@langchain/textsplitters" "~0.0.0" + binary-extensions "^2.2.0" + js-tiktoken "^1.0.12" + js-yaml "^4.1.0" + jsonpointer "^5.0.1" + langchainhub "~0.0.8" + langsmith "~0.1.30" + ml-distance "^4.0.0" + openapi-types "^12.1.3" + p-retry "4" + uuid "^10.0.0" + yaml "^2.2.1" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + langchainhub@~0.0.8: version "0.0.8" resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.8.tgz#fd4b96dc795e22e36c1a20bad31b61b0c33d3110" @@ -1860,6 +1934,18 @@ langsmith@~0.1.1, langsmith@~0.1.7: p-retry "4" uuid "^9.0.0" +langsmith@~0.1.30, langsmith@~0.1.39: + version "0.1.40" + resolved "https://registry.yarnpkg.com/langsmith/-/langsmith-0.1.40.tgz#9708889386a5b9d0eb43dd3a9eba93513b57101d" + integrity sha512-11E2WLbh/+41+Qc0w8fJJTC/iz91BA+zXRMX/Wz0KSstnfzIPBoiWa++Kp2X8yCIDNywWWLJhy/B8gYzm7VKig== + dependencies: + "@types/uuid" "^9.0.1" + commander "^10.0.1" + p-queue "^6.6.2" + p-retry "4" + semver "^7.6.3" + uuid "^9.0.0" + leac@^0.6.0: version "0.6.0" resolved "https://registry.yarnpkg.com/leac/-/leac-0.6.0.tgz#dcf136e382e666bd2475f44a1096061b70dc0912" @@ -2417,6 +2503,19 @@ openai@^4.32.1: node-fetch "^2.6.7" web-streams-polyfill "^3.2.1" +openai@^4.49.1: + version "4.54.0" + resolved "https://registry.yarnpkg.com/openai/-/openai-4.54.0.tgz#eeb209c6892b997e524181b6ddb7e27bf4d09389" + integrity sha512-e/12BdtTtj+tXs7iHm+Dm7H7WjEWnw7O52B2wSfCQ6lD5F6cvjzo7cANXy5TJ1Q3/qc8YRPT5wBTTFtP5sBp1g== + dependencies: + "@types/node" "^18.11.18" + "@types/node-fetch" "^2.6.4" + abort-controller "^3.0.0" + agentkeepalive "^4.2.1" + form-data-encoder "1.7.2" + formdata-node "^4.3.2" + node-fetch "^2.6.7" + openapi-types@^12.1.3: version "12.1.3" resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3" @@ -2863,6 +2962,11 @@ semver@^7.3.5, semver@^7.5.4: dependencies: lru-cache "^6.0.0" +semver@^7.6.3: + version "7.6.3" + resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143" + integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A== + semver@~7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/semver/-/semver-7.0.0.tgz#5f3ca35761e47e05b206c6daff2cf814f0316b8e" @@ -3336,6 +3440,11 @@ utils-merge@1.0.1: resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" integrity sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA== +uuid@^10.0.0: + version "10.0.0" + resolved "https://registry.yarnpkg.com/uuid/-/uuid-10.0.0.tgz#5a95aa454e6e002725c79055fd42aaba30ca6294" + integrity sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ== + uuid@^9.0.0: version "9.0.1" resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" From 6840ed90ed7dd599a143e2db818b7d890229fb42 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 26 Sep 2024 12:48:49 -0700 Subject: [PATCH 7/7] remove unused dep lint --- collector/package.json | 3 +-- .../RepoLoader/GithubRepo/RepoLoader/index.js | 4 ++-- collector/yarn.lock | 14 -------------- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/collector/package.json b/collector/package.json index 3640fce7daa..4ce85e68e10 100644 --- a/collector/package.json +++ b/collector/package.json @@ -31,7 +31,6 @@ "mammoth": "^1.6.0", "mbox-parser": "^1.0.1", "mime": "^3.0.0", - "minimatch": "5.1.0", "moment": "^2.29.4", "node-html-parser": "^6.1.13", "officeparser": "^4.0.5", @@ -49,4 +48,4 @@ "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} +} \ No newline at end of file diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js index b65de492032..61f208742ec 100644 --- a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js @@ -104,7 +104,7 @@ class GitHubRepoLoader { async recursiveLoader() { if (!this.ready) throw new Error("[Github Loader]: not in ready state!"); const { - GithubRepoLoader, + GithubRepoLoader: LCGithubLoader, } = require("@langchain/community/document_loaders/web/github"); if (this.accessToken) @@ -112,7 +112,7 @@ class GitHubRepoLoader { `[Github Loader]: Access token set! Recursive loading enabled!` ); - const loader = new GithubRepoLoader(this.repo, { + const loader = new LCGithubLoader(this.repo, { branch: this.branch, recursive: !!this.accessToken, // Recursive will hit rate limits. maxConcurrency: 5, diff --git a/collector/yarn.lock b/collector/yarn.lock index 3a16c981e05..2786692e091 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -604,13 +604,6 @@ brace-expansion@^1.1.7: balanced-match "^1.0.0" concat-map "0.0.1" -brace-expansion@^2.0.1: - version "2.0.1" - resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae" - integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA== - dependencies: - balanced-match "^1.0.0" - braces@~3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" @@ -2168,13 +2161,6 @@ mimic-response@^3.1.0: resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ== -minimatch@5.1.0: - version "5.1.0" - resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.0.tgz#1717b464f4971b144f6aabe8f2d0b8e4511e09c7" - integrity sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg== - dependencies: - brace-expansion "^2.0.1" - minimatch@^3.1.1, minimatch@^3.1.2: version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"