From bd74111311c98d2a928155140639eaacedd0c030 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 10:58:23 -0700 Subject: [PATCH 1/6] add start of lanceDB support --- server/package.json | 5 +- server/utils/helpers/index.js | 3 + server/utils/lancedb/index.js | 354 ++++++++++++++++++++++++++++++++++ 3 files changed, 360 insertions(+), 2 deletions(-) create mode 100644 server/utils/lancedb/index.js diff --git a/server/package.json b/server/package.json index 5810cabd27c..eeedfab5b22 100644 --- a/server/package.json +++ b/server/package.json @@ -29,10 +29,11 @@ "slugify": "^1.6.6", "sqlite": "^4.2.1", "sqlite3": "^5.1.6", - "uuid": "^9.0.0" + "uuid": "^9.0.0", + "vectordb": "^0.1.4" }, "devDependencies": { "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} \ No newline at end of file +} diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index aa4cf3e89d6..90d4682b008 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -1,5 +1,6 @@ const { Pinecone } = require("../pinecone"); const { Chroma } = require("../chroma"); +const { LanceDb } = require('../lancedb'); function getVectorDbClass() { const vectorSelection = process.env.VECTOR_DB || "pinecone"; @@ -8,6 +9,8 @@ function getVectorDbClass() { return Pinecone; case "chroma": return Chroma; + case "lancedb": + return LanceDb default: return Pinecone; } diff --git a/server/utils/lancedb/index.js b/server/utils/lancedb/index.js new file mode 100644 index 00000000000..48c95f3b78f --- /dev/null +++ b/server/utils/lancedb/index.js @@ -0,0 +1,354 @@ +const lancedb = require("vectordb"); + +// const { ChromaClient, OpenAIEmbeddingFunction } = require("chromadb"); +// const { Chroma: ChromaStore } = require("langchain/vectorstores/chroma"); +// const { OpenAI } = require("langchain/llms/openai"); +// const { ChatOpenAI } = require("langchain/chat_models/openai"); +// const { +// VectorDBQAChain, +// LLMChain, +// RetrievalQAChain, +// ConversationalRetrievalQAChain, +// } = require("langchain/chains"); +// const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); +// const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +// const { storeVectorResult, cachedVectorInformation } = require("../files"); +// const { Configuration, OpenAIApi } = require("openai"); +// const { v4: uuidv4 } = require("uuid"); + +const toChunks = (arr, size) => { + return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) => + arr.slice(i * size, i * size + size) + ); +}; + +function curateSources(sources = []) { + const knownDocs = []; + const documents = []; + for (const source of sources) { + const { metadata = {} } = source; + if ( + Object.keys(metadata).length > 0 && + !knownDocs.includes(metadata.title) + ) { + documents.push({ ...metadata }); + knownDocs.push(metadata.title); + } + } + + return documents; +} + +const LanceDb = { + uri: '../lancedb', + name: 'LanceDb', + connect: async function () { + const client = await lancedb.connect(this.uri); + return { client }; + }, + heartbeat: async function () { + await this.connect(); + return { heartbeat: Number(new Date()) }; + }, + totalIndicies: async function () { + const { client } = await this.connect(); + const collections = await client.listCollections(); + var totalVectors = 0; + for (const collectionObj of collections) { + const collection = await client + .getCollection({ name: collectionObj.name }) + .catch(() => null); + if (!collection) continue; + totalVectors += await collection.count(); + } + return totalVectors; + }, + embeddingFunc: function () { + return new OpenAIEmbeddingFunction({ + openai_api_key: process.env.OPEN_AI_KEY, + }); + }, + embedder: function () { + return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY }); + }, + openai: function () { + const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY }); + const openai = new OpenAIApi(config); + return openai; + }, + llm: function () { + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new OpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); + }, + chatLLM: function () { + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new ChatOpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); + }, + embedChunk: async function (openai, textChunk) { + const { + data: { data }, + } = await openai.createEmbedding({ + model: "text-embedding-ada-002", + input: textChunk, + }); + return data.length > 0 && data[0].hasOwnProperty("embedding") + ? data[0].embedding + : null; + }, + namespace: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collection = await client + .getCollection({ name: namespace }) + .catch(() => null); + if (!collection) return null; + + return { + ...collection, + vectorCount: await collection.count(), + }; + }, + hasNamespace: async function (namespace = null) { + if (!namespace) return false; + const { client } = await this.connect(); + return await this.namespaceExists(client, namespace); + }, + namespaceExists: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collection = await client + .getCollection({ name: namespace }) + .catch((e) => { + console.error("ChromaDB::namespaceExists", e.message); + return null; + }); + return !!collection; + }, + deleteVectorsInNamespace: async function (client, namespace = null) { + await client.deleteCollection({ name: namespace }); + return true; + }, + addDocumentToNamespace: async function ( + namespace, + documentData = {}, + fullFilePath = null + ) { + const { DocumentVectors } = require("../../models/vectors"); + try { + const { pageContent, docId, ...metadata } = documentData; + if (!pageContent || pageContent.length == 0) return false; + + console.log("Adding new vectorized document into namespace", namespace); + const cacheResult = await cachedVectorInformation(fullFilePath); + if (cacheResult.exists) { + const { client } = await this.connect(); + const collection = await client.getOrCreateCollection({ + name: namespace, + metadata: { "hnsw:space": "cosine" }, + embeddingFunction: this.embeddingFunc(), + }); + const { chunks } = cacheResult; + const documentVectors = []; + + for (const chunk of chunks) { + const submission = { + ids: [], + embeddings: [], + metadatas: [], + documents: [], + }; + + // Before sending to Chroma and saving the records to our db + // we need to assign the id of each chunk that is stored in the cached file. + chunk.forEach((chunk) => { + const id = uuidv4(); + const { id: _id, ...metadata } = chunk.metadata; + documentVectors.push({ docId, vectorId: id }); + submission.ids.push(id); + submission.embeddings.push(chunk.values); + submission.metadatas.push(metadata); + submission.documents.push(metadata.text); + }); + + const additionResult = await collection.add(submission); + if (!additionResult) + throw new Error("Error embedding into ChromaDB", additionResult); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } + + // If we are here then we are going to embed and store a novel document. + // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` + // because we then cannot atomically control our namespace to granularly find/remove documents + // from vectordb. + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 20, + }); + const textChunks = await textSplitter.splitText(pageContent); + + console.log("Chunks created from document:", textChunks.length); + const documentVectors = []; + const vectors = []; + const openai = this.openai(); + + const submission = { + ids: [], + embeddings: [], + metadatas: [], + documents: [], + }; + + for (const textChunk of textChunks) { + const vectorValues = await this.embedChunk(openai, textChunk); + + if (!!vectorValues) { + const vectorRecord = { + id: uuidv4(), + values: vectorValues, + // [DO NOT REMOVE] + // LangChain will be unable to find your text if you embed manually and dont include the `text` key. + // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64 + metadata: { ...metadata, text: textChunk }, + }; + + submission.ids.push(vectorRecord.id); + submission.embeddings.push(vectorRecord.values); + submission.metadatas.push(metadata); + submission.documents.push(textChunk); + + vectors.push(vectorRecord); + documentVectors.push({ docId, vectorId: vectorRecord.id }); + } else { + console.error( + "Could not use OpenAI to embed document chunk! This document will not be recorded." + ); + } + } + + const { client } = await this.connect(); + const collection = await client.getOrCreateCollection({ + name: namespace, + metadata: { "hnsw:space": "cosine" }, + embeddingFunction: this.embeddingFunc(), + }); + + if (vectors.length > 0) { + const chunks = []; + + console.log("Inserting vectorized chunks into Chroma collection."); + for (const chunk of toChunks(vectors, 500)) chunks.push(chunk); + + const additionResult = await collection.add(submission); + if (!additionResult) + throw new Error("Error embedding into ChromaDB", additionResult); + + await storeVectorResult(chunks, fullFilePath); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } catch (e) { + console.error("addDocumentToNamespace", e.message); + return false; + } + }, + deleteDocumentFromNamespace: async function (namespace, docId) { + const { DocumentVectors } = require("../../models/vectors"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) return; + const collection = await client.getCollection({ + name: namespace, + embeddingFunction: this.embeddingFunc(), + }); + + const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`); + if (knownDocuments.length === 0) return; + + const vectorIds = knownDocuments.map((doc) => doc.vectorId); + await collection.delete({ ids: vectorIds }); + + const indexes = knownDocuments.map((doc) => doc.id); + await DocumentVectors.deleteIds(indexes); + return true; + }, + query: async function (reqBody = {}) { + const { namespace = null, input } = reqBody; + if (!namespace || !input) throw new Error("Invalid request body"); + + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) { + return { + response: null, + sources: [], + message: "Invalid query - no documents found for workspace!", + }; + } + + // const collection = await client.getCollection({ name: namespace, embeddingFunction: this.embeddingFunc() }) + // const results = await collection.get({ + // where: { + // description: 'a custom file uploaded by the user.' + // }, + // includes: ['ids'] + // }) + // console.log(results) + // return { response: null, sources: [], } + + const vectorStore = await ChromaStore.fromExistingCollection( + this.embedder(), + { collectionName: namespace, url: process.env.CHROMA_ENDPOINT } + ); + const model = this.llm(); + const chain = VectorDBQAChain.fromLLM(model, vectorStore, { + k: 5, + returnSourceDocuments: true, + }); + const response = await chain.call({ query: input }); + return { + response: response.text, + sources: curateSources(response.sourceDocuments), + message: false, + }; + }, + "namespace-stats": async function (reqBody = {}) { + const { namespace = null } = reqBody; + if (!namespace) throw new Error("namespace required"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + const stats = await this.namespace(client, namespace); + return stats + ? stats + : { message: "No stats were able to be fetched from DB for namespace" }; + }, + "delete-namespace": async function (reqBody = {}) { + const { namespace = null } = reqBody; + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + + const details = await this.namespace(client, namespace); + await this.deleteVectorsInNamespace(client, namespace); + return { + message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`, + }; + }, + reset: async function () { + const { client } = await this.connect(); + await client.reset(); + return { reset: true }; + }, +}; + +module.exports = { + LanceDb, +}; From 186941dca604fee00182a4b4108324ae1a52a002 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 18:10:37 -0700 Subject: [PATCH 2/6] lancedb initial support --- server/.gitignore | 3 +- server/package.json | 4 +- server/utils/chroma/index.js | 18 --- server/utils/helpers/index.js | 5 +- server/utils/lancedb/index.js | 268 +++++++++++++++++++++++++++++++--- 5 files changed, 256 insertions(+), 42 deletions(-) diff --git a/server/.gitignore b/server/.gitignore index 46aadf28c3b..06425c05716 100644 --- a/server/.gitignore +++ b/server/.gitignore @@ -4,4 +4,5 @@ documents/* vector-cache/*.json !documents/DOCUMENTS.md logs/server.log -*.db \ No newline at end of file +*.db +lancedb \ No newline at end of file diff --git a/server/package.json b/server/package.json index eeedfab5b22..313ee83a5db 100644 --- a/server/package.json +++ b/server/package.json @@ -30,10 +30,10 @@ "sqlite": "^4.2.1", "sqlite3": "^5.1.6", "uuid": "^9.0.0", - "vectordb": "^0.1.4" + "vectordb": "0.1.5-beta" }, "devDependencies": { "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} +} \ No newline at end of file diff --git a/server/utils/chroma/index.js b/server/utils/chroma/index.js index 002041419b1..f6fee744a16 100644 --- a/server/utils/chroma/index.js +++ b/server/utils/chroma/index.js @@ -65,14 +65,6 @@ const Chroma = { modelName: model, }); }, - chatLLM: function () { - const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; - return new ChatOpenAI({ - openAIApiKey: process.env.OPEN_AI_KEY, - temperature: 0.7, - modelName: model, - }); - }, embedChunk: async function (openai, textChunk) { const { data: { data }, @@ -274,16 +266,6 @@ const Chroma = { }; } - // const collection = await client.getCollection({ name: namespace, embeddingFunction: this.embeddingFunc() }) - // const results = await collection.get({ - // where: { - // description: 'a custom file uploaded by the user.' - // }, - // includes: ['ids'] - // }) - // console.log(results) - // return { response: null, sources: [], } - const vectorStore = await ChromaStore.fromExistingCollection( this.embedder(), { collectionName: namespace, url: process.env.CHROMA_ENDPOINT } diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index 518a4a4cbc1..08a84cc881c 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -1,8 +1,7 @@ - function getVectorDbClass() { const { Pinecone } = require("../pinecone"); const { Chroma } = require("../chroma"); - const { LanceDb } = require('../lancedb'); + const { LanceDb } = require("../lancedb"); const vectorSelection = process.env.VECTOR_DB || "pinecone"; switch (vectorSelection) { @@ -11,7 +10,7 @@ function getVectorDbClass() { case "chroma": return Chroma; case "lancedb": - return LanceDb + return LanceDb; default: throw new Error("ENV: No VECTOR_DB value found in environment!"); } diff --git a/server/utils/lancedb/index.js b/server/utils/lancedb/index.js index 19b8761798e..e157be9be07 100644 --- a/server/utils/lancedb/index.js +++ b/server/utils/lancedb/index.js @@ -1,25 +1,24 @@ const lancedb = require("vectordb"); -const { toChunks, curateSources } = require('../helpers') -// const { ChromaClient, OpenAIEmbeddingFunction } = require("chromadb"); -// const { Chroma: ChromaStore } = require("langchain/vectorstores/chroma"); -// const { OpenAI } = require("langchain/llms/openai"); -// const { ChatOpenAI } = require("langchain/chat_models/openai"); -// const { -// VectorDBQAChain, -// LLMChain, -// RetrievalQAChain, -// ConversationalRetrievalQAChain, -// } = require("langchain/chains"); -// const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); -// const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); -// const { storeVectorResult, cachedVectorInformation } = require("../files"); -// const { Configuration, OpenAIApi } = require("openai"); -// const { v4: uuidv4 } = require("uuid"); +const { toChunks } = require("../helpers"); +const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); +const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { storeVectorResult, cachedVectorInformation } = require("../files"); +const { Configuration, OpenAIApi } = require("openai"); +const { v4: uuidv4 } = require("uuid"); + +// Since we roll our own results for prompting we +// have to manually curate sources as well. +function curateSources(results = []) {} const LanceDb = { - uri: `${!!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./"}lancedb`, - name: 'LanceDb', + uri: `${ + !!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./" + }lancedb`, + name: "LanceDb", connect: async function () { + if (process.env.VECTOR_DB !== "lancedb") + throw new Error("LanceDB::Invalid ENV settings"); + const client = await lancedb.connect(this.uri); return { client }; }, @@ -27,6 +26,239 @@ const LanceDb = { await this.connect(); return { heartbeat: Number(new Date()) }; }, + totalIndicies: async function () { + return 0; // Unsupported for LanceDB - so always zero + }, + embeddingFunc: function () { + return new lancedb.OpenAIEmbeddingFunction( + "context", + process.env.OPEN_AI_KEY + ); + }, + embedder: function () { + return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY }); + }, + openai: function () { + const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY }); + const openai = new OpenAIApi(config); + return openai; + }, + embedChunk: async function (openai, textChunk) { + const { + data: { data }, + } = await openai.createEmbedding({ + model: "text-embedding-ada-002", + input: textChunk, + }); + return data.length > 0 && data[0].hasOwnProperty("embedding") + ? data[0].embedding + : null; + }, + getChatCompletion: async function (openai, messages = []) { + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + const { data } = await openai.createChatCompletion({ + model, + messages, + }); + + if (!data.hasOwnProperty("choices")) return null; + return data.choices[0].message.content; + }, + namespace: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collection = await client.openTable(namespace).catch(() => false); + if (!collection) return null; + + return { + ...collection, + }; + }, + updateOrCreateCollection: async function (client, data = [], namespace) { + if (await this.hasNamespace(namespace)) { + const collection = await client.openTable(namespace); + const result = await collection.add(data); + console.log({ result }); + return true; + } + + const result = await client.createTable(namespace, data); + console.log({ result }); + return true; + }, + hasNamespace: async function (namespace = null) { + if (!namespace) return false; + const { client } = await this.connect(); + const exists = await this.namespaceExists(client, namespace); + return exists; + }, + namespaceExists: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collections = await client.tableNames(); + return collections.includes(namespace); + }, + deleteVectorsInNamespace: async function (client, namespace = null) { + const fs = require("fs"); + fs.rm(`${client.uri}/${namespace}.lance`, { recursive: true }, () => null); + return true; + }, + addDocumentToNamespace: async function ( + namespace, + documentData = {}, + fullFilePath = null + ) { + const { DocumentVectors } = require("../../models/vectors"); + try { + const { pageContent, docId, ...metadata } = documentData; + if (!pageContent || pageContent.length == 0) return false; + + console.log("Adding new vectorized document into namespace", namespace); + const cacheResult = await cachedVectorInformation(fullFilePath); + if (cacheResult.exists) { + const { client } = await this.connect(); + const { chunks } = cacheResult; + const documentVectors = []; + const submissions = []; + + for (const chunk of chunks) { + chunk.forEach((chunk) => { + const id = uuidv4(); + const { id: _id, ...metadata } = chunk.metadata; + documentVectors.push({ docId, vectorId: id }); + submissions.push({ id: id, vector: chunk.values, ...metadata }); + }); + } + + console.log(submissions); + await this.updateOrCreateCollection(client, submissions, namespace); + await DocumentVectors.bulkInsert(documentVectors); + return true; + } + + // If we are here then we are going to embed and store a novel document. + // We have to do this manually as opposed to using LangChains `xyz.fromDocuments` + // because we then cannot atomically control our namespace to granularly find/remove documents + // from vectordb. + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 20, + }); + const textChunks = await textSplitter.splitText(pageContent); + + console.log("Chunks created from document:", textChunks.length); + const documentVectors = []; + const vectors = []; + const submissions = []; + const openai = this.openai(); + + for (const textChunk of textChunks) { + const vectorValues = await this.embedChunk(openai, textChunk); + + if (!!vectorValues) { + const vectorRecord = { + id: uuidv4(), + values: vectorValues, + // [DO NOT REMOVE] + // LangChain will be unable to find your text if you embed manually and dont include the `text` key. + // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64 + metadata: { ...metadata, text: textChunk }, + }; + + vectors.push(vectorRecord); + submissions.push({ + id: vectorRecord.id, + vector: vectorRecord.values, + ...vectorRecord.metadata, + }); + documentVectors.push({ docId, vectorId: vectorRecord.id }); + } else { + console.error( + "Could not use OpenAI to embed document chunk! This document will not be recorded." + ); + } + } + + if (vectors.length > 0) { + const chunks = []; + for (const chunk of toChunks(vectors, 500)) chunks.push(chunk); + + console.log("Inserting vectorized chunks into LanceDB collection."); + const { client } = await this.connect(); + await this.updateOrCreateCollection(client, submissions, namespace); + await storeVectorResult(chunks, fullFilePath); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } catch (e) { + console.error("addDocumentToNamespace", e.message); + return false; + } + }, + query: async function (reqBody = {}) { + const { namespace = null, input } = reqBody; + if (!namespace || !input) throw new Error("Invalid request body"); + + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) { + return { + response: null, + sources: [], + message: "Invalid query - no documents found for workspace!", + }; + } + + // LanceDB does not have langchainJS support so we roll our own here. + const queryVector = await this.embedChunk(this.openai(), input); + const collection = await client.openTable(namespace); + const relevantResults = await collection + .search(queryVector) + .metric("cosine") + .limit(2) + .execute(); + const messages = [ + { + role: "system", + content: `The following is a friendly conversation between a human and an AI. The AI is very casual and talkative and responds with a friendly tone. If the AI does not know the answer to a question, it truthfully says it does not know. + Relevant pieces of information for context of the current query: + ${relevantResults.map((result) => result.text).join("\n\n")}`, + }, + { role: "user", content: input }, + ]; + const responseText = await this.getChatCompletion(this.openai(), messages); + return { + response: responseText, + sources: curateSources(relevantResults), + message: "tmp", + }; + }, + "namespace-stats": async function (reqBody = {}) { + const { namespace = null } = reqBody; + if (!namespace) throw new Error("namespace required"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + const stats = await this.namespace(client, namespace); + return stats + ? stats + : { message: "No stats were able to be fetched from DB for namespace" }; + }, + "delete-namespace": async function (reqBody = {}) { + const { namespace = null } = reqBody; + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + + await this.deleteVectorsInNamespace(client, namespace); + return { + message: `Namespace ${namespace} was deleted.`, + }; + }, + reset: async function () { + const { client } = await this.connect(); + const fs = require("fs"); + fs.rm(`${client.uri}`, { recursive: true }, () => null); + return { reset: true }; + }, }; module.exports = { From 82730ac75b1956393cb4ecd7db003f5b471c04e5 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 18:18:54 -0700 Subject: [PATCH 3/6] add null method for deletion of documents from namespace since LanceDB does not support show warning modal on frontend for this --- .../src/components/Modals/ManageWorkspace.jsx | 61 +++++++++++++++++++ server/utils/lancedb/index.js | 6 ++ 2 files changed, 67 insertions(+) diff --git a/frontend/src/components/Modals/ManageWorkspace.jsx b/frontend/src/components/Modals/ManageWorkspace.jsx index 67dfcf1a824..4d64bb41726 100644 --- a/frontend/src/components/Modals/ManageWorkspace.jsx +++ b/frontend/src/components/Modals/ManageWorkspace.jsx @@ -14,6 +14,7 @@ import { nFormatter } from "../../utils/numbers"; import { dollarFormat } from "../../utils/numbers"; import paths from "../../utils/paths"; import { useParams } from "react-router-dom"; +import { titleCase } from "text-case"; const noop = () => false; export default function ManageWorkspace({ hideModal = noop, workspace }) { @@ -24,15 +25,19 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) { const [directories, setDirectories] = useState(null); const [originalDocuments, setOriginalDocuments] = useState([]); const [selectedFiles, setSelectFiles] = useState([]); + const [vectordb, setVectorDB] = useState(null); + const [showingNoRemovalModal, setShowingNoRemovalModal] = useState(false); useEffect(() => { async function fetchKeys() { const _workspace = await Workspace.bySlug(workspace.slug); const localFiles = await System.localFiles(); + const settings = await System.keys(); const originalDocs = _workspace.documents.map((doc) => doc.docpath) || []; setDirectories(localFiles); setOriginalDocuments([...originalDocs]); setSelectFiles([...originalDocs]); + setVectorDB(settings?.VectorDB); setLoading(false); } fetchKeys(); @@ -97,11 +102,25 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) { : selectedFiles.some((doc) => doc.includes(filepath)); }; + const isOriginalDoc = (filepath) => { + const isFolder = !filepath.includes("/"); + return isFolder + ? originalDocuments.some((doc) => doc.includes(filepath.split("/")[0])) + : originalDocuments.some((doc) => doc.includes(filepath)); + }; + const toggleSelection = (filepath) => { const isFolder = !filepath.includes("/"); const parent = isFolder ? filepath : filepath.split("/")[0]; if (isSelected(filepath)) { + // Certain vector DBs do not contain the ability to delete vectors + // so we cannot remove from these. The user will have to clear the entire workspace. + if (["lancedb"].includes(vectordb) && isOriginalDoc(filepath)) { + setShowingNoRemovalModal(true); + return false; + } + const updatedDocs = isFolder ? selectedFiles.filter((doc) => !doc.includes(parent)) : selectedFiles.filter((doc) => !doc.includes(filepath)); @@ -168,6 +187,12 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) { updateWorkspace={updateWorkspace} /> )} + {showingNoRemovalModal && ( + setShowingNoRemovalModal(false)} + vectordb={vectordb} + /> + )}
+
+
+

+ You cannot remove this document! +

+ +
+

+ {titleCase(vectordb)} does not support atomic removal of + documents. +
+ Unfortunately, you will have to delete the entire workspace to + remove this document from being referenced. +

+
+
+ +
+
+
+ + ); +} + export function useManageWorkspaceModal() { const [showing, setShowing] = useState(false); const showModal = () => { diff --git a/server/utils/lancedb/index.js b/server/utils/lancedb/index.js index e157be9be07..0f8572cd416 100644 --- a/server/utils/lancedb/index.js +++ b/server/utils/lancedb/index.js @@ -101,6 +101,12 @@ const LanceDb = { fs.rm(`${client.uri}/${namespace}.lance`, { recursive: true }, () => null); return true; }, + deleteDocumentFromNamespace: async function (_namespace, _docId) { + console.error( + `LanceDB:deleteDocumentFromNamespace - unsupported operation. No changes made to vector db.` + ); + return false; + }, addDocumentToNamespace: async function ( namespace, documentData = {}, From c6eb3add9e01f450435bded51ada1ebc14af1b47 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 18:35:07 -0700 Subject: [PATCH 4/6] update .env.example and lancedb methods for sourcing --- server/.env.example | 3 +++ server/utils/lancedb/index.js | 24 ++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/server/.env.example b/server/.env.example index 0383ed25f97..541134f0ccc 100644 --- a/server/.env.example +++ b/server/.env.example @@ -13,6 +13,9 @@ PINECONE_ENVIRONMENT= PINECONE_API_KEY= PINECONE_INDEX= +# Enable all below if you are using vector database: LanceDB. +# VECTOR_DB="lancedb" + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # STORAGE_DIR= # absolute filesystem path with no trailing slash \ No newline at end of file diff --git a/server/utils/lancedb/index.js b/server/utils/lancedb/index.js index 0f8572cd416..af010158ab3 100644 --- a/server/utils/lancedb/index.js +++ b/server/utils/lancedb/index.js @@ -8,7 +8,22 @@ const { v4: uuidv4 } = require("uuid"); // Since we roll our own results for prompting we // have to manually curate sources as well. -function curateSources(results = []) {} +function curateLanceSources(sources = []) { + const knownDocs = []; + const documents = []; + for (const source of sources) { + const { text: _t, vector: _v, score: _s, ...metadata } = source; + if ( + Object.keys(metadata).length > 0 && + !knownDocs.includes(metadata.title) + ) { + documents.push({ ...metadata }); + knownDocs.push(metadata.title); + } + } + + return documents; +} const LanceDb = { uri: `${ @@ -218,7 +233,7 @@ const LanceDb = { const collection = await client.openTable(namespace); const relevantResults = await collection .search(queryVector) - .metric("cosine") + .metricType("cosine") .limit(2) .execute(); const messages = [ @@ -231,10 +246,11 @@ const LanceDb = { { role: "user", content: input }, ]; const responseText = await this.getChatCompletion(this.openai(), messages); + return { response: responseText, - sources: curateSources(relevantResults), - message: "tmp", + sources: curateLanceSources(relevantResults), + message: false, }; }, "namespace-stats": async function (reqBody = {}) { From c9ba60264ddf0c5dd8fdba7560b31f98e256d7e7 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 18:36:27 -0700 Subject: [PATCH 5/6] change export method --- server/utils/lancedb/index.js | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/server/utils/lancedb/index.js b/server/utils/lancedb/index.js index af010158ab3..f2851d3292a 100644 --- a/server/utils/lancedb/index.js +++ b/server/utils/lancedb/index.js @@ -26,9 +26,8 @@ function curateLanceSources(sources = []) { } const LanceDb = { - uri: `${ - !!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./" - }lancedb`, + uri: `${!!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./" + }lancedb`, name: "LanceDb", connect: async function () { if (process.env.VECTOR_DB !== "lancedb") @@ -283,6 +282,4 @@ const LanceDb = { }, }; -module.exports = { - LanceDb, -}; +module.exports.LanceDb = LanceDb From 22fd1c5aac10a4fe0e6c8ff79fb1091f1a3830bc Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Thu, 8 Jun 2023 18:40:07 -0700 Subject: [PATCH 6/6] update readme --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4aebf2635b9..42f2b033817 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/tim.svg?style=social&label=Follow%20%40Timothy%20Carambat)](https://twitter.com/tcarambat) [![](https://dcbadge.vercel.app/api/server/6UyHPeGZAC?compact=true&style=flat)](https://discord.gg/6UyHPeGZAC) -A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports Pinecone & ChromaDB for vector storage and OpenAI for chatting. +A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports [Pinecone](https://pinecone.io), [ChromaDB](https://trychroma.com) & more for vector storage and [OpenAI](https://openai.com) for LLM/chatting. + ![Chatting](/images/screenshots/chat.png) [view more screenshots](/images/screenshots/SCREENSHOTS.md) @@ -38,8 +39,8 @@ This monorepo consists of three main sections: - `yarn` and `node` on your machine - `python` 3.8+ for running scripts in `collector/`. - access to an LLM like `GPT-3.5`, `GPT-4`*. -- a [Pinecone.io](https://pinecone.io) free account* **or** Local Chroma instance running. -*you can use drop in replacements for these. This is just the easiest to get up and running fast. +- a [Pinecone.io](https://pinecone.io) free account*. +*you can use drop in replacements for these. This is just the easiest to get up and running fast. We support multiple vector database providers. ### How to get started - `yarn setup` from the project root directory.