这是indexloc提供的服务,不要输入任何密码
Skip to content

fix: sanitizeNamespace #3246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 75 additions & 25 deletions server/utils/vectorDbProviders/astra/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,27 @@ const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");

/**
 * Build the collection name used for a workspace namespace.
 *
 * Every character outside `[a-zA-Z0-9_]` is replaced with an underscore and
 * the result is guaranteed to start with the `ns_` prefix (so it always begins
 * with a letter). The function is idempotent: passing an already-sanitized
 * name returns it unchanged, which matters because several call paths
 * sanitize the same value more than once.
 *
 * @param {string} namespace - Raw namespace (or an already-sanitized name).
 * @returns {string} Name containing only letters, digits and underscores,
 *   prefixed with `ns_`.
 */
const sanitizeNamespace = (namespace) => {
  // Always scrub invalid characters. Previously an input that already began
  // with `ns_` was returned untouched, so e.g. "ns_my-space" leaked a hyphen
  // into the collection name.
  const cleaned = namespace.replace(/[^a-zA-Z0-9_]/g, "_");

  // Decide on the prefix from the ORIGINAL input so that names such as
  // "ns-foo" keep mapping to "ns_ns_foo" exactly as before.
  return namespace.startsWith("ns_") ? cleaned : `ns_${cleaned}`;
};

/**
 * Check whether a collection name appears in the list returned by
 * `AstraDB.allNamespaces(client)`.
 *
 * The name is compared as given; no sanitization happens here, so callers
 * should pass the already-sanitized collection name.
 *
 * @param {object} client - Client handle forwarded to `AstraDB.allNamespaces`.
 * @param {string} namespace - Collection name to look for.
 * @returns {Promise<boolean>} `true` when the name is listed; `false` when it
 *   is not listed, when no list is returned, or when the lookup throws.
 */
const collectionExists = async function (client, namespace) {
  try {
    const collections = await AstraDB.allNamespaces(client);
    // Always resolve to a boolean: a missing or non-array listing used to fall
    // through and yield `undefined`.
    return Array.isArray(collections) && collections.includes(namespace);
  } catch (error) {
    console.log("Astra::collectionExists check error", error?.message || error);
    return false; // Treat any lookup failure as "not found" so creation can be attempted
  }
};

const AstraDB = {
name: "AstraDB",
connect: async function () {
Expand Down Expand Up @@ -49,7 +70,10 @@ const AstraDB = {
},
namespace: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.collection(namespace).catch(() => null);
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client
.collection(sanitizedNamespace)
.catch(() => null);
if (!(await this.isRealCollection(collection))) return null;

const count = await collection.countDocuments().catch((e) => {
Expand All @@ -70,32 +94,50 @@ const AstraDB = {
},
namespaceExists: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.collection(namespace);
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client.collection(sanitizedNamespace);
return await this.isRealCollection(collection);
},
deleteVectorsInNamespace: async function (client, namespace = null) {
await client.dropCollection(namespace);
const sanitizedNamespace = sanitizeNamespace(namespace);
await client.dropCollection(sanitizedNamespace);
return true;
},
// AstraDB requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
const isExists = await this.namespaceExists(client, namespace);
if (!isExists) {
if (!dimensions)
throw new Error(
`AstraDB:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
);
const sanitizedNamespace = sanitizeNamespace(namespace);
try {
const exists = await collectionExists(client, sanitizedNamespace);

await client.createCollection(namespace, {
vector: {
dimension: dimensions,
metric: "cosine",
},
});
if (!exists) {
if (!dimensions) {
throw new Error(
`AstraDB:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on Github for support.`
);
}

// Create new collection
await client.createCollection(sanitizedNamespace, {
vector: {
dimension: dimensions,
metric: "cosine",
},
});

// Get the newly created collection
return await client.collection(sanitizedNamespace);
}

return await client.collection(sanitizedNamespace);
} catch (error) {
console.error(
"Astra::getOrCreateCollection error",
error?.message || error
);
throw error;
}
return await client.collection(namespace);
},
addDocumentToNamespace: async function (
namespace,
Expand Down Expand Up @@ -150,11 +192,14 @@ const AstraDB = {

const EmbedderEngine = getEmbeddingEngineSelection();
const textSplitter = new TextSplitter({
chunkSize: TextSplitter.determineMaxChunkSize(
await SystemSettings.getValueOrFallback({
label: "text_splitter_chunk_size",
}),
EmbedderEngine?.embeddingMaxChunkLength
chunkSize: Math.min(
7500,
TextSplitter.determineMaxChunkSize(
await SystemSettings.getValueOrFallback({
label: "text_splitter_chunk_size",
}),
EmbedderEngine?.embeddingMaxChunkLength
)
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
Expand Down Expand Up @@ -227,6 +272,7 @@ const AstraDB = {
deleteDocumentFromNamespace: async function (namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
namespace = sanitizeNamespace(namespace);
if (!(await this.namespaceExists(client, namespace)))
throw new Error(
"Invalid namespace - has it been collected and populated yet?"
Expand Down Expand Up @@ -259,7 +305,10 @@ const AstraDB = {
throw new Error("Invalid request to performSimilaritySearch.");

const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
// Sanitize namespace before checking existence
const sanitizedNamespace = sanitizeNamespace(namespace);

if (!(await this.namespaceExists(client, sanitizedNamespace))) {
return {
contextTexts: [],
sources: [],
Expand All @@ -271,7 +320,7 @@ const AstraDB = {
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse({
client,
namespace,
namespace: sanitizedNamespace,
queryVector,
similarityThreshold,
topN,
Expand Down Expand Up @@ -300,8 +349,9 @@ const AstraDB = {
sourceDocuments: [],
scores: [],
};

const collection = await client.collection(namespace);
// Namespace should already be sanitized, but let's be defensive
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client.collection(sanitizedNamespace);
const responses = await collection
.find(
{},
Expand Down