θΏ™ζ˜―indexlocζδΎ›ηš„ζœεŠ‘οΌŒδΈθ¦θΎ“ε…₯任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 85 additions & 4 deletions server/utils/TextSplitter/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
/**
* @typedef {object} DocumentMetadata
* @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"
* @property {string} url - eg; "file://example.com/index.html"
* @property {string} title - eg; "example.com/index.html"
* @property {string} docAuthor - eg; "no author found"
* @property {string} description - eg; "No description found."
* @property {string} docSource - eg; "URL link uploaded by the user."
* @property {string} chunkSource - eg; "link://https://example.com"
* @property {string} published - ISO 8601 date string
* @property {number} wordCount - Number of words in the document
* @property {string} pageContent - The raw text content of the document
* @property {number} token_count_estimate - Number of tokens in the document
*/

function isNullOrNaN(value) {
if (value === null) return true;
return isNaN(value);
Expand Down Expand Up @@ -29,10 +44,12 @@ class TextSplitter {
console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
}

// Does a quick check to determine the text chunk length limit.
// Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
// so here we want to allow override of the default 1000, but up to the models maximum, which is
// sometimes user defined.
/**
* Does a quick check to determine the text chunk length limit.
* Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
* so here we want to allow override of the default 1000, but up to the models maximum, which is
* sometimes user defined.
*/
static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
const prefValue = isNullOrNaN(preferred)
? Number(embedderLimit)
Expand All @@ -45,6 +62,70 @@ class TextSplitter {
return prefValue > limit ? limit : prefValue;
}

/**
* Creates a string of metadata to be prepended to each chunk.
* @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.
* @returns {{[key: ('title' | 'published' | 'source')]: string}} Object of metadata that will be prepended to each chunk.
*/
static buildHeaderMeta(metadata = {}) {
if (!metadata || Object.keys(metadata).length === 0) return null;
const PLUCK_MAP = {
title: {
as: "sourceDocument",
pluck: (metadata) => {
return metadata?.title || null;
},
},
published: {
as: "published",
pluck: (metadata) => {
return metadata?.published || null;
},
},
chunkSource: {
as: "source",
pluck: (metadata) => {
const validPrefixes = ["link://", "youtube://"];
// If the chunkSource is a link or youtube link, we can add the URL
// as its source in the metadata so the LLM can use it for context.
// eg prompt: Where did you get this information? -> answer: "from https://example.com"
if (
!metadata?.chunkSource || // Exists
!metadata?.chunkSource.length || // Is not empty
typeof metadata.chunkSource !== "string" || // Is a string
!validPrefixes.some(
(prefix) => metadata.chunkSource.startsWith(prefix) // Has a valid prefix we respect
)
)
return null;

// We know a prefix is present, so we can split on it and return the rest.
// If nothing is found, return null and it will not be added to the metadata.
let source = null;
for (const prefix of validPrefixes) {
source = metadata.chunkSource.split(prefix)?.[1] || null;
if (source) break;
}

return source;
},
},
};

const pluckedData = {};
Object.entries(PLUCK_MAP).forEach(([key, value]) => {
if (!(key in metadata)) return; // Skip if the metadata key is not present.
const pluckedValue = value.pluck(metadata);
if (!pluckedValue) return; // Skip if the plucked value is null/empty.
pluckedData[value.as] = pluckedValue;
});

return pluckedData;
}

/**
* Creates a string of metadata to be prepended to each chunk.
*/
stringifyHeader() {
if (!this.config.chunkHeaderMeta) return null;
let content = "";
Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/astra/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,7 @@ const AstraDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/chroma/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,7 @@ const Chroma = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/lance/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,7 @@ const LanceDb = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/milvus/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,7 @@ const Milvus = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/pinecone/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,7 @@ const PineconeDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/qdrant/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,7 @@ const QDrant = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/weaviate/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,7 @@ const Weaviate = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/zilliz/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,7 @@ const Zilliz = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

Expand Down