diff --git a/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js
new file mode 100644
index 00000000000..33d6266ac49
--- /dev/null
+++ b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js
@@ -0,0 +1,86 @@
+const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
+
+// Unit tests for PGVector.sanitizeForJsonb: it removes NUL and other C0
+// control characters (except tab, LF and CR) from strings, recursing into
+// arrays and objects, and returns sanitized copies without mutating inputs.
+describe("PGVector.sanitizeForJsonb", () => {
+  it("returns null/undefined as-is", () => {
+    expect(PGVector.sanitizeForJsonb(null)).toBeNull();
+    expect(PGVector.sanitizeForJsonb(undefined)).toBeUndefined();
+  });
+
+  it("keeps safe whitespace (tab, LF, CR) and removes disallowed C0 controls", () => {
+    const input = "a\u0000\u0001\u0002\tline\ncarriage\rreturn\u001Fend";
+    const result = PGVector.sanitizeForJsonb(input);
+    // Expect all < 0x20 except 9,10,13 removed; keep letters and allowed whitespace
+    expect(result).toBe("a\tline\ncarriage\rreturnend");
+  });
+
+  it("removes only disallowed control chars; keeps normal printable chars", () => {
+    const input = "Hello\u0000, World! \u0007\u0008\u000B\u000C\u001F";
+    const result = PGVector.sanitizeForJsonb(input);
+    // Only the trailing space survives after "World!"; every control char is gone
+    expect(result).toBe("Hello, World! ");
+  });
+
+  it("keeps non-ASCII text and astral characters intact", () => {
+    const input = "caf\u00E9 \u4F60\u597D \u{1F600}";
+    expect(PGVector.sanitizeForJsonb(input)).toBe(input);
+  });
+
+  it("deeply sanitizes objects", () => {
+    const input = {
+      plain: "ok",
+      bad: "has\u0000nul",
+      nested: {
+        arr: ["fine", "bad\u0001", { deep: "\u0002oops" }],
+      },
+    };
+    const result = PGVector.sanitizeForJsonb(input);
+    expect(result).toEqual({
+      plain: "ok",
+      bad: "hasnul",
+      nested: { arr: ["fine", "bad", { deep: "oops" }] },
+    });
+  });
+
+  it("deeply sanitizes arrays", () => {
+    const input = ["\u0000", 1, true, { s: "bad\u0003" }, ["ok", "\u0004bad"]];
+    const result = PGVector.sanitizeForJsonb(input);
+    expect(result).toEqual(["", 1, true, { s: "bad" }, ["ok", "bad"]]);
+  });
+
+  it("converts Date to ISO string", () => {
+    const d = new Date("2020-01-02T03:04:05.000Z");
+    expect(PGVector.sanitizeForJsonb(d)).toBe(d.toISOString());
+  });
+
+  it("returns primitives unchanged (number, boolean, bigint)", () => {
+    expect(PGVector.sanitizeForJsonb(42)).toBe(42);
+    expect(PGVector.sanitizeForJsonb(3.14)).toBe(3.14);
+    expect(PGVector.sanitizeForJsonb(true)).toBe(true);
+    expect(PGVector.sanitizeForJsonb(false)).toBe(false);
+    expect(PGVector.sanitizeForJsonb(BigInt(1))).toBe(BigInt(1));
+  });
+
+  it("returns symbol unchanged", () => {
+    const sym = Symbol("x");
+    expect(PGVector.sanitizeForJsonb(sym)).toBe(sym);
+  });
+
+  it("does not mutate original objects/arrays", () => {
+    const obj = { a: "bad\u0000", nested: { b: "ok" } };
+    const arr = ["\u0001", { c: "bad\u0002" }];
+    // Snapshot the inputs so any in-place mutation would be detected below
+    const objCopy = JSON.parse(JSON.stringify(obj));
+    const arrCopy = JSON.parse(JSON.stringify(arr));
+    const resultObj = PGVector.sanitizeForJsonb(obj);
+    const resultArr = PGVector.sanitizeForJsonb(arr);
+    // Original inputs remain unchanged
+    expect(obj).toEqual(objCopy);
+    expect(arr).toEqual(arrCopy);
+    // Results are sanitized copies
+    expect(resultObj).toEqual({ a: "bad", nested: { b: "ok" } });
+    expect(resultArr).toEqual(["", { c: "bad" }]);
+  });
+});
diff --git a/server/utils/vectorDbProviders/pgvector/index.js b/server/utils/vectorDbProviders/pgvector/index.js
index d5c86907566..990498eb5cc 100644
--- a/server/utils/vectorDbProviders/pgvector/index.js
+++ b/server/utils/vectorDbProviders/pgvector/index.js
@@ -52,6 +52,79 @@ const PGVector = {
     console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args);
   },
 
+  /**
+   * Recursively sanitize a value so it can be stored in a Postgres JSONB column
+   * without errors such as "unsupported Unicode escape sequence".
+   *
+   * - Strings (including object keys): removes NUL (\u0000) and every other C0
+   *   control character except tab, line feed and carriage return, and drops
+   *   unpaired UTF-16 surrogates, which JSONB also rejects.
+   * - Arrays and plain objects: returns a sanitized deep copy; the input is
+   *   never mutated.
+   * - Dates: converted to an ISO-8601 string; an invalid Date becomes null,
+   *   matching what JSON.stringify would produce.
+   * - Everything else (numbers, booleans, null, undefined, ...) is returned as-is.
+   * @param {any} value - The value to sanitize.
+   * @returns {any} The sanitized value (a new array/object for containers).
+   */
+  sanitizeForJsonb: function (value) {
+    // Nothing to sanitize for null/undefined
+    if (value === null || value === undefined) return value;
+
+    // Strings: strip unsafe C0 control characters and unpaired surrogates
+    if (typeof value === "string") {
+      let sanitized = "";
+      for (let i = 0; i < value.length; i++) {
+        const code = value.charCodeAt(i);
+
+        // High surrogate: only valid when immediately followed by a low one
+        if (code >= 0xd800 && code <= 0xdbff) {
+          const next = value.charCodeAt(i + 1); // NaN past the end of the string
+          if (next >= 0xdc00 && next <= 0xdfff) {
+            sanitized += value[i] + value[i + 1];
+            i++; // the low surrogate has been consumed as part of the pair
+          }
+          continue;
+        }
+
+        // Low surrogate without a preceding high surrogate: drop it
+        if (code >= 0xdc00 && code <= 0xdfff) continue;
+
+        // Keep tab (9), line feed (10), carriage return (13) and anything that
+        // is not a C0 control character
+        if (code === 9 || code === 10 || code === 13 || code >= 0x20) {
+          sanitized += value[i];
+        }
+      }
+      return sanitized;
+    }
+
+    // Arrays: sanitize each element into a new array
+    if (Array.isArray(value)) {
+      return value.map((item) => this.sanitizeForJsonb(item));
+    }
+
+    // Dates: convert to an ISO string. toISOString() throws a RangeError on an
+    // invalid Date, so fall back to null like JSON.stringify does.
+    if (value instanceof Date) {
+      return Number.isNaN(value.getTime()) ? null : value.toISOString();
+    }
+
+    // Objects: sanitize keys as well as values, since a NUL inside a key is
+    // rejected by JSONB just like one inside a value
+    if (typeof value === "object") {
+      return Object.fromEntries(
+        Object.entries(value).map(([key, val]) => [
+          this.sanitizeForJsonb(key),
+          this.sanitizeForJsonb(val),
+        ])
+      );
+    }
+
+    // Numbers, booleans, bigint, symbols, functions: returned unchanged
+    return value;
+  },
+
   client: function (connectionString = null) {
     return new pgsql.Client({
       connectionString: connectionString || PGVector.connectionString(),
@@ -362,9 +435,11 @@ const PGVector = {
 
   /**
    * Update or create a collection in the database
-   * @param {pgsql.Connection} connection
-   * @param {{id: number, vector: number[], metadata: Object}[]} submissions
-   * @param {string} namespace
+   * @param {Object} params
+   * @param {pgsql.Connection} params.connection
+   * @param {{id: number, vector: number[], metadata: Object}[]} params.submissions
+   * @param {string} params.namespace
+   * @param {number} params.dimensions
    * @returns {Promise}
    */
   updateOrCreateCollection: async function ({
@@ -381,9 +456,10 @@ const PGVector = {
       await connection.query(`BEGIN`);
       for (const submission of submissions) {
         const embedding = `[${submission.vector.map(Number).join(",")}]`; // stringify the vector for pgvector
+        const sanitizedMetadata = this.sanitizeForJsonb(submission.metadata);
         await connection.query(
           `INSERT INTO "${PGVector.tableName()}" (id, namespace, embedding, metadata) VALUES ($1, $2, $3, $4)`,
-          [submission.id, namespace, embedding, submission.metadata]
+          [submission.id, namespace, embedding, sanitizedMetadata]
         );
       }
       this.log(`Committing ${submissions.length} vectors to ${namespace}`);