From 4101afc2c490fab03c6b2a9303b92f10d8772d83 Mon Sep 17 00:00:00 2001 From: angelplusultra Date: Wed, 24 Sep 2025 16:02:10 -0700 Subject: [PATCH 1/3] Fix JSDOC for updateOrCreateCollection --- server/utils/vectorDbProviders/pgvector/index.js | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/server/utils/vectorDbProviders/pgvector/index.js b/server/utils/vectorDbProviders/pgvector/index.js index 057f1b799d6..b209f9eec44 100644 --- a/server/utils/vectorDbProviders/pgvector/index.js +++ b/server/utils/vectorDbProviders/pgvector/index.js @@ -361,9 +361,11 @@ const PGVector = { /** * Update or create a collection in the database - * @param {pgsql.Connection} connection - * @param {{id: number, vector: number[], metadata: Object}[]} submissions - * @param {string} namespace + * @param {Object} params + * @param {pgsql.Connection} params.connection + * @param {{id: number, vector: number[], metadata: Object}[]} params.submissions + * @param {string} params.namespace + * @param {number} params.dimensions * @returns {Promise} */ updateOrCreateCollection: async function ({ From 47c8679ca0874deea4de8be11aabeb3470e584e1 Mon Sep 17 00:00:00 2001 From: angelplusultra Date: Wed, 24 Sep 2025 16:02:30 -0700 Subject: [PATCH 2/3] Add sanitizeForJsonb method to PGVector for safe JSONB handling This new method recursively sanitizes values intended for JSONB storage, removing disallowed control characters and ensuring safe insertion into PostgreSQL. The method is integrated into the vector insertion process to sanitize metadata before database operations. --- .../utils/vectorDbProviders/pgvector/index.js | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/server/utils/vectorDbProviders/pgvector/index.js b/server/utils/vectorDbProviders/pgvector/index.js index b209f9eec44..b95ecf9c0d7 100644 --- a/server/utils/vectorDbProviders/pgvector/index.js +++ b/server/utils/vectorDbProviders/pgvector/index.js @@ -51,6 +51,55 @@ const PGVector = { console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args); }, + /** + * Recursively sanitize values intended for JSONB to prevent Postgres errors + * like "unsupported Unicode escape sequence". This primarily removes the + * NUL character (\u0000) and other disallowed control characters from + * strings. Arrays and objects are traversed and sanitized deeply. + * @param {any} value + * @returns {any} + */ + sanitizeForJsonb: function (value) { + // Fast path for null/undefined and primitives that do not need changes + if (value === null || value === undefined) return value; + + // Strings: strip NUL and unsafe C0 control characters except common whitespace + if (typeof value === "string") { + // Build a sanitized string by excluding C0 control characters except + // horizontal tab (9), line feed (10), and carriage return (13). + let sanitized = ""; + for (let i = 0; i < value.length; i++) { + const code = value.charCodeAt(i); + if (code === 9 || code === 10 || code === 13 || code >= 0x20) { + sanitized += value[i]; + } + } + return sanitized; + } + + // Arrays: sanitize each element + if (Array.isArray(value)) { + return value.map((item) => this.sanitizeForJsonb(item)); + } + + // Dates: keep as ISO string + if (value instanceof Date) { + return value.toISOString(); + } + + // Objects: sanitize each property value + if (typeof value === "object") { + const result = {}; + for (const [k, v] of Object.entries(value)) { + result[k] = this.sanitizeForJsonb(v); + } + return result; + } + + // Numbers, booleans, etc. + return value; + }, + client: function (connectionString = null) { return new pgsql.Client({ connectionString: connectionString || PGVector.connectionString(), @@ -382,9 +431,10 @@ const PGVector = { await connection.query(`BEGIN`); for (const submission of submissions) { const embedding = `[${submission.vector.map(Number).join(",")}]`; // stringify the vector for pgvector + const sanitizedMetadata = this.sanitizeForJsonb(submission.metadata); await connection.query( `INSERT INTO "${PGVector.tableName()}" (id, namespace, embedding, metadata) VALUES ($1, $2, $3, $4)`, - [submission.id, namespace, embedding, submission.metadata] + [submission.id, namespace, embedding, sanitizedMetadata] ); } this.log(`Committing ${submissions.length} vectors to ${namespace}`); From 0eb23b0b2c2077dd4e966c8698258dea87385f41 Mon Sep 17 00:00:00 2001 From: angelplusultra Date: Fri, 26 Sep 2025 11:47:52 -0700 Subject: [PATCH 3/3] Add unit tests for PGVector.sanitizeForJsonb method This commit introduces a comprehensive test suite for the PGVector.sanitizeForJsonb method, ensuring it correctly handles various input types, including null, undefined, strings with disallowed control characters, objects, arrays, and Date objects. The tests verify that the method sanitizes inputs without mutating the original data structures. --- .../vectorDbProviders/pgvector/index.test.js | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 server/__tests__/utils/vectorDbProviders/pgvector/index.test.js diff --git a/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js new file mode 100644 index 00000000000..33d6266ac49 --- /dev/null +++ b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js @@ -0,0 +1,76 @@ +const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector"); + +describe("PGVector.sanitizeForJsonb", () => { + it("returns null/undefined as-is", () => { + expect(PGVector.sanitizeForJsonb(null)).toBeNull(); + expect(PGVector.sanitizeForJsonb(undefined)).toBeUndefined(); + }); + + it("keeps safe whitespace (tab, LF, CR) and removes disallowed C0 controls", () => { + const input = "a\u0000\u0001\u0002\tline\ncarriage\rreturn\u001Fend"; + const result = PGVector.sanitizeForJsonb(input); + // Expect all < 0x20 except 9,10,13 removed; keep letters and allowed whitespace + expect(result).toBe("a\tline\ncarriage\rreturnend"); + }); + + it("removes only disallowed control chars; keeps normal printable chars", () => { + const input = "Hello\u0000, World! \u0007\u0008\u000B\u000C\u001F"; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toBe("Hello, World! "); + }); + + it("deeply sanitizes objects", () => { + const input = { + plain: "ok", + bad: "has\u0000nul", + nested: { + arr: ["fine", "bad\u0001", { deep: "\u0002oops" }], + }, + }; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toEqual({ + plain: "ok", + bad: "hasnul", + nested: { arr: ["fine", "bad", { deep: "oops" }] }, + }); + }); + + it("deeply sanitizes arrays", () => { + const input = ["\u0000", 1, true, { s: "bad\u0003" }, ["ok", "\u0004bad"]]; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toEqual(["", 1, true, { s: "bad" }, ["ok", "bad"]]); + }); + + it("converts Date to ISO string", () => { + const d = new Date("2020-01-02T03:04:05.000Z"); + expect(PGVector.sanitizeForJsonb(d)).toBe(d.toISOString()); + }); + + it("returns primitives unchanged (number, boolean, bigint)", () => { + expect(PGVector.sanitizeForJsonb(42)).toBe(42); + expect(PGVector.sanitizeForJsonb(3.14)).toBe(3.14); + expect(PGVector.sanitizeForJsonb(true)).toBe(true); + expect(PGVector.sanitizeForJsonb(false)).toBe(false); + expect(PGVector.sanitizeForJsonb(BigInt(1))).toBe(BigInt(1)); + }); + + it("returns symbol unchanged", () => { + const sym = Symbol("x"); + expect(PGVector.sanitizeForJsonb(sym)).toBe(sym); + }); + + it("does not mutate original objects/arrays", () => { + const obj = { a: "bad\u0000", nested: { b: "ok" } }; + const arr = ["\u0001", { c: "bad\u0002" }]; + const objCopy = JSON.parse(JSON.stringify(obj)); + const arrCopy = JSON.parse(JSON.stringify(arr)); + const resultObj = PGVector.sanitizeForJsonb(obj); + const resultArr = PGVector.sanitizeForJsonb(arr); + // Original inputs remain unchanged + expect(obj).toEqual(objCopy); + expect(arr).toEqual(arrCopy); + // Results are sanitized copies + expect(resultObj).toEqual({ a: "bad", nested: { b: "ok" } }); + expect(resultArr).toEqual(["", { c: "bad" }]); + }); +});