From 474d4252b28bbfa4a7216ed769de61098eec1c6a Mon Sep 17 00:00:00 2001 From: shatfield4 Date: Tue, 2 Sep 2025 15:52:15 -0700 Subject: [PATCH 1/4] export image support for json and jsonl --- server/utils/helpers/chat/convertTo.js | 55 +++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/server/utils/helpers/chat/convertTo.js b/server/utils/helpers/chat/convertTo.js index 5f3f752ba5e..c9eeedf6682 100644 --- a/server/utils/helpers/chat/convertTo.js +++ b/server/utils/helpers/chat/convertTo.js @@ -72,6 +72,13 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { sent_at: chat.createdAt, }; + // Only add images for JSON format since we cannot + // make an array of images in csv + if (format === "json") { + const attachments = responseJson.attachments || []; + baseData.images = attachments.map(attachmentToDataUrl); + } + if (chatType === "embed") { return { ...baseData, @@ -120,28 +127,55 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { const workspaceChatsMap = chats.reduce((acc, chat) => { const { prompt, response, workspaceId } = chat; const responseJson = JSON.parse(response); + const attachments = responseJson.attachments || []; if (!acc[workspaceId]) { acc[workspaceId] = { messages: [ { role: "system", - content: - chat.workspace?.openAiPrompt || - "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.", + content: [ + { + type: "text", + text: + chat.workspace?.openAiPrompt || + "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.", + }, + ], }, ], }; } + const userContent = [ + { + type: "text", + text: prompt, + }, + ]; + + if (attachments.length > 0) { + attachments.forEach((attachment) => { + userContent.push({ + type: "image", + image: attachmentToDataUrl(attachment), + }); + }); + } + acc[workspaceId].messages.push( { role: "user", - content: prompt, + content: userContent, }, { role: "assistant", - content: responseJson.text, + content: [ + { + type: "text", + text: responseJson.text, + }, + ], } ); @@ -203,6 +237,17 @@ function buildSystemPrompt(chat, prompt = null) { return `${prompt ?? STANDARD_PROMPT}${context}`; } +/** + * Converts an attachment's content string to a proper data URL format if needed + * @param {Object} attachment - The attachment object containing contentString and mime type + * @returns {string} The properly formatted data URL + */ +function attachmentToDataUrl(attachment) { + return attachment.contentString.startsWith("data:") + ? attachment.contentString + : `data:${attachment.mime};base64,${attachment.contentString}`; +} + module.exports = { prepareChatsForExport, exportChatsAsType, From fffe2aaf2914fff9d9ce3c19284ea25c6552556e Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 3 Sep 2025 10:23:58 -0700 Subject: [PATCH 2/4] add tests and cleanup functionality --- server/utils/helpers/chat/convertTo.js | 44 ++++++++++++++------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/server/utils/helpers/chat/convertTo.js b/server/utils/helpers/chat/convertTo.js index c9eeedf6682..1593fc81102 100644 --- a/server/utils/helpers/chat/convertTo.js +++ b/server/utils/helpers/chat/convertTo.js @@ -64,21 +64,26 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { if (format === "csv" || format === "json") { const preparedData = chats.map((chat) => { - const responseJson = JSON.parse(chat.response); + const responseJson = safeJsonParse(chat.response, {}); const baseData = { id: chat.id, prompt: chat.prompt, response: responseJson.text, sent_at: chat.createdAt, + // Only add attachments to the json format since we cannot arrange attachments in csv format + ...(format === "json" + ? { + attachments: + responseJson.attachments?.length > 0 + ? responseJson.attachments.map((attachment) => ({ + type: "image", + image: attachmentToDataUrl(attachment), + })) + : [], + } + : {}), }; - // Only add images for JSON format since we cannot - // make an array of images in csv - if (format === "json") { - const attachments = responseJson.attachments || []; - baseData.images = attachments.map(attachmentToDataUrl); - } - if (chatType === "embed") { return { ...baseData, @@ -108,9 +113,10 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { return preparedData; } + // jsonAlpaca format does not support array outputs if (format === "jsonAlpaca") { const preparedData = chats.map((chat) => { - const responseJson = JSON.parse(chat.response); + const responseJson = safeJsonParse(chat.response, {}); return { instruction: buildSystemPrompt( chat, @@ -126,8 +132,8 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { const workspaceChatsMap = chats.reduce((acc, chat) => { const { prompt, response, workspaceId } = chat; - const responseJson = JSON.parse(response); - const attachments = responseJson.attachments || []; + const responseJson = safeJsonParse(response, { attachments: [] }); + const attachments = responseJson.attachments; if (!acc[workspaceId]) { acc[workspaceId] = { @@ -147,22 +153,20 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { }; } + // Build the user content array with the prompt and attachments (if any) const userContent = [ { type: "text", text: prompt, }, + ...(attachments?.length > 0 + ? attachments.map((attachment) => ({ + type: "image", + image: attachmentToDataUrl(attachment), + })) + : []), ]; - if (attachments.length > 0) { - attachments.forEach((attachment) => { - userContent.push({ - type: "image", - image: attachmentToDataUrl(attachment), - }); - }); - } - acc[workspaceId].messages.push( { role: "user", From de44721b7ab7b777333d7ca3bb80c247e4ad7de8 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 3 Sep 2025 10:24:46 -0700 Subject: [PATCH 3/4] add test for convertTo prepare function --- .../__tests__/utils/helpers/convertTo.test.js | 238 ++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 server/__tests__/utils/helpers/convertTo.test.js diff --git a/server/__tests__/utils/helpers/convertTo.test.js b/server/__tests__/utils/helpers/convertTo.test.js new file mode 100644 index 00000000000..2b974d87ed7 --- /dev/null +++ b/server/__tests__/utils/helpers/convertTo.test.js @@ -0,0 +1,238 @@ +/* eslint-env jest */ +const { prepareChatsForExport } = require("../../../utils/helpers/chat/convertTo"); + +// Mock the database models +jest.mock("../../../models/workspaceChats"); +jest.mock("../../../models/embedChats"); + +const { WorkspaceChats } = require("../../../models/workspaceChats"); +const { EmbedChats } = require("../../../models/embedChats"); + +const mockChat = (withImages = false) => { + return { + id: 1, + prompt: "Test prompt", + response: JSON.stringify({ + text: "Test response", + attachments: withImages ? [ + { mime: "image/png", name: "image.png", contentString: "....=" }, + { mime: "image/jpeg", name: "image2.jpeg", contentString: "....=" } + ] : [], + sources: [], + metrics: {}, + }), + createdAt: new Date(), + workspace: { name: "Test Workspace", openAiPrompt: "Test OpenAI Prompt" }, + user: { username: "testuser" }, + feedbackScore: 1, + } +}; + +describe("prepareChatsForExport", () => { + beforeEach(() => { + jest.clearAllMocks(); + WorkspaceChats.whereWithData = jest.fn().mockResolvedValue([]); + EmbedChats.whereWithEmbedAndWorkspace = jest.fn().mockResolvedValue([]); + }); + + test("should throw error for invalid chat type", async () => { + await expect(prepareChatsForExport("json", "invalid")) + .rejects + .toThrow("Invalid chat type: invalid"); + }); + + test("should throw error for invalid export type", async () => { + await expect(prepareChatsForExport("invalid", "workspace")) + .rejects + .toThrow("Invalid export type: invalid"); + }); + + // CSV and JSON are the same format, so we can test them together + test("should return prepared data in csv and json format for workspace chat type", async () => { + const chatExample = mockChat(); + WorkspaceChats.whereWithData.mockResolvedValue([chatExample]); + const result = await prepareChatsForExport("json", "workspace"); + + const responseJson = JSON.parse(chatExample.response); + expect(result).toBeDefined(); + expect(result).toEqual([{ + id: chatExample.id, + prompt: chatExample.prompt, + response: responseJson.text, + sent_at: chatExample.createdAt, + rating: chatExample.feedbackScore ? "GOOD" : "BAD", + username: chatExample.user.username, + workspace: chatExample.workspace.name, + attachments: [], + }]); + }); + + test("Should handle attachments for workspace chat type when json format is selected", async () => { + const chatExample = mockChat(true); + WorkspaceChats.whereWithData.mockResolvedValue([chatExample]); + const result = await prepareChatsForExport("json", "workspace"); + + const responseJson = JSON.parse(chatExample.response); + expect(result).toBeDefined(); + expect(result).toEqual([{ + id: chatExample.id, + prompt: chatExample.prompt, + response: responseJson.text, + sent_at: chatExample.createdAt, + rating: chatExample.feedbackScore ? "GOOD" : "BAD", + username: chatExample.user.username, + workspace: chatExample.workspace.name, + attachments: [ + { + type: "image", + image: responseJson.attachments[0].contentString, + }, + { + type: "image", + image: responseJson.attachments[1].contentString, + }, + ] + }]); + }); + + test("Should ignore attachments for workspace chat type when csv format is selected", async () => { + const chatExample = mockChat(true); + WorkspaceChats.whereWithData.mockResolvedValue([chatExample]); + const result = await prepareChatsForExport("csv", "workspace"); + + const responseJson = JSON.parse(chatExample.response); + expect(result).toBeDefined(); + expect(result.attachments).not.toBeDefined(); + expect(result).toEqual([{ + id: chatExample.id, + prompt: chatExample.prompt, + response: responseJson.text, + sent_at: chatExample.createdAt, + rating: chatExample.feedbackScore ? "GOOD" : "BAD", + username: chatExample.user.username, + workspace: chatExample.workspace.name, + }]); + }); + + test("should return prepared data in jsonAlpaca format for workspace chat type", async () => { + const chatExample = mockChat(); + const imageChatExample = mockChat(true); + WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]); + const result = await prepareChatsForExport("jsonAlpaca", "workspace"); + + const responseJson1 = JSON.parse(chatExample.response); + const responseJson2 = JSON.parse(imageChatExample.response); + expect(result).toBeDefined(); + + // Alpaca format does not support attachments - so they are not included + expect(result[0].attachments).not.toBeDefined(); + expect(result[1].attachments).not.toBeDefined(); + expect(result).toEqual([{ + instruction: chatExample.workspace.openAiPrompt, + input: chatExample.prompt, + output: responseJson1.text, + }, + { + instruction: chatExample.workspace.openAiPrompt, + input: imageChatExample.prompt, + output: responseJson2.text, + }]); + }); + + test("should return prepared data in jsonl format for workspace chat type", async () => { + const chatExample = mockChat(); + const responseJson = JSON.parse(chatExample.response); + WorkspaceChats.whereWithData.mockResolvedValue([chatExample]); + const result = await prepareChatsForExport("jsonl", "workspace"); + expect(result).toBeDefined(); + expect(result).toEqual( + { + [chatExample.workspace.id]: { + messages: [ + { + role: "system", + content: [{ + type: "text", + text: chatExample.workspace.openAiPrompt, + }], + }, + { + role: "user", + content: [{ + type: "text", + text: chatExample.prompt, + }], + }, + { + role: "assistant", + content: [{ + type: "text", + text: responseJson.text, + }], + }, + ], + }, + }, + ); + }); + + test("should return prepared data in jsonl format for workspace chat type with attachments", async () => { + const chatExample = mockChat(); + const imageChatExample = mockChat(true); + const responseJson = JSON.parse(chatExample.response); + const imageResponseJson = JSON.parse(imageChatExample.response); + + WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]); + const result = await prepareChatsForExport("jsonl", "workspace"); + expect(result).toBeDefined(); + expect(result).toEqual( + { + [chatExample.workspace.id]: { + messages: [ + { + role: "system", + content: [{ + type: "text", + text: chatExample.workspace.openAiPrompt, + }], + }, + { + role: "user", + content: [{ + type: "text", + text: chatExample.prompt, + }], + }, + { + role: "assistant", + content: [{ + type: "text", + text: responseJson.text, + }], + }, + { + role: "user", + content: [{ + type: "text", + text: imageChatExample.prompt, + }, { + type: "image", + image: imageResponseJson.attachments[0].contentString, + }, { + type: "image", + image: imageResponseJson.attachments[1].contentString, + }], + }, + { + role: "assistant", + content: [{ + type: "text", + text: imageResponseJson.text, + }], + }, + ], + }, + }, + ); + }); +}); \ No newline at end of file From 10a30fd56ff7f585b5d07beae399bed1116d777a Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 3 Sep 2025 10:28:22 -0700 Subject: [PATCH 4/4] comment --- server/utils/helpers/chat/convertTo.js | 29 +++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/server/utils/helpers/chat/convertTo.js b/server/utils/helpers/chat/convertTo.js index 1593fc81102..9ec2b838eea 100644 --- a/server/utils/helpers/chat/convertTo.js +++ b/server/utils/helpers/chat/convertTo.js @@ -34,6 +34,7 @@ async function convertToJSONAlpaca(preparedData) { return JSON.stringify(preparedData, null, 4); } +// You can validate JSONL outputs on https://jsonlines.org/validator/ async function convertToJSONL(workspaceChatsMap) { return Object.values(workspaceChatsMap) .map((workspaceChats) => JSON.stringify(workspaceChats)) @@ -130,6 +131,7 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { return preparedData; } + // Export to JSONL format (recommended for fine-tuning) const workspaceChatsMap = chats.reduce((acc, chat) => { const { prompt, response, workspaceId } = chat; const responseJson = safeJsonParse(response, { attachments: [] }); @@ -153,24 +155,21 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { }; } - // Build the user content array with the prompt and attachments (if any) - const userContent = [ - { - type: "text", - text: prompt, - }, - ...(attachments?.length > 0 - ? attachments.map((attachment) => ({ - type: "image", - image: attachmentToDataUrl(attachment), - })) - : []), - ]; - acc[workspaceId].messages.push( { role: "user", - content: userContent, + content: [ + { + type: "text", + text: prompt, + }, + ...(attachments?.length > 0 + ? attachments.map((attachment) => ({ + type: "image", + image: attachmentToDataUrl(attachment), + })) + : []), + ], }, { role: "assistant",