/* eslint-env jest */
/**
 * Unit tests for `prepareChatsForExport`.
 *
 * The database models are mocked, so every test controls exactly which chat
 * rows the exporter receives and asserts on the structure it produces for
 * each export format (json, csv, jsonAlpaca, jsonl).
 */
const {
  prepareChatsForExport,
} = require("../../../utils/helpers/chat/convertTo");

// Mock the database models
jest.mock("../../../models/workspaceChats");
jest.mock("../../../models/embedChats");

const { WorkspaceChats } = require("../../../models/workspaceChats");
const { EmbedChats } = require("../../../models/embedChats");

/**
 * Builds a workspace chat row shaped like the records the exporter reads.
 * @param {boolean} withImages - When true, the serialized response carries two image attachments.
 * @returns {Object} A chat row whose `response` field is a JSON string.
 */
const mockChat = (withImages = false) => {
  return {
    id: 1,
    // The JSONL exporter groups chats by `chat.workspaceId`, so the fixture
    // must define it; otherwise every chat is grouped under the key "undefined".
    workspaceId: 1,
    prompt: "Test prompt",
    response: JSON.stringify({
      text: "Test response",
      attachments: withImages
        ? [
            {
              mime: "image/png",
              name: "image.png",
              contentString: "data:image/png;base64,iVBORw0KGg....=",
            },
            {
              mime: "image/jpeg",
              name: "image2.jpeg",
              contentString: "data:image/jpeg;base64,iVBORw0KGg....=",
            },
          ]
        : [],
      sources: [],
      metrics: {},
    }),
    createdAt: new Date(),
    workspace: { name: "Test Workspace", openAiPrompt: "Test OpenAI Prompt" },
    user: { username: "testuser" },
    feedbackScore: 1,
  };
};

describe("prepareChatsForExport", () => {
  beforeEach(() => {
    jest.clearAllMocks();
    WorkspaceChats.whereWithData = jest.fn().mockResolvedValue([]);
    EmbedChats.whereWithEmbedAndWorkspace = jest.fn().mockResolvedValue([]);
  });

  test("should throw error for invalid chat type", async () => {
    await expect(prepareChatsForExport("json", "invalid")).rejects.toThrow(
      "Invalid chat type: invalid"
    );
  });

  test("should throw error for invalid export type", async () => {
    await expect(prepareChatsForExport("invalid", "workspace")).rejects.toThrow(
      "Invalid export type: invalid"
    );
  });

  // JSON rows always carry an `attachments` array (empty when the chat has
  // none). CSV rows omit that field and are covered by a dedicated test below.
  test("should return prepared data in json format for workspace chat type", async () => {
    const chatExample = mockChat();
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("json", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    expect(result).toEqual([
      {
        id: chatExample.id,
        prompt: chatExample.prompt,
        response: responseJson.text,
        sent_at: chatExample.createdAt,
        rating: chatExample.feedbackScore ? "GOOD" : "BAD",
        username: chatExample.user.username,
        workspace: chatExample.workspace.name,
        attachments: [],
      },
    ]);
  });

  test("Should handle attachments for workspace chat type when json format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("json", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    expect(result).toEqual([
      {
        id: chatExample.id,
        prompt: chatExample.prompt,
        response: responseJson.text,
        sent_at: chatExample.createdAt,
        rating: chatExample.feedbackScore ? "GOOD" : "BAD",
        username: chatExample.user.username,
        workspace: chatExample.workspace.name,
        attachments: [
          {
            type: "image",
            image: responseJson.attachments[0].contentString,
          },
          {
            type: "image",
            image: responseJson.attachments[1].contentString,
          },
        ],
      },
    ]);
  });

  test("Should ignore attachments for workspace chat type when csv format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("csv", "workspace");

    const responseJson = JSON.parse(chatExample.response);
    expect(result).toBeDefined();
    // `result` is an array of rows; the attachments check has to target the
    // row itself, since a property lookup on the array is always undefined.
    expect(result[0].attachments).not.toBeDefined();
    expect(result).toEqual([
      {
        id: chatExample.id,
        prompt: chatExample.prompt,
        response: responseJson.text,
        sent_at: chatExample.createdAt,
        rating: chatExample.feedbackScore ? "GOOD" : "BAD",
        username: chatExample.user.username,
        workspace: chatExample.workspace.name,
      },
    ]);
  });

  test("should return prepared data in jsonAlpaca format for workspace chat type", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([
      chatExample,
      imageChatExample,
    ]);
    const result = await prepareChatsForExport("jsonAlpaca", "workspace");

    const responseJson1 = JSON.parse(chatExample.response);
    const responseJson2 = JSON.parse(imageChatExample.response);
    expect(result).toBeDefined();

    // Alpaca format does not support attachments - so they are not included
    expect(result[0].attachments).not.toBeDefined();
    expect(result[1].attachments).not.toBeDefined();
    expect(result).toEqual([
      {
        instruction: chatExample.workspace.openAiPrompt,
        input: chatExample.prompt,
        output: responseJson1.text,
      },
      {
        instruction: chatExample.workspace.openAiPrompt,
        input: imageChatExample.prompt,
        output: responseJson2.text,
      },
    ]);
  });

  test("should return prepared data in jsonl format for workspace chat type", async () => {
    const chatExample = mockChat();
    const responseJson = JSON.parse(chatExample.response);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);
    const result = await prepareChatsForExport("jsonl", "workspace");
    expect(result).toBeDefined();
    // The exporter keys its map on `chat.workspaceId`, so the expectation
    // uses the same field instead of the (non-existent) `workspace.id`.
    expect(result).toEqual({
      [chatExample.workspaceId]: {
        messages: [
          {
            role: "system",
            content: [
              {
                type: "text",
                text: chatExample.workspace.openAiPrompt,
              },
            ],
          },
          {
            role: "user",
            content: [
              {
                type: "text",
                text: chatExample.prompt,
              },
            ],
          },
          {
            role: "assistant",
            content: [
              {
                type: "text",
                text: responseJson.text,
              },
            ],
          },
        ],
      },
    });
  });

  test("should return prepared data in jsonl format for workspace chat type with attachments", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    const responseJson = JSON.parse(chatExample.response);
    const imageResponseJson = JSON.parse(imageChatExample.response);

    WorkspaceChats.whereWithData.mockResolvedValue([
      chatExample,
      imageChatExample,
    ]);
    const result = await prepareChatsForExport("jsonl", "workspace");
    expect(result).toBeDefined();
    // Both fixtures share one workspaceId, so their messages are expected in
    // a single conversation with exactly one leading system message.
    expect(result).toEqual({
      [chatExample.workspaceId]: {
        messages: [
          {
            role: "system",
            content: [
              {
                type: "text",
                text: chatExample.workspace.openAiPrompt,
              },
            ],
          },
          {
            role: "user",
            content: [
              {
                type: "text",
                text: chatExample.prompt,
              },
            ],
          },
          {
            role: "assistant",
            content: [
              {
                type: "text",
                text: responseJson.text,
              },
            ],
          },
          {
            role: "user",
            content: [
              {
                type: "text",
                text: imageChatExample.prompt,
              },
              {
                type: "image",
                image: imageResponseJson.attachments[0].contentString,
              },
              {
                type: "image",
                image: imageResponseJson.attachments[1].contentString,
              },
            ],
          },
          {
            role: "assistant",
            content: [
              {
                type: "text",
                text: imageResponseJson.text,
              },
            ],
          },
        ],
      },
    });
  });
});
responseJson.text, sent_at: chat.createdAt, + // Only add attachments to the json format since we cannot arrange attachments in csv format + ...(format === "json" + ? { + attachments: + responseJson.attachments?.length > 0 + ? responseJson.attachments.map((attachment) => ({ + type: "image", + image: attachmentToDataUrl(attachment), + })) + : [], + } + : {}), }; if (chatType === "embed") { @@ -101,9 +114,10 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { return preparedData; } + // jsonAlpaca format does not support array outputs if (format === "jsonAlpaca") { const preparedData = chats.map((chat) => { - const responseJson = JSON.parse(chat.response); + const responseJson = safeJsonParse(chat.response, {}); return { instruction: buildSystemPrompt( chat, @@ -117,18 +131,25 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") { return preparedData; } + // Export to JSONL format (recommended for fine-tuning) const workspaceChatsMap = chats.reduce((acc, chat) => { const { prompt, response, workspaceId } = chat; - const responseJson = JSON.parse(response); + const responseJson = safeJsonParse(response, { attachments: [] }); + const attachments = responseJson.attachments; if (!acc[workspaceId]) { acc[workspaceId] = { messages: [ { role: "system", - content: - chat.workspace?.openAiPrompt || - "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.", + content: [ + { + type: "text", + text: + chat.workspace?.openAiPrompt || + "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. 
/**
 * Converts an attachment's content string to a proper data URL format if needed.
 *
 * Content that already starts with `data:` is returned untouched; anything
 * else is treated as a raw base64 payload and wrapped in a data URL built
 * from the attachment's mime type.
 *
 * @param {Object} attachment - The attachment object containing contentString and mime type
 * @param {string} [attachment.contentString] - Either a full data URL or a bare base64 payload
 * @param {string} [attachment.mime] - Mime type used when a data URL has to be built
 * @returns {string} The properly formatted data URL, or an empty string when
 * the attachment carries no string content
 */
function attachmentToDataUrl(attachment) {
  const content = attachment?.contentString;

  // A missing attachment or non-string content used to throw a TypeError on
  // `.startsWith`, which aborted the whole export. Yield an empty URL instead.
  if (typeof content !== "string") return "";

  if (content.startsWith("data:")) return content;

  // `||` (not `??`) on purpose: an empty-string mime is as unusable as a
  // missing one, and "data:undefined;base64,..." is not a meaningful URL.
  const mime = attachment.mime || "application/octet-stream";
  return `data:${mime};base64,${content}`;
}