From cfb53af4d434fdb3826d6a0c724a7cd1c1162d5f Mon Sep 17 00:00:00 2001 From: Allen Hu Date: Wed, 6 Nov 2024 14:20:59 +0800 Subject: [PATCH] chat completion api support new model: gpt-4o-audio-preview --- README-zh.md | 6 +-- README.md | 6 +-- api/pom.xml | 2 +- .../completion/chat/AssistantMessage.java | 9 ++-- .../chat/AssistantMessageAudio.java | 39 ++++++++++++++++ .../openai/completion/chat/Audio.java | 27 +++++++++++ .../chat/ChatCompletionRequest.java | 13 ++++++ .../completion/chat/ContentDeserializer.java | 17 +++++++ .../completion/chat/ContentSerializer.java | 3 ++ .../openai/completion/chat/ImageContent.java | 45 ++++++++++++++++++- .../openai/completion/chat/InputAudio.java | 28 ++++++++++++ .../openai/completion/chat/UserMessage.java | 16 ++++++- client/pom.xml | 2 +- example/pom.xml | 4 +- pom.xml | 2 +- service/pom.xml | 2 +- .../openai/service/ChatCompletionTest.java | 23 ++++++++++ 17 files changed, 225 insertions(+), 19 deletions(-) create mode 100644 api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessageAudio.java create mode 100644 api/src/main/java/com/theokanning/openai/completion/chat/Audio.java create mode 100644 api/src/main/java/com/theokanning/openai/completion/chat/InputAudio.java diff --git a/README-zh.md b/README-zh.md index 9740fd1..9d3dfe7 100644 --- a/README-zh.md +++ b/README-zh.md @@ -21,14 +21,14 @@ OpenAi4J是一个非官方的Java库,旨在帮助java开发者与OpenAI的GPT ## 导入依赖 ### Gradle -`implementation 'io.github.lambdua::0.22.4'` +`implementation 'io.github.lambdua::0.22.5'` ### Maven ```xml io.github.lambdua service - 0.22.4 + 0.22.5 ``` @@ -61,7 +61,7 @@ static void simpleChat() { io.github.lambdua api - 0.22.4 + 0.22.5 ``` diff --git a/README.md b/README.md index d896e2e..88b640b 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,14 @@ applications effortlessly. 
## Import ### Gradle -`implementation 'io.github.lambdua::0.22.4'` +`implementation 'io.github.lambdua::0.22.5'` ### Maven ```xml io.github.lambdua service - 0.22.4 + 0.22.5 ``` @@ -67,7 +67,7 @@ To utilize pojos, import the api module: io.github.lambdua api - 0.22.4 + 0.22.5 ``` diff --git a/api/pom.xml b/api/pom.xml index 96da0cc..2d063f2 100644 --- a/api/pom.xml +++ b/api/pom.xml @@ -6,7 +6,7 @@ io.github.lambdua openai-java - 0.22.4 + 0.22.5 jar api diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessage.java b/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessage.java index 903a343..b3c00a0 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessage.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessage.java @@ -1,15 +1,14 @@ package com.theokanning.openai.completion.chat; -import java.util.List; - import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.theokanning.openai.utils.JsonUtil; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.List; + /** * @author LiangTao * @date 2024年04月10 10:31 @@ -41,6 +40,10 @@ public class AssistantMessage implements ChatMessage { */ private String refusal; + /** + * Data about a previous audio response from the model. 
+ */ + private AssistantMessageAudio audio; public AssistantMessage(String content) { diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessageAudio.java b/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessageAudio.java new file mode 100644 index 0000000..00bfd59 --- /dev/null +++ b/api/src/main/java/com/theokanning/openai/completion/chat/AssistantMessageAudio.java @@ -0,0 +1,39 @@ +package com.theokanning.openai.completion.chat; + +import com.fasterxml.jackson.annotation.JsonProperty; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.NonNull; + +/** + * @author Allen Hu + * @date 2024/11/6 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +class AssistantMessageAudio { + + /** + * Unique identifier for a previous audio response from the model. + */ + @NonNull + private String id; + + /** + * The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server for use in multi-turn conversations. + */ + @JsonProperty("expires_at") + private Integer expiresAt; + + /** + * Transcript of the audio generated by the model. + */ + private String transcript; + + /** + * Base64 encoded audio bytes generated by the model, in the format specified in the request. + */ + private String data; +} diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/Audio.java b/api/src/main/java/com/theokanning/openai/completion/chat/Audio.java new file mode 100644 index 0000000..95a6cf8 --- /dev/null +++ b/api/src/main/java/com/theokanning/openai/completion/chat/Audio.java @@ -0,0 +1,27 @@ +package com.theokanning.openai.completion.chat; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Parameters for audio output. 
Required when audio output is requested with modalities: ["audio"] + * + * @author Allen Hu + * @date 2024/11/5 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +public class Audio { + + /** + * The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse. + */ + String voice; + + /** + * Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16. + */ + String format; +} diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/ChatCompletionRequest.java b/api/src/main/java/com/theokanning/openai/completion/chat/ChatCompletionRequest.java index 3284f7c..3291b8c 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/ChatCompletionRequest.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/ChatCompletionRequest.java @@ -168,5 +168,18 @@ public class ChatCompletionRequest { @JsonProperty("parallel_tool_calls") Boolean parallelToolCalls; + /** + * Output types that you would like the model to generate for this request. Most models are capable of generating text, which is the default: + * ["text"] + * The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use: + * ["text", "audio"] + * + * {@see https://platform.openai.com/docs/api-reference/chat/create#chat-create-modalities} + */ + List modalities; + /** + * Parameters for audio output. Required when audio output is requested with modalities: ["audio"]. 
+ */ + Audio audio; } diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/ContentDeserializer.java b/api/src/main/java/com/theokanning/openai/completion/chat/ContentDeserializer.java index c1c8022..c07baa7 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/ContentDeserializer.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/ContentDeserializer.java @@ -49,6 +49,8 @@ ImageContent parseContent(JsonParser jsonParser) throws IOException { content.setImageUrl(parseImageUrl(jsonParser)); } else if ("image_file".equals(fieldName)) { content.setImageFile(parseImageFile(jsonParser)); + } else if ("input_audio".equals(fieldName)) { + content.setInputAudio(parseInputAudio(jsonParser)); } } return content; @@ -83,4 +85,19 @@ private ImageUrl parseImageUrl(JsonParser jsonParser) throws IOException { } return new ImageUrl(url, detail); } + + private InputAudio parseInputAudio(JsonParser jsonParser) throws IOException { + String data = null; + String format = null; + while (jsonParser.nextToken() != JsonToken.END_OBJECT) { + String fieldName = jsonParser.getCurrentName(); + jsonParser.nextToken(); + if ("data".equals(fieldName)) { + data = jsonParser.getText(); + } else if ("format".equals(fieldName)) { + format = jsonParser.getText(); + } + } + return new InputAudio(data, format); + } } diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/ContentSerializer.java b/api/src/main/java/com/theokanning/openai/completion/chat/ContentSerializer.java index 10e809e..1d004a0 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/ContentSerializer.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/ContentSerializer.java @@ -35,6 +35,9 @@ public void serialize(Object o, JsonGenerator jsonGenerator, SerializerProvider if (ic.getType().equals("image_file")) { jsonGenerator.writeObjectField("image_file", ic.getImageFile()); } + if (ic.getType().equals("input_audio")) { + 
jsonGenerator.writeObjectField("input_audio", ic.getInputAudio()); + } jsonGenerator.writeEndObject(); } jsonGenerator.writeEndArray(); diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/ImageContent.java b/api/src/main/java/com/theokanning/openai/completion/chat/ImageContent.java index 0e428a7..b86e2d6 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/ImageContent.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/ImageContent.java @@ -23,7 +23,7 @@ public class ImageContent { /** - * The type of the content. Either "text" or "image_url". + * The type of the content. Either "text", "image_url" or "input_audio". */ @NonNull private String type; @@ -39,6 +39,10 @@ public class ImageContent { @JsonProperty("image_file") private ImageFile imageFile; + @JsonInclude(JsonInclude.Include.NON_NULL) + @JsonProperty("input_audio") + private InputAudio inputAudio; + public ImageContent(String text) { this.type = "text"; @@ -50,6 +54,10 @@ public ImageContent(ImageUrl imageUrl) { this.imageUrl = imageUrl; } + /** + * @deprecated {@link #ofImagePath(Path)} + */ + @Deprecated public ImageContent(Path imagePath){ this.type = "image_url"; String imagePathString = imagePath.toAbsolutePath().toString(); @@ -57,7 +65,31 @@ public ImageContent(Path imagePath){ this.imageUrl=new ImageUrl( "data:image/" + extension + ";base64," + encodeImage(imagePath)); } - private String encodeImage(Path imagePath) { + public ImageContent(InputAudio inputAudio) { + this.type = "input_audio"; + this.inputAudio = inputAudio; + } + + public static ImageContent ofImagePath(Path imagePath){ + String imagePathString = imagePath.toAbsolutePath().toString(); + String extension = imagePathString.substring(imagePathString.lastIndexOf('.') + 1); + ImageUrl imageUrl = new ImageUrl("data:image/" + extension + ";base64," + encode2base64(imagePath)); + return new ImageContent(imageUrl); + } + + public static ImageContent ofAudioPath(Path inputAudioPath) { + 
String inputAudioPathString = inputAudioPath.toAbsolutePath().toString(); + String extension = inputAudioPathString.substring(inputAudioPathString.lastIndexOf('.') + 1); + String base64 = encode2base64(inputAudioPath); + InputAudio inputAudio = new InputAudio(base64, extension); + return new ImageContent(inputAudio); + } + + /** + * @deprecated use {@link #encode2base64(Path)} + */ + @Deprecated + private static String encodeImage(Path imagePath) { byte[] fileContent; try { fileContent = Files.readAllBytes(imagePath); @@ -67,4 +99,13 @@ private String encodeImage(Path imagePath) { } } + private static String encode2base64(Path path) { + byte[] fileContent; + try { + fileContent = Files.readAllBytes(path); + return Base64.getEncoder().encodeToString(fileContent); + } catch (IOException e) { + throw new RuntimeException(e); + } + } } diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/InputAudio.java b/api/src/main/java/com/theokanning/openai/completion/chat/InputAudio.java new file mode 100644 index 0000000..3d91646 --- /dev/null +++ b/api/src/main/java/com/theokanning/openai/completion/chat/InputAudio.java @@ -0,0 +1,28 @@ +package com.theokanning.openai.completion.chat; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.NonNull; + +/** + * @author Allen Hu + * @date 2024/11/6 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +public class InputAudio { + + /** + * Base64 encoded audio data. + */ + @NonNull + private String data; + + /** + * The format of the encoded audio data. Currently supports "wav" and "mp3". 
+ */ + @NonNull + private String format; +} diff --git a/api/src/main/java/com/theokanning/openai/completion/chat/UserMessage.java b/api/src/main/java/com/theokanning/openai/completion/chat/UserMessage.java index 77d5f0a..48dd720 100644 --- a/api/src/main/java/com/theokanning/openai/completion/chat/UserMessage.java +++ b/api/src/main/java/com/theokanning/openai/completion/chat/UserMessage.java @@ -83,11 +83,23 @@ public static UserMessage buildImageMessage(String prompt, String... imageUrls) * @return com.theokanning.openai.completion.chat.UserMessage **/ public static UserMessage buildImageMessage(String prompt, Path... imagePaths) { - List imageContents = Arrays.stream(imagePaths).map(ImageContent::new).collect(Collectors.toList()); + List imageContents = Arrays.stream(imagePaths).map(ImageContent::ofImagePath).collect(Collectors.toList()); imageContents.add(0, new ImageContent(prompt)); return new UserMessage(imageContents); } - + /** + * 构建一个音频识别请求消息,支持多个音频 + * @param prompt query text + * @param inputAudioPaths 音频文件本地路径 + * @return com.theokanning.openai.completion.chat.UserMessage + * @author Allen Hu + * @date 2024/11/6 + */ + public static UserMessage buildInputAudioMessage(String prompt, Path... 
inputAudioPaths) { + List imageContents = Arrays.stream(inputAudioPaths).map(ImageContent::ofAudioPath).collect(Collectors.toList()); + imageContents.add(0, new ImageContent(prompt)); + return new UserMessage(imageContents); + } } diff --git a/client/pom.xml b/client/pom.xml index 78e9188..63f507e 100644 --- a/client/pom.xml +++ b/client/pom.xml @@ -6,7 +6,7 @@ io.github.lambdua openai-java - 0.22.4 + 0.22.5 jar diff --git a/example/pom.xml b/example/pom.xml index db08674..791a894 100644 --- a/example/pom.xml +++ b/example/pom.xml @@ -6,7 +6,7 @@ io.github.lambdua example - 0.22.4 + 0.22.5 example @@ -17,7 +17,7 @@ io.github.lambdua service - 0.22.4 + 0.22.5 diff --git a/pom.xml b/pom.xml index 4c3fd5f..79112cb 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.github.lambdua openai-java - 0.22.4 + 0.22.5 pom openai java 版本 https://github.com/Lambdua/openai-java diff --git a/service/pom.xml b/service/pom.xml index 0e6a651..7e6d9f6 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -6,7 +6,7 @@ io.github.lambdua openai-java - 0.22.4 + 0.22.5 jar diff --git a/service/src/test/java/com/theokanning/openai/service/ChatCompletionTest.java b/service/src/test/java/com/theokanning/openai/service/ChatCompletionTest.java index 5b4d332..10a9f74 100644 --- a/service/src/test/java/com/theokanning/openai/service/ChatCompletionTest.java +++ b/service/src/test/java/com/theokanning/openai/service/ChatCompletionTest.java @@ -665,6 +665,29 @@ void createLocalImageChatCompletion() throws URISyntaxException { assertNotNull(choice.getMessage().getContent()); } + @Test + void createInputAudioChatCompletion() throws URISyntaxException { + final List messages = new ArrayList<>(); + final ChatMessage systemMessage = new SystemMessage("You are a helpful assistant."); + Path audioPath= Paths.get(Objects.requireNonNull(ChatCompletionTest.class.getClassLoader().getResource("hello-world.mp3")).toURI()); + + final ChatMessage audioMessage = UserMessage.buildInputAudioMessage("What's in
this audio?", audioPath); + messages.add(systemMessage); + messages.add(audioMessage); + ChatCompletionRequest chatCompletionRequest = ChatCompletionRequest + .builder() + .model("gpt-4o-audio-preview") + .messages(messages) + .n(1) + .maxTokens(200) + .modalities(Arrays.asList("text", "audio")) + .audio(new Audio("alloy", "wav")) + .build(); + + ChatCompletionChoice choice = service.createChatCompletion(chatCompletionRequest).getChoices().get(0); + assertNotNull(choice.getMessage().getAudio()); + } + /** * 流式请求中使用多个tool调用场景下的测试