Skip to content

Commit 069ede7

Browse files
com.openai.unity 8.6.6 (#366)
- fixed mp3 and wav playback from SpeechRequests - added SemanticVAD options to realtime - added new audio models to the static model list --------- Co-authored-by: Copilot <[email protected]>
1 parent 01a4f76 commit 069ede7

13 files changed

+292
-51
lines changed

Runtime/Audio/AudioEndpoint.cs

+25-8
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,28 @@ public async Task<SpeechClip> GetSpeechAsync(SpeechRequest request, Action<Speec
7373

7474
Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);
7575

76-
var part = 0;
77-
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, StreamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
78-
pcmResponse.Validate(EnableDebug);
79-
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
80-
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));
81-
82-
void StreamCallback(Response partialResponse)
76+
switch (request.ResponseFormat)
8377
{
84-
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
78+
case SpeechResponseFormat.PCM:
79+
{
80+
var part = 0;
81+
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, partialResponse =>
82+
{
83+
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
84+
}, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
85+
pcmResponse.Validate(EnableDebug);
86+
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
87+
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));
88+
}
89+
default:
90+
{
91+
var audioResponse = await Rest.PostAsync(GetUrl("/speech"), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
92+
audioResponse.Validate(EnableDebug);
93+
await File.WriteAllBytesAsync(cachedPath, audioResponse.Data, cancellationToken).ConfigureAwait(true);
94+
var audioType = request.ResponseFormat == SpeechResponseFormat.MP3 ? AudioType.MPEG : AudioType.WAV;
95+
var finalClip = await Rest.DownloadAudioClipAsync(cachedPath, audioType, fileName: clipName, cancellationToken: cancellationToken);
96+
return new SpeechClip(clipName, cachedPath, finalClip);
97+
}
8598
}
8699
}
87100

@@ -91,6 +104,7 @@ void StreamCallback(Response partialResponse)
91104
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
92105
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
93106
/// <returns>The transcribed text.</returns>
107+
[Function("Transcribes audio into the input language. Returns transcribed text.")]
94108
public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
95109
{
96110
var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
@@ -106,6 +120,7 @@ public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest
106120
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
107121
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
108122
/// <returns><see cref="AudioResponse"/>.</returns>
123+
[Function("Transcribes audio into the input language. Returns Json parsed AudioResponse.")]
109124
public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
110125
{
111126
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
@@ -170,6 +185,7 @@ private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionR
170185
/// <param name="request"></param>
171186
/// <param name="cancellationToken"></param>
172187
/// <returns>The translated text.</returns>
188+
[Function("Translates audio into English. Returns translated text.")]
173189
public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
174190
{
175191
var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
@@ -185,6 +201,7 @@ public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest req
185201
/// <param name="cancellationToken"></param>
186202
/// <returns></returns>
187203
/// <exception cref="ArgumentException"></exception>
204+
[Function("Translates audio into English. Returns Json parsed AudioResponse.")]
188205
public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
189206
{
190207
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))

Runtime/Audio/SpeechClip.cs

+12
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@ namespace OpenAI.Audio
1010
[Preserve]
1111
public sealed class SpeechClip
1212
{
13+
[Preserve]
14+
internal SpeechClip(string name, string cachePath, AudioClip audioClip)
15+
{
16+
Name = name;
17+
CachePath = cachePath;
18+
this.audioClip = audioClip;
19+
SampleRate = audioClip.frequency;
20+
var samples = new float[audioClip.samples];
21+
audioClip.GetData(samples, 0);
22+
AudioData = PCMEncoder.Encode(samples);
23+
}
24+
1325
[Preserve]
1426
internal SpeechClip(string name, string cachePath, ReadOnlyMemory<byte> audioData, int sampleRate = 24000)
1527
{

Runtime/Extensions/VoiceActivityDetectionSettingsConverter.cs

+22-9
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,46 @@
11
// Licensed under the MIT License. See LICENSE in the project root for license information.
22

33
using Newtonsoft.Json;
4+
using Newtonsoft.Json.Linq;
45
using OpenAI.Realtime;
56
using System;
67
using UnityEngine.Scripting;
78

89
namespace OpenAI
910
{
1011
[Preserve]
11-
internal class VoiceActivityDetectionSettingsConverter : JsonConverter<VoiceActivityDetectionSettings>
12+
internal class VoiceActivityDetectionSettingsConverter : JsonConverter
1213
{
1314
[Preserve]
14-
public override VoiceActivityDetectionSettings ReadJson(JsonReader reader, Type objectType, VoiceActivityDetectionSettings existingValue, bool hasExistingValue, JsonSerializer serializer)
15+
public override bool CanWrite => true;
16+
17+
[Preserve]
18+
public override bool CanConvert(Type objectType) => typeof(IVoiceActivityDetectionSettings).IsAssignableFrom(objectType);
19+
20+
[Preserve]
21+
public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
1522
{
16-
return reader.TokenType == JsonToken.Null
17-
? VoiceActivityDetectionSettings.Disabled()
18-
: serializer.Deserialize<VoiceActivityDetectionSettings>(reader);
23+
var jObject = JObject.Load(reader);
24+
var type = jObject["type"]?.Value<string>() ?? "disabled";
25+
26+
return type switch
27+
{
28+
"disabled" => new DisabledVAD(),
29+
"server_vad" => jObject.ToObject<ServerVAD>(serializer),
30+
"semantic_vad" => jObject.ToObject<SemanticVAD>(serializer),
31+
_ => throw new NotImplementedException($"Unknown VAD type: {type}")
32+
};
1933
}
2034

2135
[Preserve]
22-
public override void WriteJson(JsonWriter writer, VoiceActivityDetectionSettings value, JsonSerializer serializer)
36+
public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
2337
{
24-
switch (value.Type)
38+
switch (value)
2539
{
26-
case TurnDetectionType.Disabled:
40+
case DisabledVAD:
2741
writer.WriteNull();
2842
break;
2943
default:
30-
case TurnDetectionType.Server_VAD:
3144
serializer.Serialize(writer, value);
3245
break;
3346
}

Runtime/Models/Model.cs

+7-16
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// Licensed under the MIT License. See LICENSE in the project root for license information.
22

3+
using Newtonsoft.Json;
34
using System;
45
using System.Collections.Generic;
5-
using Newtonsoft.Json;
66
using UnityEngine.Scripting;
77

88
namespace OpenAI.Models
@@ -205,27 +205,18 @@ internal Model(
205205
[Obsolete("Removed")]
206206
public static Model Moderation_Stable { get; } = new("text-moderation-stable", "openai");
207207

208-
/// <summary>
209-
/// The latest text to speech model, optimized for speed.
210-
/// </summary>
211-
/// <remarks>
212-
/// The default model for <see cref="Audio.SpeechRequest"/>s.
213-
/// </remarks>
214208
public static Model TTS_1 { get; } = new("tts-1", "openai");
215209

216-
/// <summary>
217-
/// The latest text to speech model, optimized for quality.
218-
/// </summary>
219210
public static Model TTS_1HD { get; } = new("tts-1-hd", "openai");
220211

221-
/// <summary>
222-
/// The default model for <see cref="Audio.AudioEndpoint"/>.
223-
/// </summary>
212+
public static Model TTS_GPT_4o_Mini { get; } = new("gpt-4o-mini-tts", "openai");
213+
224214
public static Model Whisper1 { get; } = new("whisper-1", "openai");
225215

226-
/// <summary>
227-
/// The default model for <see cref="Images.ImagesEndpoint"/>.
228-
/// </summary>
216+
public static Model Transcribe_GPT_4o { get; } = new("gpt-4o-transcribe", "openai");
217+
218+
public static Model Transcribe_GPT_4o_Mini { get; } = new("gpt-4o-mini-transcribe", "openai");
219+
229220
public static Model DallE_2 { get; } = new("dall-e-2", "openai");
230221

231222
public static Model DallE_3 { get; } = new("dall-e-3", "openai");

Runtime/OpenAIClient.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ protected override void ValidateAuthentication()
131131
{
132132
new RealtimeClientEventConverter(),
133133
new RealtimeServerEventConverter(),
134-
new StringEnumConverter(new SnakeCaseNamingStrategy()),
134+
new StringEnumConverter(new SnakeCaseNamingStrategy())
135135
}
136136
};
137137

Runtime/Realtime/InputAudioBufferStartedResponse.cs

+11-3
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,23 @@
55

66
namespace OpenAI.Realtime
77
{
8+
/// <summary>
9+
/// Sent by the server when in server_vad mode to indicate that speech has been detected in the audio buffer.
10+
/// This can happen any time audio is added to the buffer (unless speech is already detected).
11+
/// The client may want to use this event to interrupt audio playback or provide visual feedback to the user.
12+
/// The client should expect to receive an input_audio_buffer.speech_stopped event when speech stops.
13+
/// The item_id property is the ID of the user message item that will be created when speech stops and
14+
/// will also be included in the input_audio_buffer.speech_stopped event (unless the client manually commits the audio buffer during VAD activation).
15+
/// </summary>
816
[Preserve]
917
public sealed class InputAudioBufferStartedResponse : BaseRealtimeEvent, IServerEvent
1018
{
1119
[Preserve]
1220
[JsonConstructor]
1321
internal InputAudioBufferStartedResponse(
14-
[JsonProperty("event_id")] string eventId,
15-
[JsonProperty("type")] string type,
16-
[JsonProperty("audio_start_ms")] int audioStartMs,
22+
[JsonProperty("event_id")] string eventId,
23+
[JsonProperty("type")] string type,
24+
[JsonProperty("audio_start_ms")] int audioStartMs,
1725
[JsonProperty("item_id")] string itemId)
1826
{
1927
EventId = eventId;

Runtime/Realtime/InputAudioBufferStoppedResponse.cs

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
namespace OpenAI.Realtime
77
{
8+
/// <summary>
9+
/// Returned in server_vad mode when the server detects the end of speech in the audio buffer.
10+
/// The server will also send a conversation.item.created event with the user message item that is created from the audio buffer.
11+
/// </summary>
812
[Preserve]
913
public sealed class InputAudioBufferStoppedResponse : BaseRealtimeEvent, IServerEvent
1014
{

Runtime/Realtime/Options.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ public Options(
208208
[Preserve]
209209
[JsonProperty("turn_detection")]
210210
[JsonConverter(typeof(VoiceActivityDetectionSettingsConverter))]
211-
public VoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
211+
public IVoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
212212

213213
[Preserve]
214214
[JsonProperty("tools")]

Runtime/Realtime/SessionConfiguration.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public SessionConfiguration(
2121
RealtimeAudioFormat inputAudioFormat = RealtimeAudioFormat.PCM16,
2222
RealtimeAudioFormat outputAudioFormat = RealtimeAudioFormat.PCM16,
2323
Model transcriptionModel = null,
24-
VoiceActivityDetectionSettings turnDetectionSettings = null,
24+
IVoiceActivityDetectionSettings turnDetectionSettings = null,
2525
IEnumerable<Tool> tools = null,
2626
string toolChoice = null,
2727
float? temperature = null,
@@ -45,7 +45,7 @@ public SessionConfiguration(
4545
InputAudioTranscriptionSettings = new(string.IsNullOrWhiteSpace(transcriptionModel)
4646
? "whisper-1"
4747
: transcriptionModel);
48-
VoiceActivityDetectionSettings = turnDetectionSettings ?? new(TurnDetectionType.Server_VAD);
48+
VoiceActivityDetectionSettings = turnDetectionSettings ?? new ServerVAD();
4949

5050
var toolList = tools?.ToList();
5151

@@ -104,7 +104,7 @@ internal SessionConfiguration(
104104
RealtimeAudioFormat inputAudioFormat,
105105
RealtimeAudioFormat outputAudioFormat,
106106
InputAudioTranscriptionSettings inputAudioTranscriptionSettings,
107-
VoiceActivityDetectionSettings voiceActivityDetectionSettings,
107+
IVoiceActivityDetectionSettings voiceActivityDetectionSettings,
108108
IReadOnlyList<Function> tools,
109109
object toolChoice,
110110
float? temperature,
@@ -156,7 +156,7 @@ internal SessionConfiguration(
156156
[Preserve]
157157
[JsonProperty("turn_detection")]
158158
[JsonConverter(typeof(VoiceActivityDetectionSettingsConverter))]
159-
public VoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
159+
public IVoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
160160

161161
[Preserve]
162162
[JsonProperty("tools")]

Runtime/Realtime/TurnDetectionType.cs

+2
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,7 @@ public enum TurnDetectionType
99
Disabled = 0,
1010
[EnumMember(Value = "server_vad")]
1111
Server_VAD,
12+
[EnumMember(Value = "semantic_vad")]
13+
Semantic_VAD,
1214
}
1315
}

0 commit comments

Comments
 (0)