Skip to content

Commit 069ede7

Browse files
com.openai.unity 8.6.6 (#366)
- fixed mp3 and wav playback from SpeechRequests - added SemanticVAD options to realtime - added new audio models to the static model list --------- Co-authored-by: Copilot <[email protected]>
1 parent 01a4f76 commit 069ede7

13 files changed

+292
-51
lines changed

Runtime/Audio/AudioEndpoint.cs

+25-8
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,28 @@ public async Task<SpeechClip> GetSpeechAsync(SpeechRequest request, Action<Speec
7373

7474
Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);
7575

76-
var part = 0;
77-
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, StreamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
78-
pcmResponse.Validate(EnableDebug);
79-
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
80-
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));
81-
82-
void StreamCallback(Response partialResponse)
76+
switch (request.ResponseFormat)
8377
{
84-
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
78+
case SpeechResponseFormat.PCM:
79+
{
80+
var part = 0;
81+
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, partialResponse =>
82+
{
83+
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
84+
}, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
85+
pcmResponse.Validate(EnableDebug);
86+
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
87+
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));
88+
}
89+
default:
90+
{
91+
var audioResponse = await Rest.PostAsync(GetUrl("/speech"), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
92+
audioResponse.Validate(EnableDebug);
93+
await File.WriteAllBytesAsync(cachedPath, audioResponse.Data, cancellationToken).ConfigureAwait(true);
94+
var audioType = request.ResponseFormat == SpeechResponseFormat.MP3 ? AudioType.MPEG : AudioType.WAV;
95+
var finalClip = await Rest.DownloadAudioClipAsync(cachedPath, audioType, fileName: clipName, cancellationToken: cancellationToken);
96+
return new SpeechClip(clipName, cachedPath, finalClip);
97+
}
8598
}
8699
}
87100

@@ -91,6 +104,7 @@ void StreamCallback(Response partialResponse)
91104
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
92105
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
93106
/// <returns>The transcribed text.</returns>
107+
[Function("Transcribes audio into the input language. Returns transcribed text.")]
94108
public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
95109
{
96110
var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
@@ -106,6 +120,7 @@ public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest
106120
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
107121
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
108122
/// <returns><see cref="AudioResponse"/>.</returns>
123+
[Function("Transcribes audio into the input language. Returns Json parsed AudioResponse.")]
109124
public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
110125
{
111126
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
@@ -170,6 +185,7 @@ private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionR
170185
/// <param name="request"></param>
171186
/// <param name="cancellationToken"></param>
172187
/// <returns>The translated text.</returns>
188+
[Function("Translates audio into English. Returns translated text.")]
173189
public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
174190
{
175191
var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
@@ -185,6 +201,7 @@ public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest req
185201
/// <param name="cancellationToken"></param>
186202
/// <returns></returns>
187203
/// <exception cref="ArgumentException"></exception>
204+
[Function("Translates audio into English. Returns Json parsed AudioResponse.")]
188205
public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
189206
{
190207
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))

Runtime/Audio/SpeechClip.cs

+12
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@ namespace OpenAI.Audio
1010
[Preserve]
1111
public sealed class SpeechClip
1212
{
13+
[Preserve]
14+
internal SpeechClip(string name, string cachePath, AudioClip audioClip)
15+
{
16+
Name = name;
17+
CachePath = cachePath;
18+
this.audioClip = audioClip;
19+
SampleRate = audioClip.frequency;
20+
var samples = new float[audioClip.samples];
21+
audioClip.GetData(samples, 0);
22+
AudioData = PCMEncoder.Encode(samples);
23+
}
24+
1325
[Preserve]
1426
internal SpeechClip(string name, string cachePath, ReadOnlyMemory<byte> audioData, int sampleRate = 24000)
1527
{

Runtime/Extensions/VoiceActivityDetectionSettingsConverter.cs

+22-9
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,46 @@
11
// Licensed under the MIT License. See LICENSE in the project root for license information.
22

33
using Newtonsoft.Json;
4+
using Newtonsoft.Json.Linq;
45
using OpenAI.Realtime;
56
using System;
67
using UnityEngine.Scripting;
78

89
namespace OpenAI
910
{
1011
[Preserve]
11-
internal class VoiceActivityDetectionSettingsConverter : JsonConverter<VoiceActivityDetectionSettings>
12+
internal class VoiceActivityDetectionSettingsConverter : JsonConverter
1213
{
1314
[Preserve]
14-
public override VoiceActivityDetectionSettings ReadJson(JsonReader reader, Type objectType, VoiceActivityDetectionSettings existingValue, bool hasExistingValue, JsonSerializer serializer)
15+
public override bool CanWrite => true;
16+
17+
[Preserve]
18+
public override bool CanConvert(Type objectType) => typeof(IVoiceActivityDetectionSettings).IsAssignableFrom(objectType);
19+
20+
[Preserve]
21+
public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
1522
{
16-
return reader.TokenType == JsonToken.Null
17-
? VoiceActivityDetectionSettings.Disabled()
18-
: serializer.Deserialize<VoiceActivityDetectionSettings>(reader);
23+
var jObject = JObject.Load(reader);
24+
var type = jObject["type"]?.Value<string>() ?? "disabled";
25+
26+
return type switch
27+
{
28+
"disabled" => new DisabledVAD(),
29+
"server_vad" => jObject.ToObject<ServerVAD>(serializer),
30+
"semantic_vad" => jObject.ToObject<SemanticVAD>(serializer),
31+
_ => throw new NotImplementedException($"Unknown VAD type: {type}")
32+
};
1933
}
2034

2135
[Preserve]
22-
public override void WriteJson(JsonWriter writer, VoiceActivityDetectionSettings value, JsonSerializer serializer)
36+
public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
2337
{
24-
switch (value.Type)
38+
switch (value)
2539
{
26-
case TurnDetectionType.Disabled:
40+
case DisabledVAD:
2741
writer.WriteNull();
2842
break;
2943
default:
30-
case TurnDetectionType.Server_VAD:
3144
serializer.Serialize(writer, value);
3245
break;
3346
}

Runtime/Models/Model.cs

+7-16
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// Licensed under the MIT License. See LICENSE in the project root for license information.
22

3+
using Newtonsoft.Json;
34
using System;
45
using System.Collections.Generic;
5-
using Newtonsoft.Json;
66
using UnityEngine.Scripting;
77

88
namespace OpenAI.Models
@@ -205,27 +205,18 @@ internal Model(
205205
[Obsolete("Removed")]
206206
public static Model Moderation_Stable { get; } = new("text-moderation-stable", "openai");
207207

208-
/// <summary>
209-
/// The latest text to speech model, optimized for speed.
210-
/// </summary>
211-
/// <remarks>
212-
/// The default model for <see cref="Audio.SpeechRequest"/>s.
213-
/// </remarks>
214208
public static Model TTS_1 { get; } = new("tts-1", "openai");
215209

216-
/// <summary>
217-
/// The latest text to speech model, optimized for quality.
218-
/// </summary>
219210
public static Model TTS_1HD { get; } = new("tts-1-hd", "openai");
220211

221-
/// <summary>
222-
/// The default model for <see cref="Audio.AudioEndpoint"/>.
223-
/// </summary>
212+
public static Model TTS_GPT_4o_Mini { get; } = new("gpt-4o-mini-tts", "openai");
213+
224214
public static Model Whisper1 { get; } = new("whisper-1", "openai");
225215

226-
/// <summary>
227-
/// The default model for <see cref="Images.ImagesEndpoint"/>.
228-
/// </summary>
216+
public static Model Transcribe_GPT_4o { get; } = new("gpt-4o-transcribe", "openai");
217+
218+
public static Model Transcribe_GPT_4o_Mini { get; } = new("gpt-4o-mini-transcribe", "openai");
219+
229220
public static Model DallE_2 { get; } = new("dall-e-2", "openai");
230221

231222
public static Model DallE_3 { get; } = new("dall-e-3", "openai");

Runtime/OpenAIClient.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ protected override void ValidateAuthentication()
131131
{
132132
new RealtimeClientEventConverter(),
133133
new RealtimeServerEventConverter(),
134-
new StringEnumConverter(new SnakeCaseNamingStrategy()),
134+
new StringEnumConverter(new SnakeCaseNamingStrategy())
135135
}
136136
};
137137

Runtime/Realtime/InputAudioBufferStartedResponse.cs

+11-3
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,23 @@
55

66
namespace OpenAI.Realtime
77
{
8+
/// <summary>
9+
/// Sent by the server when in server_vad mode to indicate that speech has been detected in the audio buffer.
10+
/// This can happen any time audio is added to the buffer (unless speech is already detected).
11+
/// The client may want to use this event to interrupt audio playback or provide visual feedback to the user.
12+
/// The client should expect to receive an input_audio_buffer.speech_stopped event when speech stops.
13+
/// The item_id property is the ID of the user message item that will be created when speech stops and
14+
/// will also be included in the input_audio_buffer.speech_stopped event (unless the client manually commits the audio buffer during VAD activation).
15+
/// </summary>
816
[Preserve]
917
public sealed class InputAudioBufferStartedResponse : BaseRealtimeEvent, IServerEvent
1018
{
1119
[Preserve]
1220
[JsonConstructor]
1321
internal InputAudioBufferStartedResponse(
14-
[JsonProperty("event_id")] string eventId,
15-
[JsonProperty("type")] string type,
16-
[JsonProperty("audio_start_ms")] int audioStartMs,
22+
[JsonProperty("event_id")] string eventId,
23+
[JsonProperty("type")] string type,
24+
[JsonProperty("audio_start_ms")] int audioStartMs,
1725
[JsonProperty("item_id")] string itemId)
1826
{
1927
EventId = eventId;

Runtime/Realtime/InputAudioBufferStoppedResponse.cs

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
namespace OpenAI.Realtime
77
{
8+
/// <summary>
9+
/// Returned in server_vad mode when the server detects the end of speech in the audio buffer.
10+
/// The server will also send a conversation.item.created event with the user message item that is created from the audio buffer.
11+
/// </summary>
812
[Preserve]
913
public sealed class InputAudioBufferStoppedResponse : BaseRealtimeEvent, IServerEvent
1014
{

Runtime/Realtime/Options.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ public Options(
208208
[Preserve]
209209
[JsonProperty("turn_detection")]
210210
[JsonConverter(typeof(VoiceActivityDetectionSettingsConverter))]
211-
public VoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
211+
public IVoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
212212

213213
[Preserve]
214214
[JsonProperty("tools")]

Runtime/Realtime/SessionConfiguration.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public SessionConfiguration(
2121
RealtimeAudioFormat inputAudioFormat = RealtimeAudioFormat.PCM16,
2222
RealtimeAudioFormat outputAudioFormat = RealtimeAudioFormat.PCM16,
2323
Model transcriptionModel = null,
24-
VoiceActivityDetectionSettings turnDetectionSettings = null,
24+
IVoiceActivityDetectionSettings turnDetectionSettings = null,
2525
IEnumerable<Tool> tools = null,
2626
string toolChoice = null,
2727
float? temperature = null,
@@ -45,7 +45,7 @@ public SessionConfiguration(
4545
InputAudioTranscriptionSettings = new(string.IsNullOrWhiteSpace(transcriptionModel)
4646
? "whisper-1"
4747
: transcriptionModel);
48-
VoiceActivityDetectionSettings = turnDetectionSettings ?? new(TurnDetectionType.Server_VAD);
48+
VoiceActivityDetectionSettings = turnDetectionSettings ?? new ServerVAD();
4949

5050
var toolList = tools?.ToList();
5151

@@ -104,7 +104,7 @@ internal SessionConfiguration(
104104
RealtimeAudioFormat inputAudioFormat,
105105
RealtimeAudioFormat outputAudioFormat,
106106
InputAudioTranscriptionSettings inputAudioTranscriptionSettings,
107-
VoiceActivityDetectionSettings voiceActivityDetectionSettings,
107+
IVoiceActivityDetectionSettings voiceActivityDetectionSettings,
108108
IReadOnlyList<Function> tools,
109109
object toolChoice,
110110
float? temperature,
@@ -156,7 +156,7 @@ internal SessionConfiguration(
156156
[Preserve]
157157
[JsonProperty("turn_detection")]
158158
[JsonConverter(typeof(VoiceActivityDetectionSettingsConverter))]
159-
public VoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
159+
public IVoiceActivityDetectionSettings VoiceActivityDetectionSettings { get; private set; }
160160

161161
[Preserve]
162162
[JsonProperty("tools")]

Runtime/Realtime/TurnDetectionType.cs

+2
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,7 @@ public enum TurnDetectionType
99
Disabled = 0,
1010
[EnumMember(Value = "server_vad")]
1111
Server_VAD,
12+
[EnumMember(Value = "semantic_vad")]
13+
Semantic_VAD,
1214
}
1315
}

0 commit comments

Comments
 (0)