Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
db6b7e3
Initial speech to text abstractions
rogerbarreto Mar 18, 2025
4bdb7b9
Address some feedback (still more things to address)
stephentoub Mar 18, 2025
14c37c2
Merge branch 'main' of https://github.com/rogerbarreto/extensions int…
rogerbarreto Mar 29, 2025
fad8017
Resolve conflict
rogerbarreto Mar 29, 2025
3448daa
Ensure UT are working before further changes
rogerbarreto Mar 29, 2025
ef93211
Update method names Transcribe / Response to GetText
rogerbarreto Mar 29, 2025
43d610c
Update Test Names to new Method names
rogerbarreto Mar 29, 2025
ff4ae4a
Change interface from IList<IAsyncEnumerable> to one stream item at a…
rogerbarreto Mar 29, 2025
0831000
Update XmlDocs with correct definition, ensure correct naming
rogerbarreto Mar 29, 2025
8c893a9
Dropping the Choice / Message concept, flattening the Message with th…
rogerbarreto Mar 29, 2025
3d91982
Remove CultureInfo complexity from language properties
rogerbarreto Mar 30, 2025
009eeca
Adding Prompt property to options + UT
rogerbarreto Mar 30, 2025
305e7e4
Revert global.json changes
rogerbarreto Mar 30, 2025
1feac6d
Add missing experimental
rogerbarreto Mar 30, 2025
956097d
Fix UT
rogerbarreto Mar 30, 2025
0830a51
Address PR comments
rogerbarreto Mar 31, 2025
72407f2
Fix unit tests
rogerbarreto Mar 31, 2025
3c7e4ae
Fix UT
rogerbarreto Apr 1, 2025
8763c8c
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 1, 2025
8d473cb
Merge branch 'audio-transcription-abstraction' of https://github.com/…
rogerbarreto Apr 1, 2025
c6c016e
Address PR comments
rogerbarreto Apr 1, 2025
b3d7819
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 2, 2025
ca1338b
Remove async wrapping
rogerbarreto Apr 2, 2025
d3a14c9
Adjusting concat / text fields
rogerbarreto Apr 2, 2025
263f0e0
Start time and end time added to update + UT covering
rogerbarreto Apr 2, 2025
dd5ec14
AsISpeechToText renaming
rogerbarreto Apr 2, 2025
9eabb98
Remove OpenAIClient ctor + small fixes
rogerbarreto Apr 2, 2025
78e4ebb
Removing rawrepresentation impl from Update -> Response
rogerbarreto Apr 2, 2025
46acd1c
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 2, 2025
8bf3389
Add missing AsISpeechToText UT
rogerbarreto Apr 2, 2025
c5c6e89
Add GetService UT
rogerbarreto Apr 2, 2025
977a0e5
Warning fix
rogerbarreto Apr 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Address PR comments
  • Loading branch information
rogerbarreto committed Mar 31, 2025
commit 0830a514428856a0d199094efa67cf11ef4ddd0f
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
Expand Down Expand Up @@ -46,7 +47,10 @@ public static async Task<SpeechToTextResponse> GetTextAsync(
_ = Throw.IfNull(client);
_ = Throw.IfNull(audioSpeechContent);

using var audioSpeechStream = new MemoryStream(audioSpeechContent.Data.ToArray());
using var audioSpeechStream = MemoryMarshal.TryGetArray(audioSpeechContent.Data, out var array) ?
new MemoryStream(array.Array!, array.Offset, array.Count) :
new MemoryStream(audioSpeechContent.Data.ToArray());

return await client.GetTextAsync(audioSpeechStream, options, cancellationToken).ConfigureAwait(false);
}

Expand All @@ -65,7 +69,10 @@ public static async IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTex
_ = Throw.IfNull(client);
_ = Throw.IfNull(audioSpeechContent);

using var audioSpeechStream = new MemoryStream(audioSpeechContent.Data.ToArray());
using var audioSpeechStream = MemoryMarshal.TryGetArray(audioSpeechContent.Data, out var array) ?
new MemoryStream(array.Array!, array.Offset, array.Count) :
new MemoryStream(audioSpeechContent.Data.ToArray());

await foreach (var update in client.GetStreamingTextAsync(audioSpeechStream, options, cancellationToken).ConfigureAwait(false))
{
yield return update;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ public class SpeechToTextClientMetadata
/// appropriate name defined in the OpenTelemetry Semantic Conventions for Generative AI systems.
/// </param>
/// <param name="providerUri">The URL for accessing the speech to text provider, if applicable.</param>
/// <param name="modelId">The ID of the speech to text model used, if applicable.</param>
public SpeechToTextClientMetadata(string? providerName = null, Uri? providerUri = null, string? modelId = null)
/// <param name="defaultModelId">The ID of the speech to text model used by default, if applicable.</param>
public SpeechToTextClientMetadata(string? providerName = null, Uri? providerUri = null, string? defaultModelId = null)
{
ModelId = modelId;
DefaultModelId = defaultModelId;
ProviderName = providerName;
ProviderUri = providerUri;
}
Expand All @@ -34,10 +34,10 @@ public SpeechToTextClientMetadata(string? providerName = null, Uri? providerUri
/// <summary>Gets the URL for accessing the speech to text provider.</summary>
public Uri? ProviderUri { get; }

/// <summary>Gets the ID of the model used by this speech to text provider.</summary>
/// <summary>Gets the ID of the default model used by this speech to text client.</summary>
/// <remarks>
/// This value can be null if either the model ID is unknown or there are multiple possible models associated with this instance.
/// An individual request may override this value via <see cref="SpeechToTextOptions.ModelId"/>.
/// </remarks>
public string? ModelId { get; }
public string? DefaultModelId { get; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ namespace Microsoft.Extensions.AI;
[Experimental("MEAI001")]
public class SpeechToTextOptions
{
/// <summary>Gets or sets the ID for the speech to text.</summary>
/// <remarks>Long running jobs may use this ID for status polling.</remarks>
public string? ResponseId { get; set; }

/// <summary>Gets or sets the model ID for the speech to text.</summary>
public string? ModelId { get; set; }

Expand All @@ -22,9 +18,6 @@ public class SpeechToTextOptions
/// <summary>Gets or sets the language for the target generated text.</summary>
public string? TextLanguage { get; set; }

/// <summary>Gets or sets the prompt to be used for the speech to text request.</summary>
public string? Prompt { get; set; }

/// <summary>Gets or sets the sample rate of the speech input audio.</summary>
public int? SpeechSampleRate { get; set; }

Expand All @@ -37,12 +30,10 @@ public virtual SpeechToTextOptions Clone()
{
SpeechToTextOptions options = new()
{
ResponseId = ResponseId,
ModelId = ModelId,
SpeechLanguage = SpeechLanguage,
TextLanguage = TextLanguage,
SpeechSampleRate = SpeechSampleRate,
Prompt = Prompt,
AdditionalProperties = AdditionalProperties?.Clone(),
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;

using System.Text.Json.Serialization;
using Microsoft.Shared.Diagnostics;

#pragma warning disable EA0011 // Consider removing unnecessary conditional access operators

namespace Microsoft.Extensions.AI;

/// <summary>Represents the result of a speech to text request.</summary>
Expand Down Expand Up @@ -62,29 +62,12 @@ public SpeechToTextResponse(string? content)
/// <summary>Gets or sets any additional properties associated with the speech to text completion.</summary>
public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }

/// <summary>
/// Gets or sets the text of the first <see cref="TextContent"/> instance in <see cref="Contents" />.
/// </summary>
/// <summary>Gets the text of this speech to text response.</summary>
/// <remarks>
/// If there is no <see cref="TextContent"/> instance in <see cref="Contents" />, then the getter returns <see langword="null" />,
/// and the setter adds a new <see cref="TextContent"/> instance with the provided value.
/// This property concatenates the text of all <see cref="TextContent"/> objects in <see cref="Contents"/>.
/// </remarks>
[JsonIgnore]
public string? Text
{
get => Contents.OfType<TextContent>().FirstOrDefault()?.Text;
set
{
if (Contents.OfType<TextContent>().FirstOrDefault() is { } textContent)
{
textContent.Text = value;
}
else if (value is not null)
{
Contents.Add(new TextContent(value));
}
}
}
public string Text => Contents?.ConcatText() ?? string.Empty;

/// <inheritdoc />
public override string ToString() => Contents.ConcatText();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text.Json.Serialization;
using Microsoft.Shared.Diagnostics;

#pragma warning disable EA0011 // Consider removing unnecessary conditional access operators

namespace Microsoft.Extensions.AI;

/// <summary>
Expand Down Expand Up @@ -81,29 +82,12 @@ public SpeechToTextResponseUpdate(string? content)
/// <summary>Gets or sets additional properties for the update.</summary>
public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }

/// <summary>
/// Gets or sets the text of the first <see cref="TextContent"/> instance in <see cref="Contents" />.
/// </summary>
/// <summary>Gets the text of this speech to text response.</summary>
/// <remarks>
/// If there is no <see cref="TextContent"/> instance in <see cref="Contents" />, then the getter returns <see langword="null" />,
/// and the setter adds a new <see cref="TextContent"/> instance with the provided value.
/// This property concatenates the text of all <see cref="TextContent"/> objects in <see cref="Contents"/>.
/// </remarks>
[JsonIgnore]
public string? Text
{
get => Contents.OfType<TextContent>().FirstOrDefault()?.Text;
set
{
if (Contents.OfType<TextContent>().FirstOrDefault() is { } textContent)
{
textContent.Text = value;
}
else if (value is not null)
{
Contents.Add(new TextContent(value));
}
}
}
public string Text => Contents?.ConcatText() ?? string.Empty;

/// <summary>Gets or sets the generated content items.</summary>
[AllowNull]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,9 @@ public static class SpeechToTextResponseUpdateExtensions
{
/// <summary>Combines <see cref="SpeechToTextResponseUpdate"/> instances into a single <see cref="SpeechToTextResponse"/>.</summary>
/// <param name="updates">The updates to be combined.</param>
/// <param name="coalesceContent">
/// <see langword="true"/> to attempt to coalesce contiguous <see cref="AIContent"/> items, where applicable,
/// into a single <see cref="AIContent"/>. When <see langword="false"/>, the original content items are used.
/// The default is <see langword="true"/>.
/// </param>
/// <returns>The combined <see cref="SpeechToTextResponse"/>.</returns>
public static SpeechToTextResponse ToSpeechToTextResponse(
this IEnumerable<SpeechToTextResponseUpdate> updates, bool coalesceContent = true)
this IEnumerable<SpeechToTextResponseUpdate> updates)
{
_ = Throw.IfNull(updates);

Expand All @@ -41,10 +36,7 @@ public static SpeechToTextResponse ToSpeechToTextResponse(
ProcessUpdate(update, contents, ref responseId, ref modelId, ref rawRepresentation, ref additionalProperties);
}

if (coalesceContent)
{
ChatResponseExtensions.CoalesceTextContent(contents);
}
ChatResponseExtensions.CoalesceTextContent(contents);

response.Contents = contents;
response.ResponseId = responseId;
Expand All @@ -57,22 +49,17 @@ public static SpeechToTextResponse ToSpeechToTextResponse(

/// <summary>Combines <see cref="SpeechToTextResponseUpdate"/> instances into a single <see cref="SpeechToTextResponse"/>.</summary>
/// <param name="updates">The updates to be combined.</param>
/// <param name="coalesceContent">
/// <see langword="true"/> to attempt to coalesce contiguous <see cref="AIContent"/> items, where applicable,
/// into a single <see cref="AIContent"/>. When <see langword="false"/>, the original content items are used.
/// The default is <see langword="true"/>.
/// </param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The combined <see cref="SpeechToTextResponse"/>.</returns>
public static Task<SpeechToTextResponse> ToSpeechToTextResponseAsync(
this IAsyncEnumerable<SpeechToTextResponseUpdate> updates, bool coalesceContent = true, CancellationToken cancellationToken = default)
this IAsyncEnumerable<SpeechToTextResponseUpdate> updates, CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(updates);

return ToResponseAsync(updates, coalesceContent, cancellationToken);
return ToResponseAsync(updates, cancellationToken);

static async Task<SpeechToTextResponse> ToResponseAsync(
IAsyncEnumerable<SpeechToTextResponseUpdate> updates, bool coalesceContent, CancellationToken cancellationToken)
IAsyncEnumerable<SpeechToTextResponseUpdate> updates, CancellationToken cancellationToken)
{
SpeechToTextResponse response = new();
List<AIContent> contents = [];
Expand All @@ -86,10 +73,7 @@ static async Task<SpeechToTextResponse> ToResponseAsync(
ProcessUpdate(update, contents, ref responseId, ref modelId, ref rawRepresentation, ref additionalProperties);
}

if (coalesceContent)
{
ChatResponseExtensions.CoalesceTextContent(contents);
}
ChatResponseExtensions.CoalesceTextContent(contents);

response.Contents = contents;
response.ResponseId = responseId;
Expand Down

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@

using System;
using System.ComponentModel;
using System.Diagnostics.CodeAnalysis;
using Microsoft.Shared.Diagnostics;
using OpenAI;
using OpenAI.Audio;
using OpenAI.Chat;
using OpenAI.Embeddings;
using OpenAI.Responses;
Expand Down Expand Up @@ -37,21 +35,6 @@ public static IChatClient AsIChatClient(this ChatClient chatClient) =>
public static IChatClient AsIChatClient(this OpenAIResponseClient responseClient) =>
new OpenAIResponseChatClient(responseClient);

/// <summary>Gets an <see cref="ISpeechToTextClient"/> for use with this <see cref="OpenAIClient"/>.</summary>
/// <param name="openAIClient">The client.</param>
/// <param name="modelId">The model.</param>
/// <returns>An <see cref="ISpeechToTextClient"/> that can be used to transcribe audio via the <see cref="OpenAIClient"/>.</returns>
[Experimental("MEAI001")]
public static ISpeechToTextClient AsSpeechToTextClient(this OpenAIClient openAIClient, string modelId) =>
new OpenAISpeechToTextClient(openAIClient, modelId);

/// <summary>Gets an <see cref="ISpeechToTextClient"/> for use with this <see cref="AudioClient"/>.</summary>
/// <param name="audioClient">The client.</param>
/// <returns>An <see cref="ISpeechToTextClient"/> that can be used to transcribe audio via the <see cref="AudioClient"/>.</returns>
[Experimental("MEAI001")]
public static ISpeechToTextClient AsSpeechToTextClient(this AudioClient audioClient) =>
new OpenAISpeechToTextClient(audioClient);

/// <summary>Gets an <see cref="IEmbeddingGenerator{String, Single}"/> for use with this <see cref="OpenAIClient"/>.</summary>
/// <param name="openAIClient">The client.</param>
/// <param name="modelId">The model to use.</param>
Expand Down
Loading
Loading