fixes for STT

This commit is contained in:
Jacob Dubin
2026-04-18 08:15:37 -05:00
parent 6276a8e1ac
commit a91433c5a0
4 changed files with 83 additions and 4 deletions

View File

@@ -15,6 +15,7 @@ public sealed class WebSocketTurnFinalizationService(
{ {
private const int AutoFinalizeMinBufferedAudioBytes = 12000; private const int AutoFinalizeMinBufferedAudioBytes = 12000;
private const int AutoFinalizeMinBufferedAudioChunks = 5; private const int AutoFinalizeMinBufferedAudioChunks = 5;
private static readonly TimeSpan AutoFinalizeMinTurnAge = TimeSpan.FromMilliseconds(1800);
public async Task<IReadOnlyList<WebSocketReply>> HandleBinaryAudioAsync( public async Task<IReadOnlyList<WebSocketReply>> HandleBinaryAudioAsync(
CloudSession session, CloudSession session,
@@ -119,10 +120,28 @@ public sealed class WebSocketTurnFinalizationService(
return turn; return turn;
} }
ISttStrategy? strategy = null;
try
{
strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken);
}
catch (InvalidOperationException ex) when (string.Equals(ex.Message, "No STT strategy can handle the current turn.", StringComparison.Ordinal))
{
return turn;
}
catch (Exception ex)
{
session.TurnState.LastSttError = ex.Message;
session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow;
await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken);
return turn;
}
try try
{ {
var strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken);
var sttResult = await strategy.TranscribeAsync(turn, cancellationToken); var sttResult = await strategy.TranscribeAsync(turn, cancellationToken);
session.TurnState.LastSttError = null;
session.TurnState.LastSttErrorUtc = null;
var attributes = new Dictionary<string, object?>(turn.Attributes, StringComparer.OrdinalIgnoreCase) var attributes = new Dictionary<string, object?>(turn.Attributes, StringComparer.OrdinalIgnoreCase)
{ {
@@ -160,6 +179,8 @@ public sealed class WebSocketTurnFinalizationService(
} }
catch (Exception ex) catch (Exception ex)
{ {
session.TurnState.LastSttError = ex.Message;
session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow;
await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken); await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken);
return turn; return turn;
} }
@@ -229,6 +250,8 @@ public sealed class WebSocketTurnFinalizationService(
{ {
session.TurnState.BufferedAudioBytes = 0; session.TurnState.BufferedAudioBytes = 0;
session.TurnState.BufferedAudioChunkCount = 0; session.TurnState.BufferedAudioChunkCount = 0;
session.TurnState.LastSttError = null;
session.TurnState.LastSttErrorUtc = null;
session.TurnState.FirstAudioReceivedUtc = null; session.TurnState.FirstAudioReceivedUtc = null;
session.TurnState.LastAudioReceivedUtc = null; session.TurnState.LastAudioReceivedUtc = null;
session.TurnState.BufferedAudioFrames.Clear(); session.TurnState.BufferedAudioFrames.Clear();
@@ -241,6 +264,8 @@ public sealed class WebSocketTurnFinalizationService(
turnState.TransId = transId; turnState.TransId = transId;
turnState.ContextPayload = null; turnState.ContextPayload = null;
turnState.AudioTranscriptHint = null; turnState.AudioTranscriptHint = null;
turnState.LastSttError = null;
turnState.LastSttErrorUtc = null;
turnState.FirstAudioReceivedUtc = null; turnState.FirstAudioReceivedUtc = null;
turnState.LastAudioReceivedUtc = null; turnState.LastAudioReceivedUtc = null;
turnState.BufferedAudioChunkCount = 0; turnState.BufferedAudioChunkCount = 0;
@@ -272,7 +297,9 @@ public sealed class WebSocketTurnFinalizationService(
turnState.FinalizeAttemptCount += 1; turnState.FinalizeAttemptCount += 1;
} }
if (allowFallbackOnMissingTranscript && turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes) if (allowFallbackOnMissingTranscript &&
turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes &&
string.IsNullOrWhiteSpace(turnState.LastSttError))
{ {
turnState.AwaitingTurnCompletion = false; turnState.AwaitingTurnCompletion = false;
session.LastTranscript = string.Empty; session.LastTranscript = string.Empty;
@@ -331,12 +358,16 @@ public sealed class WebSocketTurnFinalizationService(
/// <summary>
/// Decides whether the turn buffered on <paramref name="session"/> may be finalized
/// automatically, based on its flags, buffered audio amounts and age.
/// </summary>
/// <param name="session">Session whose <c>TurnState</c> is inspected.</param>
/// <returns>
/// <c>true</c> only when the turn is awaiting completion, listen and context were both seen,
/// the buffered chunk and byte counts reach their minimums, and the time elapsed since the
/// first audio frame is at least <c>AutoFinalizeMinTurnAge</c>.
/// </returns>
private static bool ShouldAutoFinalize(CloudSession session)
{
    var state = session.TurnState;

    if (!state.AwaitingTurnCompletion)
    {
        return false;
    }

    var hasEnoughBufferedAudio = state is
    {
        SawListen: true,
        SawContext: true,
        BufferedAudioChunkCount: >= AutoFinalizeMinBufferedAudioChunks,
        BufferedAudioBytes: >= AutoFinalizeMinBufferedAudioBytes
    };

    if (!hasEnoughBufferedAudio)
    {
        return false;
    }

    // With no first-audio timestamp the age is treated as zero, which can never
    // satisfy the minimum-age requirement below.
    var elapsedSinceFirstAudio = state.FirstAudioReceivedUtc is { } firstAudioUtc
        ? DateTimeOffset.UtcNow - firstAudioUtc
        : TimeSpan.Zero;

    return elapsedSinceFirstAudio >= AutoFinalizeMinTurnAge;
}
private static string? ExtractDataPayload(string? text) private static string? ExtractDataPayload(string? text)

View File

@@ -5,6 +5,8 @@ public sealed class WebSocketTurnState
public string? TransId { get; set; } public string? TransId { get; set; }
public string? ContextPayload { get; set; } public string? ContextPayload { get; set; }
public string? AudioTranscriptHint { get; set; } public string? AudioTranscriptHint { get; set; }
public string? LastSttError { get; set; }
public DateTimeOffset? LastSttErrorUtc { get; set; }
public DateTimeOffset? FirstAudioReceivedUtc { get; set; } public DateTimeOffset? FirstAudioReceivedUtc { get; set; }
public DateTimeOffset? LastAudioReceivedUtc { get; set; } public DateTimeOffset? LastAudioReceivedUtc { get; set; }
public int BufferedAudioChunkCount { get; set; } public int BufferedAudioChunkCount { get; set; }

View File

@@ -29,4 +29,42 @@ public sealed class FileTurnTelemetrySinkTests
sink.Verify(s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Once()); sink.Verify(s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Once());
} }
}
/// <summary>
/// When STT strategy selection throws, handling a CONTEXT message must answer with a
/// pending reply, keep the buffered audio, record the error text on the turn state,
/// and report the failure to the telemetry sink exactly once.
/// </summary>
[Fact]
public async Task AutoFinalize_DoesNotFallbackImmediately_WhenSttThrows()
{
    // Arrange: a selector that always fails with a message other than the
    // "no strategy" sentinel.
    var telemetry = new Mock<ITurnTelemetrySink>();
    var selector = new Mock<ISttStrategySelector>();
    selector
        .Setup(s => s.SelectAsync(It.IsAny<TurnContext>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(new InvalidOperationException("ffmpeg failed"));

    var service = new WebSocketTurnFinalizationService(
        new ProtocolToTurnContextMapper(),
        Mock.Of<IConversationBroker>(),
        new ResponsePlanToSocketMessagesMapper(),
        selector.Object,
        telemetry.Object
    );

    // Seed a turn with listen seen, 5 chunks / 12000 bytes buffered, and first
    // audio received two seconds ago.
    var session = new CloudSession();
    var turnState = session.TurnState;
    turnState.AwaitingTurnCompletion = true;
    turnState.SawListen = true;
    turnState.SawContext = true;
    turnState.BufferedAudioBytes = 12000;
    turnState.BufferedAudioChunkCount = 5;
    turnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);

    var contextEnvelope = new WebSocketMessageEnvelope
    {
        Text = """{"type":"CONTEXT","data":{"topic":"conversation"}}"""
    };

    // Act
    var replies = await turnService_HandleContext(service, session, contextEnvelope);

    // Assert: exactly one pending reply, buffered audio untouched, error captured.
    var onlyReply = Assert.Single(replies);
    using var replyJson = System.Text.Json.JsonDocument.Parse(onlyReply.Text!);
    var replyType = replyJson.RootElement.GetProperty("type").GetString();
    Assert.Equal("OPENJIBO_TURN_PENDING", replyType);
    Assert.Equal(12000, session.TurnState.BufferedAudioBytes);
    Assert.Equal("ffmpeg failed", session.TurnState.LastSttError);
    telemetry.Verify(
        s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()),
        Times.Once());

    // Local wrapper keeps the Act step on a single line.
    static Task<IReadOnlyList<WebSocketReply>> turnService_HandleContext(
        WebSocketTurnFinalizationService target,
        CloudSession targetSession,
        WebSocketMessageEnvelope envelope)
        => target.HandleContextAsync(targetSession, envelope, CancellationToken.None);
}
}

View File

@@ -135,6 +135,10 @@ public sealed class JiboWebSocketServiceTests
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0])); Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
} }
var session = _store.FindSessionByToken("hub-auto-finalize-token");
Assert.NotNull(session);
session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
{ {
HostName = "neo-hub.jibo.com", HostName = "neo-hub.jibo.com",
@@ -192,6 +196,10 @@ public sealed class JiboWebSocketServiceTests
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0])); Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
} }
var session = _store.FindSessionByToken("hub-auto-fallback-token");
Assert.NotNull(session);
session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
{ {
HostName = "neo-hub.jibo.com", HostName = "neo-hub.jibo.com",