From a91433c5a04fe30a02f65fa7fc1bf11b869af5cb Mon Sep 17 00:00:00 2001 From: Jacob Dubin Date: Sat, 18 Apr 2026 08:15:37 -0500 Subject: [PATCH] fixes for STT --- .../WebSocketTurnFinalizationService.cs | 37 +++++++++++++++-- .../Models/WebSocketTurnState.cs | 2 + .../Turn/FileTurnTelemetrySinkTests.cs | 40 ++++++++++++++++++- .../WebSockets/JiboWebSocketServiceTests.cs | 8 ++++ 4 files changed, 83 insertions(+), 4 deletions(-) diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs index 21a16cd..67220d4 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs @@ -15,6 +15,7 @@ public sealed class WebSocketTurnFinalizationService( { private const int AutoFinalizeMinBufferedAudioBytes = 12000; private const int AutoFinalizeMinBufferedAudioChunks = 5; + private static readonly TimeSpan AutoFinalizeMinTurnAge = TimeSpan.FromMilliseconds(1800); public async Task> HandleBinaryAudioAsync( CloudSession session, @@ -119,10 +120,28 @@ public sealed class WebSocketTurnFinalizationService( return turn; } + ISttStrategy? strategy = null; + try + { + strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken); + } + catch (InvalidOperationException ex) when (string.Equals(ex.Message, "No STT strategy can handle the current turn.", StringComparison.Ordinal)) + { + return turn; + } + catch (Exception ex) + { + session.TurnState.LastSttError = ex.Message; + session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow; + await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken); + return turn; + } + try { - var strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken); var sttResult = await strategy.TranscribeAsync(turn, cancellationToken); + session.TurnState.LastSttError = null; + session.TurnState.LastSttErrorUtc = null; var attributes = new Dictionary(turn.Attributes, StringComparer.OrdinalIgnoreCase) { @@ -160,6 +179,8 @@ public sealed class WebSocketTurnFinalizationService( } catch (Exception ex) { + session.TurnState.LastSttError = ex.Message; + session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow; await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken); return turn; } @@ -229,6 +250,8 @@ public sealed class WebSocketTurnFinalizationService( { session.TurnState.BufferedAudioBytes = 0; session.TurnState.BufferedAudioChunkCount = 0; + session.TurnState.LastSttError = null; + session.TurnState.LastSttErrorUtc = null; session.TurnState.FirstAudioReceivedUtc = null; session.TurnState.LastAudioReceivedUtc = null; session.TurnState.BufferedAudioFrames.Clear(); @@ -241,6 +264,8 @@ public sealed class WebSocketTurnFinalizationService( turnState.TransId = transId; turnState.ContextPayload = null; turnState.AudioTranscriptHint = null; + turnState.LastSttError = null; + turnState.LastSttErrorUtc = null; turnState.FirstAudioReceivedUtc = null; turnState.LastAudioReceivedUtc = null; turnState.BufferedAudioChunkCount = 0; @@ -272,7 +297,9 @@ public sealed class WebSocketTurnFinalizationService( turnState.FinalizeAttemptCount += 1; } - if (allowFallbackOnMissingTranscript && turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes) + if (allowFallbackOnMissingTranscript && + turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes && + string.IsNullOrWhiteSpace(turnState.LastSttError)) { turnState.AwaitingTurnCompletion = false; session.LastTranscript = string.Empty; @@ -331,12 +358,16 @@ public sealed class WebSocketTurnFinalizationService( private static bool ShouldAutoFinalize(CloudSession session) { var turnState = session.TurnState; + var turnAge = turnState.FirstAudioReceivedUtc.HasValue + ? DateTimeOffset.UtcNow - turnState.FirstAudioReceivedUtc.Value + : TimeSpan.Zero; return turnState.AwaitingTurnCompletion && turnState is { SawListen: true, SawContext: true, BufferedAudioChunkCount: >= AutoFinalizeMinBufferedAudioChunks, BufferedAudioBytes: >= AutoFinalizeMinBufferedAudioBytes - }; + } && + turnAge >= AutoFinalizeMinTurnAge; } private static string? ExtractDataPayload(string? text) diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs index e055a0f..9624891 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs @@ -5,6 +5,8 @@ public sealed class WebSocketTurnState public string? TransId { get; set; } public string? ContextPayload { get; set; } public string? AudioTranscriptHint { get; set; } + public string? LastSttError { get; set; } + public DateTimeOffset? LastSttErrorUtc { get; set; } public DateTimeOffset? FirstAudioReceivedUtc { get; set; } public DateTimeOffset? LastAudioReceivedUtc { get; set; } public int BufferedAudioChunkCount { get; set; } diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/Turn/FileTurnTelemetrySinkTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/Turn/FileTurnTelemetrySinkTests.cs index ab686c1..af99c48 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/Turn/FileTurnTelemetrySinkTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/Turn/FileTurnTelemetrySinkTests.cs @@ -29,4 +29,42 @@ public sealed class FileTurnTelemetrySinkTests sink.Verify(s => s.RecordTranscriptError(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once()); } -} \ No newline at end of file + + [Fact] + public async Task AutoFinalize_DoesNotFallbackImmediately_WhenSttThrows() + { + var sink = new Mock(); + var sttStrategySelector = new Mock(); + sttStrategySelector.Setup(s => s.SelectAsync(It.IsAny(), It.IsAny())) + .ThrowsAsync(new InvalidOperationException("ffmpeg failed")); + + var turnService = new WebSocketTurnFinalizationService( + new ProtocolToTurnContextMapper(), + Mock.Of(), + new ResponsePlanToSocketMessagesMapper(), + sttStrategySelector.Object, + sink.Object + ); + + var session = new CloudSession(); + session.TurnState.AwaitingTurnCompletion = true; + session.TurnState.SawListen = true; + session.TurnState.SawContext = true; + session.TurnState.BufferedAudioBytes = 12000; + session.TurnState.BufferedAudioChunkCount = 5; + session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2); + + var replies = await turnService.HandleContextAsync( + session, + new WebSocketMessageEnvelope { Text = """{"type":"CONTEXT","data":{"topic":"conversation"}}""" }, + CancellationToken.None); + + Assert.Single(replies); + using var payload = System.Text.Json.JsonDocument.Parse(replies[0].Text!); + Assert.Equal("OPENJIBO_TURN_PENDING", payload.RootElement.GetProperty("type").GetString()); + Assert.Equal(12000, session.TurnState.BufferedAudioBytes); + Assert.Equal("ffmpeg failed", session.TurnState.LastSttError); + + sink.Verify(s => s.RecordTranscriptError(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once()); + } +} diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs index 2a42198..7224389 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs @@ -135,6 +135,10 @@ public sealed class JiboWebSocketServiceTests Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0])); } + var session = _store.FindSessionByToken("hub-auto-finalize-token"); + Assert.NotNull(session); + session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2); + replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope { HostName = "neo-hub.jibo.com", @@ -192,6 +196,10 @@ public sealed class JiboWebSocketServiceTests Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0])); } + var session = _store.FindSessionByToken("hub-auto-fallback-token"); + Assert.NotNull(session); + session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2); + replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope { HostName = "neo-hub.jibo.com",