fixes for STT
This commit is contained in:
@@ -15,6 +15,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
private const int AutoFinalizeMinBufferedAudioBytes = 12000;
|
private const int AutoFinalizeMinBufferedAudioBytes = 12000;
|
||||||
private const int AutoFinalizeMinBufferedAudioChunks = 5;
|
private const int AutoFinalizeMinBufferedAudioChunks = 5;
|
||||||
|
private static readonly TimeSpan AutoFinalizeMinTurnAge = TimeSpan.FromMilliseconds(1800);
|
||||||
|
|
||||||
public async Task<IReadOnlyList<WebSocketReply>> HandleBinaryAudioAsync(
|
public async Task<IReadOnlyList<WebSocketReply>> HandleBinaryAudioAsync(
|
||||||
CloudSession session,
|
CloudSession session,
|
||||||
@@ -119,10 +120,28 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
return turn;
|
return turn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ISttStrategy? strategy = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken);
|
||||||
|
}
|
||||||
|
catch (InvalidOperationException ex) when (string.Equals(ex.Message, "No STT strategy can handle the current turn.", StringComparison.Ordinal))
|
||||||
|
{
|
||||||
|
return turn;
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
session.TurnState.LastSttError = ex.Message;
|
||||||
|
session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow;
|
||||||
|
await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken);
|
||||||
|
return turn;
|
||||||
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken);
|
|
||||||
var sttResult = await strategy.TranscribeAsync(turn, cancellationToken);
|
var sttResult = await strategy.TranscribeAsync(turn, cancellationToken);
|
||||||
|
session.TurnState.LastSttError = null;
|
||||||
|
session.TurnState.LastSttErrorUtc = null;
|
||||||
|
|
||||||
var attributes = new Dictionary<string, object?>(turn.Attributes, StringComparer.OrdinalIgnoreCase)
|
var attributes = new Dictionary<string, object?>(turn.Attributes, StringComparer.OrdinalIgnoreCase)
|
||||||
{
|
{
|
||||||
@@ -160,6 +179,8 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
|
session.TurnState.LastSttError = ex.Message;
|
||||||
|
session.TurnState.LastSttErrorUtc = DateTimeOffset.UtcNow;
|
||||||
await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken);
|
await sink.RecordTranscriptError(ex, "Error during STT processing", cancellationToken);
|
||||||
return turn;
|
return turn;
|
||||||
}
|
}
|
||||||
@@ -229,6 +250,8 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
session.TurnState.BufferedAudioBytes = 0;
|
session.TurnState.BufferedAudioBytes = 0;
|
||||||
session.TurnState.BufferedAudioChunkCount = 0;
|
session.TurnState.BufferedAudioChunkCount = 0;
|
||||||
|
session.TurnState.LastSttError = null;
|
||||||
|
session.TurnState.LastSttErrorUtc = null;
|
||||||
session.TurnState.FirstAudioReceivedUtc = null;
|
session.TurnState.FirstAudioReceivedUtc = null;
|
||||||
session.TurnState.LastAudioReceivedUtc = null;
|
session.TurnState.LastAudioReceivedUtc = null;
|
||||||
session.TurnState.BufferedAudioFrames.Clear();
|
session.TurnState.BufferedAudioFrames.Clear();
|
||||||
@@ -241,6 +264,8 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
turnState.TransId = transId;
|
turnState.TransId = transId;
|
||||||
turnState.ContextPayload = null;
|
turnState.ContextPayload = null;
|
||||||
turnState.AudioTranscriptHint = null;
|
turnState.AudioTranscriptHint = null;
|
||||||
|
turnState.LastSttError = null;
|
||||||
|
turnState.LastSttErrorUtc = null;
|
||||||
turnState.FirstAudioReceivedUtc = null;
|
turnState.FirstAudioReceivedUtc = null;
|
||||||
turnState.LastAudioReceivedUtc = null;
|
turnState.LastAudioReceivedUtc = null;
|
||||||
turnState.BufferedAudioChunkCount = 0;
|
turnState.BufferedAudioChunkCount = 0;
|
||||||
@@ -272,7 +297,9 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
turnState.FinalizeAttemptCount += 1;
|
turnState.FinalizeAttemptCount += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allowFallbackOnMissingTranscript && turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes)
|
if (allowFallbackOnMissingTranscript &&
|
||||||
|
turnState.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes &&
|
||||||
|
string.IsNullOrWhiteSpace(turnState.LastSttError))
|
||||||
{
|
{
|
||||||
turnState.AwaitingTurnCompletion = false;
|
turnState.AwaitingTurnCompletion = false;
|
||||||
session.LastTranscript = string.Empty;
|
session.LastTranscript = string.Empty;
|
||||||
@@ -331,12 +358,16 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
private static bool ShouldAutoFinalize(CloudSession session)
|
private static bool ShouldAutoFinalize(CloudSession session)
|
||||||
{
|
{
|
||||||
var turnState = session.TurnState;
|
var turnState = session.TurnState;
|
||||||
|
var turnAge = turnState.FirstAudioReceivedUtc.HasValue
|
||||||
|
? DateTimeOffset.UtcNow - turnState.FirstAudioReceivedUtc.Value
|
||||||
|
: TimeSpan.Zero;
|
||||||
return turnState.AwaitingTurnCompletion &&
|
return turnState.AwaitingTurnCompletion &&
|
||||||
turnState is
|
turnState is
|
||||||
{
|
{
|
||||||
SawListen: true, SawContext: true, BufferedAudioChunkCount: >= AutoFinalizeMinBufferedAudioChunks,
|
SawListen: true, SawContext: true, BufferedAudioChunkCount: >= AutoFinalizeMinBufferedAudioChunks,
|
||||||
BufferedAudioBytes: >= AutoFinalizeMinBufferedAudioBytes
|
BufferedAudioBytes: >= AutoFinalizeMinBufferedAudioBytes
|
||||||
};
|
} &&
|
||||||
|
turnAge >= AutoFinalizeMinTurnAge;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string? ExtractDataPayload(string? text)
|
private static string? ExtractDataPayload(string? text)
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ public sealed class WebSocketTurnState
|
|||||||
public string? TransId { get; set; }
|
public string? TransId { get; set; }
|
||||||
public string? ContextPayload { get; set; }
|
public string? ContextPayload { get; set; }
|
||||||
public string? AudioTranscriptHint { get; set; }
|
public string? AudioTranscriptHint { get; set; }
|
||||||
|
public string? LastSttError { get; set; }
|
||||||
|
public DateTimeOffset? LastSttErrorUtc { get; set; }
|
||||||
public DateTimeOffset? FirstAudioReceivedUtc { get; set; }
|
public DateTimeOffset? FirstAudioReceivedUtc { get; set; }
|
||||||
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
||||||
public int BufferedAudioChunkCount { get; set; }
|
public int BufferedAudioChunkCount { get; set; }
|
||||||
|
|||||||
@@ -29,4 +29,42 @@ public sealed class FileTurnTelemetrySinkTests
|
|||||||
|
|
||||||
sink.Verify(s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Once());
|
sink.Verify(s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Once());
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
[Fact]
|
||||||
|
public async Task AutoFinalize_DoesNotFallbackImmediately_WhenSttThrows()
|
||||||
|
{
|
||||||
|
var sink = new Mock<ITurnTelemetrySink>();
|
||||||
|
var sttStrategySelector = new Mock<ISttStrategySelector>();
|
||||||
|
sttStrategySelector.Setup(s => s.SelectAsync(It.IsAny<TurnContext>(), It.IsAny<CancellationToken>()))
|
||||||
|
.ThrowsAsync(new InvalidOperationException("ffmpeg failed"));
|
||||||
|
|
||||||
|
var turnService = new WebSocketTurnFinalizationService(
|
||||||
|
new ProtocolToTurnContextMapper(),
|
||||||
|
Mock.Of<IConversationBroker>(),
|
||||||
|
new ResponsePlanToSocketMessagesMapper(),
|
||||||
|
sttStrategySelector.Object,
|
||||||
|
sink.Object
|
||||||
|
);
|
||||||
|
|
||||||
|
var session = new CloudSession();
|
||||||
|
session.TurnState.AwaitingTurnCompletion = true;
|
||||||
|
session.TurnState.SawListen = true;
|
||||||
|
session.TurnState.SawContext = true;
|
||||||
|
session.TurnState.BufferedAudioBytes = 12000;
|
||||||
|
session.TurnState.BufferedAudioChunkCount = 5;
|
||||||
|
session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);
|
||||||
|
|
||||||
|
var replies = await turnService.HandleContextAsync(
|
||||||
|
session,
|
||||||
|
new WebSocketMessageEnvelope { Text = """{"type":"CONTEXT","data":{"topic":"conversation"}}""" },
|
||||||
|
CancellationToken.None);
|
||||||
|
|
||||||
|
Assert.Single(replies);
|
||||||
|
using var payload = System.Text.Json.JsonDocument.Parse(replies[0].Text!);
|
||||||
|
Assert.Equal("OPENJIBO_TURN_PENDING", payload.RootElement.GetProperty("type").GetString());
|
||||||
|
Assert.Equal(12000, session.TurnState.BufferedAudioBytes);
|
||||||
|
Assert.Equal("ffmpeg failed", session.TurnState.LastSttError);
|
||||||
|
|
||||||
|
sink.Verify(s => s.RecordTranscriptError(It.IsAny<Exception>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Once());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -135,6 +135,10 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
|
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var session = _store.FindSessionByToken("hub-auto-finalize-token");
|
||||||
|
Assert.NotNull(session);
|
||||||
|
session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);
|
||||||
|
|
||||||
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
{
|
{
|
||||||
HostName = "neo-hub.jibo.com",
|
HostName = "neo-hub.jibo.com",
|
||||||
@@ -192,6 +196,10 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
|
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(replies[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var session = _store.FindSessionByToken("hub-auto-fallback-token");
|
||||||
|
Assert.NotNull(session);
|
||||||
|
session.TurnState.FirstAudioReceivedUtc = DateTimeOffset.UtcNow - TimeSpan.FromSeconds(2);
|
||||||
|
|
||||||
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
{
|
{
|
||||||
HostName = "neo-hub.jibo.com",
|
HostName = "neo-hub.jibo.com",
|
||||||
|
|||||||
Reference in New Issue
Block a user