From 70d36bf56375b895e875287a0b2288bdc7e0b28e Mon Sep 17 00:00:00 2001 From: Jacob Dubin Date: Sun, 12 Apr 2026 08:31:33 -0500 Subject: [PATCH] next round of websocket fun --- OpenJibo/OpenJibo.slnx | 1 + OpenJibo/docs/protocol-inventory.md | 4 +- OpenJibo/src/Jibo.Cloud/dotnet/README.md | 2 + .../Services/JiboWebSocketService.cs | 1 + .../Services/ProtocolToTurnContextMapper.cs | 28 ++- .../WebSocketTurnFinalizationService.cs | 74 +++++-- .../Jibo.Cloud.Domain/Models/CloudSession.cs | 5 +- .../Models/WebSocketTurnState.cs | 14 ++ .../src/Jibo.Cloud/node/fixtures/README.md | 2 +- .../neo-hub-buffered-audio-pending.flow.json | 41 ++++ .../neo-hub-multichunk-audio-chat.flow.json | 61 ++++++ .../WebSockets/JiboWebSocketServiceTests.cs | 194 +++++++++++++++++- 12 files changed, 391 insertions(+), 36 deletions(-) create mode 100644 OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs create mode 100644 OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-pending.flow.json create mode 100644 OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-multichunk-audio-chat.flow.json diff --git a/OpenJibo/OpenJibo.slnx b/OpenJibo/OpenJibo.slnx index eb24c72..60f2dee 100644 --- a/OpenJibo/OpenJibo.slnx +++ b/OpenJibo/OpenJibo.slnx @@ -19,6 +19,7 @@ + diff --git a/OpenJibo/docs/protocol-inventory.md b/OpenJibo/docs/protocol-inventory.md index 6296c9b..02b30fd 100644 --- a/OpenJibo/docs/protocol-inventory.md +++ b/OpenJibo/docs/protocol-inventory.md @@ -64,6 +64,7 @@ Observed from `open-jibo-link.js`: The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior: - token/session tracking across websocket turns +- explicit per-turn state tracking for transID, rules, context, buffered audio, and finalize attempts - buffered audio accounting and turn-pending state - `LISTEN` message handling with synthetic `LISTEN` result payload shaping - `CONTEXT` capture for turn/session state @@ -85,7 +86,8 @@ This does not yet mean parity for: The current .NET websocket layer now separates: - robot-facing websocket compatibility -- session and buffered-audio state +- long-lived cloud session state +- per-turn websocket state - transcript resolution / STT selection - turn-to-response mapping diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/README.md b/OpenJibo/src/Jibo.Cloud/dotnet/README.md index e560462..e4c2c0e 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/README.md +++ b/OpenJibo/src/Jibo.Cloud/dotnet/README.md @@ -68,6 +68,7 @@ The intent is to grow from a runnable dev monolith into the real Azure deploymen Current websocket scope is still intentionally narrow: - token-backed socket sessions +- explicit websocket turn-state tracking separate from long-lived cloud session state - synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR` - buffered audio state tracking behind a dedicated turn-finalization layer - synthetic STT strategy selection for fixture-driven audio turn completion @@ -79,6 +80,7 @@ Not yet covered: - real binary audio / ASR finalization parity - provider-backed ASR integration +- timed finalize/fallback behavior matching richer Node turn-state semantics - upstream Nimbus or broader skill lifecycle behavior - animation / expression command families - ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs index 4b9438e..83df7fb 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs @@ -26,6 +26,7 @@ public sealed class JiboWebSocketService( if (!string.IsNullOrWhiteSpace(parsedTransId)) { session.LastTransId = parsedTransId; + session.TurnState.TransId = parsedTransId; } if (parsedType == "CONTEXT") diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs index 06128a3..81e2d1b 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs @@ -8,6 +8,7 @@ public sealed class ProtocolToTurnContextMapper { public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session, string messageType) { + var turnState = session.TurnState; var text = ExtractTranscript(envelope.Text); var protocolOperation = messageType.ToLowerInvariant(); var attributes = new Dictionary(StringComparer.OrdinalIgnoreCase) @@ -15,30 +16,35 @@ public sealed class ProtocolToTurnContextMapper ["messageType"] = messageType }; - if (!string.IsNullOrWhiteSpace(session.LastTransId)) + if (!string.IsNullOrWhiteSpace(turnState.TransId)) { - attributes["transID"] = session.LastTransId; + attributes["transID"] = turnState.TransId; } - if (session.Metadata.TryGetValue("context", out var context)) + if (!string.IsNullOrWhiteSpace(turnState.ContextPayload)) { - attributes["context"] = context; + attributes["context"] = turnState.ContextPayload; } - if (session.Metadata.TryGetValue("listenRules", out var listenRules)) + if (turnState.ListenRules.Count > 0) { - attributes["listenRules"] = listenRules; + attributes["listenRules"] = turnState.ListenRules; } - if (session.BufferedAudioBytes > 0) + if (turnState.BufferedAudioBytes > 0) { - attributes["bufferedAudioBytes"] = session.BufferedAudioBytes; - attributes["bufferedAudioChunks"] = session.BufferedAudioChunkCount; + attributes["bufferedAudioBytes"] = turnState.BufferedAudioBytes; + attributes["bufferedAudioChunks"] = turnState.BufferedAudioChunkCount; } - if (session.Metadata.TryGetValue("audioTranscriptHint", out var audioTranscriptHint)) + if (!string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint)) { - attributes["audioTranscriptHint"] = audioTranscriptHint; + attributes["audioTranscriptHint"] = turnState.AudioTranscriptHint; + } + + if (turnState.FinalizeAttemptCount > 0) + { + attributes["finalizeAttemptCount"] = turnState.FinalizeAttemptCount; } return new TurnContext diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs index 77511da..fb38c22 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs @@ -12,11 +12,12 @@ public sealed class WebSocketTurnFinalizationService( { public IReadOnlyList HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope) { + var turnState = session.TurnState; session.LastMessageType = "BINARY_AUDIO"; - session.BufferedAudioChunkCount += 1; - session.BufferedAudioBytes += envelope.Binary?.Length ?? 0; - session.LastAudioReceivedUtc = DateTimeOffset.UtcNow; - session.AwaitingTurnCompletion = true; + turnState.BufferedAudioChunkCount += 1; + turnState.BufferedAudioBytes += envelope.Binary?.Length ?? 0; + turnState.LastAudioReceivedUtc = DateTimeOffset.UtcNow; + turnState.AwaitingTurnCompletion = true; session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0; return @@ -29,8 +30,8 @@ public sealed class WebSocketTurnFinalizationService( data = new { bytes = envelope.Binary?.Length ?? 0, - bufferedBytes = session.BufferedAudioBytes, - bufferedChunks = session.BufferedAudioChunkCount, + bufferedBytes = turnState.BufferedAudioBytes, + bufferedChunks = turnState.BufferedAudioChunkCount, sessionId = session.SessionId } }) @@ -40,11 +41,14 @@ public sealed class WebSocketTurnFinalizationService( public IReadOnlyList HandleContext(CloudSession session, string? text) { - session.Metadata["context"] = ExtractDataPayload(text); + var turnState = session.TurnState; + turnState.ContextPayload = ExtractDataPayload(text); + session.Metadata["context"] = turnState.ContextPayload; if (TryReadContextProperty(text, "audioTranscriptHint", out var transcriptHint) && !string.IsNullOrWhiteSpace(transcriptHint)) { + turnState.AudioTranscriptHint = transcriptHint; session.Metadata["audioTranscriptHint"] = transcriptHint; } @@ -75,10 +79,15 @@ public sealed class WebSocketTurnFinalizationService( var turn = turnContextMapper.MapListenMessage(envelope, session, messageType); var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken); + var turnState = session.TurnState; if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) && string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript)) { - session.AwaitingTurnCompletion = true; + turnState.AwaitingTurnCompletion = true; + if (turnState.BufferedAudioBytes > 0) + { + turnState.FinalizeAttemptCount += 1; + } return [ new WebSocketReply @@ -90,9 +99,11 @@ public sealed class WebSocketTurnFinalizationService( { sessionId = session.SessionId, transID = session.LastTransId, - bufferedAudioBytes = session.BufferedAudioBytes, - bufferedAudioChunks = session.BufferedAudioChunkCount, - awaitingAudio = true + bufferedAudioBytes = turnState.BufferedAudioBytes, + bufferedAudioChunks = turnState.BufferedAudioChunkCount, + awaitingAudio = turnState.BufferedAudioBytes == 0, + awaitingTranscriptHint = turnState.BufferedAudioBytes > 0 && string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint), + finalizeAttempts = turnState.FinalizeAttemptCount } }) } @@ -107,7 +118,7 @@ public sealed class WebSocketTurnFinalizationService( session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen ? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout) : null; - session.AwaitingTurnCompletion = false; + turnState.AwaitingTurnCompletion = false; var emitSkillActions = messageType != "CLIENT_NLU"; var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply @@ -126,7 +137,7 @@ public sealed class WebSocketTurnFinalizationService( return turn; } - if (session.BufferedAudioBytes <= 0) + if (session.TurnState.BufferedAudioBytes <= 0) { return turn; } @@ -178,6 +189,7 @@ public sealed class WebSocketTurnFinalizationService( private static void PersistTurnHints(CloudSession session, string? text) { + var turnState = session.TurnState; if (string.IsNullOrWhiteSpace(text)) { return; @@ -188,14 +200,26 @@ public sealed class WebSocketTurnFinalizationService( using var document = JsonDocument.Parse(text); var root = document.RootElement; + if (root.TryGetProperty("transID", out var transId) && transId.ValueKind == JsonValueKind.String) + { + var nextTransId = transId.GetString(); + if (!string.IsNullOrWhiteSpace(nextTransId) && + !string.Equals(turnState.TransId, nextTransId, StringComparison.Ordinal)) + { + ResetTurnState(turnState, nextTransId); + session.LastTransId = nextTransId; + } + } + if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object) { if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array) { - session.Metadata["listenRules"] = rules.EnumerateArray() + turnState.ListenRules = rules.EnumerateArray() .Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString()) .Where(rule => !string.IsNullOrWhiteSpace(rule)) .ToArray(); + session.Metadata["listenRules"] = turnState.ListenRules; } if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String) @@ -205,7 +229,8 @@ public sealed class WebSocketTurnFinalizationService( if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String) { - session.Metadata["audioTranscriptHint"] = transcriptHint.GetString(); + turnState.AudioTranscriptHint = transcriptHint.GetString(); + session.Metadata["audioTranscriptHint"] = turnState.AudioTranscriptHint; } } } @@ -217,11 +242,26 @@ public sealed class WebSocketTurnFinalizationService( private static void ResetBufferedAudio(CloudSession session) { - session.BufferedAudioBytes = 0; - session.BufferedAudioChunkCount = 0; + session.TurnState.BufferedAudioBytes = 0; + session.TurnState.BufferedAudioChunkCount = 0; + session.TurnState.LastAudioReceivedUtc = null; + session.TurnState.FinalizeAttemptCount = 0; session.Metadata.Remove("audioTranscriptHint"); } + private static void ResetTurnState(WebSocketTurnState turnState, string? transId) + { + turnState.TransId = transId; + turnState.ContextPayload = null; + turnState.AudioTranscriptHint = null; + turnState.LastAudioReceivedUtc = null; + turnState.BufferedAudioChunkCount = 0; + turnState.BufferedAudioBytes = 0; + turnState.FinalizeAttemptCount = 0; + turnState.AwaitingTurnCompletion = false; + turnState.ListenRules = []; + } + private static string? ExtractDataPayload(string? text) { if (string.IsNullOrWhiteSpace(text)) diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs index 11d55a3..8185464 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs @@ -11,16 +11,13 @@ public sealed class CloudSession public string? Path { get; init; } public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow; public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow; - public DateTimeOffset? LastAudioReceivedUtc { get; set; } public DateTimeOffset? FollowUpExpiresUtc { get; set; } public string? LastMessageType { get; set; } public string? LastListenType { get; set; } public string? LastIntent { get; set; } public string? LastTranscript { get; set; } public string? LastTransId { get; set; } - public int BufferedAudioChunkCount { get; set; } - public int BufferedAudioBytes { get; set; } - public bool AwaitingTurnCompletion { get; set; } public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow; + public WebSocketTurnState TurnState { get; } = new(); public IDictionary Metadata { get; init; } = new Dictionary(); } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs new file mode 100644 index 0000000..2390a78 --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/WebSocketTurnState.cs @@ -0,0 +1,14 @@ +namespace Jibo.Cloud.Domain.Models; + +public sealed class WebSocketTurnState +{ + public string? TransId { get; set; } + public string? ContextPayload { get; set; } + public string? AudioTranscriptHint { get; set; } + public DateTimeOffset? LastAudioReceivedUtc { get; set; } + public int BufferedAudioChunkCount { get; set; } + public int BufferedAudioBytes { get; set; } + public int FinalizeAttemptCount { get; set; } + public bool AwaitingTurnCompletion { get; set; } + public IReadOnlyList ListenRules { get; set; } = []; +} diff --git a/OpenJibo/src/Jibo.Cloud/node/fixtures/README.md b/OpenJibo/src/Jibo.Cloud/node/fixtures/README.md index 83eacff..a14f979 100644 --- a/OpenJibo/src/Jibo.Cloud/node/fixtures/README.md +++ b/OpenJibo/src/Jibo.Cloud/node/fixtures/README.md @@ -7,6 +7,6 @@ Current fixture groups: - `http/` Basic `X-Amz-Target` request and response examples for startup flows. - `websocket/` - Sanitized Neo-Hub turn-flow examples used to replay `LISTEN`, `CONTEXT`, `CLIENT_NLU`, `CLIENT_ASR`, and synthetic `EOS` / `SKILL_ACTION` behavior against the .NET implementation. + Sanitized Neo-Hub turn-flow examples used to replay `LISTEN`, `CONTEXT`, `CLIENT_NLU`, `CLIENT_ASR`, buffered-audio accumulation, pending/finalize states, and synthetic `EOS` / `SKILL_ACTION` behavior against the .NET implementation. Expand this folder whenever new robot traffic is captured and cleaned. diff --git a/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-pending.flow.json b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-pending.flow.json new file mode 100644 index 0000000..1c9aa18 --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-pending.flow.json @@ -0,0 +1,41 @@ +{ + "name": "neo-hub buffered audio pending flow", + "session": { + "hostName": "neo-hub.jibo.com", + "path": "/listen", + "kind": "neo-hub-listen", + "token": "fixture-pending-audio-token" + }, + "steps": [ + { + "text": { + "type": "LISTEN", + "transID": "fixture-trans-pending", + "data": { + "rules": [ + "wake-word" + ] + } + }, + "expectedReplyTypes": [ + "OPENJIBO_TURN_PENDING" + ] + }, + { + "binary": [1, 2, 3, 4], + "expectedReplyTypes": [ + "OPENJIBO_AUDIO_RECEIVED" + ] + }, + { + "text": { + "type": "CLIENT_ASR", + "transID": "fixture-trans-pending", + "data": { } + }, + "expectedReplyTypes": [ + "OPENJIBO_TURN_PENDING" + ] + } + ] +} diff --git a/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-multichunk-audio-chat.flow.json b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-multichunk-audio-chat.flow.json new file mode 100644 index 0000000..2c73bb4 --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-multichunk-audio-chat.flow.json @@ -0,0 +1,61 @@ +{ + "name": "neo-hub multichunk audio chat flow", + "session": { + "hostName": "neo-hub.jibo.com", + "path": "/listen", + "kind": "neo-hub-listen", + "token": "fixture-audio-chat-token" + }, + "steps": [ + { + "text": { + "type": "LISTEN", + "transID": "fixture-trans-audio-chat", + "data": { + "rules": [ + "wake-word" + ] + } + }, + "expectedReplyTypes": [ + "OPENJIBO_TURN_PENDING" + ] + }, + { + "text": { + "type": "CONTEXT", + "transID": "fixture-trans-audio-chat", + "data": { + "audioTranscriptHint": "hello from buffered audio" + } + }, + "expectedReplyTypes": [ + "OPENJIBO_CONTEXT_ACK" + ] + }, + { + "binary": [1, 2, 3], + "expectedReplyTypes": [ + "OPENJIBO_AUDIO_RECEIVED" + ] + }, + { + "binary": [4, 5, 6, 7], + "expectedReplyTypes": [ + "OPENJIBO_AUDIO_RECEIVED" + ] + }, + { + "text": { + "type": "CLIENT_ASR", + "transID": "fixture-trans-audio-chat", + "data": { } + }, + "expectedReplyTypes": [ + "LISTEN", + "EOS", + "SKILL_ACTION" + ] + } + ] +} diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs index a76c643..c0e03a4 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs @@ -72,6 +72,48 @@ public sealed class JiboWebSocketServiceTests Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32()); } + [Fact] + public async Task MultiChunkAudio_AccumulatesBufferedStateAcrossMessages() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-multichunk-token", + Text = """{"type":"LISTEN","transID":"trans-multi","data":{"rules":["wake-word"]}}""" + }); + + var firstAudioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-multichunk-token", + Binary = [1, 2, 3] + }); + + var secondAudioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-multichunk-token", + Binary = [4, 5, 6, 7] + }); + + using var firstPayload = JsonDocument.Parse(firstAudioReplies[0].Text!); + using var secondPayload = JsonDocument.Parse(secondAudioReplies[0].Text!); + Assert.Equal(3, firstPayload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32()); + Assert.Equal(7, secondPayload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32()); + Assert.Equal(2, secondPayload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32()); + + var session = _store.FindSessionByToken("hub-multichunk-token"); + Assert.NotNull(session); + Assert.Equal(7, session!.TurnState.BufferedAudioBytes); + Assert.Equal(2, session.TurnState.BufferedAudioChunkCount); + } + [Fact] public async Task ContextThenClientNlu_UsesFollowUpTurnStateAndSkipsSkillAction() { @@ -175,15 +217,163 @@ public sealed class JiboWebSocketServiceTests var session = _store.FindSessionByToken("hub-audio-token"); Assert.NotNull(session); - Assert.Equal(0, session!.BufferedAudioBytes); - Assert.Equal(0, session.BufferedAudioChunkCount); + Assert.Equal(0, session!.TurnState.BufferedAudioBytes); + Assert.Equal(0, session.TurnState.BufferedAudioChunkCount); Assert.False(session.Metadata.ContainsKey("audioTranscriptHint")); } + [Fact] + public async Task BufferedAudio_WithoutTranscriptHint_RemainsPending() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-pending-token", + Text = """{"type":"LISTEN","transID":"trans-pending","data":{"rules":["wake-word"]}}""" + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-pending-token", + Binary = [1, 2, 3, 4] + }); + + var finalizeReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-pending-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-pending","data":{}}""" + }); + + Assert.Single(finalizeReplies); + Assert.Equal("OPENJIBO_TURN_PENDING", ReadReplyType(finalizeReplies[0])); + + using var payload = JsonDocument.Parse(finalizeReplies[0].Text!); + Assert.True(payload.RootElement.GetProperty("data").GetProperty("awaitingTranscriptHint").GetBoolean()); + Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("finalizeAttempts").GetInt32()); + } + + [Fact] + public async Task BufferedAudio_WithChatTranscriptHint_FinalizesAsChat() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-chat-token", + Text = """{"type":"LISTEN","transID":"trans-audio-chat","data":{"rules":["wake-word"]}}""" + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-chat-token", + Text = """{"type":"CONTEXT","transID":"trans-audio-chat","data":{"audioTranscriptHint":"hello from buffered audio"}}""" + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-chat-token", + Binary = [1, 2, 3, 4, 5] + }); + + var finalizeReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-chat-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-audio-chat","data":{}}""" + }); + + Assert.Equal(3, finalizeReplies.Count); + using var listenPayload = JsonDocument.Parse(finalizeReplies[0].Text!); + Assert.Equal("hello from buffered audio", listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString()); + Assert.Equal("chat", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + + using var skillPayload = JsonDocument.Parse(finalizeReplies[2].Text!); + Assert.Equal("chitchat-skill", skillPayload.RootElement.GetProperty("data").GetProperty("skill").GetProperty("id").GetString()); + } + + [Fact] + public async Task FollowUpTurn_UsesNewTurnStateWithoutLeakingBufferedAudio() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-followup-audio-token", + Text = """{"type":"LISTEN","transID":"trans-first","data":{"rules":["wake-word"]}}""" + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-followup-audio-token", + Text = """{"type":"CONTEXT","transID":"trans-first","data":{"audioTranscriptHint":"tell me a joke"}}""" + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-followup-audio-token", + Binary = [1, 2, 3, 4] + }); + + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-followup-audio-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-first","data":{}}""" + }); + + var followUpReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-followup-audio-token", + Text = """{"type":"LISTEN","transID":"trans-second","data":{"text":"what time is it","rules":["follow-up"]}}""" + }); + + Assert.Equal(3, followUpReplies.Count); + using var payload = JsonDocument.Parse(followUpReplies[0].Text!); + Assert.Equal("time", payload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + Assert.Equal("trans-second", payload.RootElement.GetProperty("transID").GetString()); + + var session = _store.FindSessionByToken("hub-followup-audio-token"); + Assert.NotNull(session); + Assert.Equal("trans-second", session!.TurnState.TransId); + Assert.Equal(0, session.TurnState.BufferedAudioBytes); + Assert.Equal(0, session.TurnState.BufferedAudioChunkCount); + } + [Theory] [InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")] [InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")] [InlineData("fixtures\\neo-hub-buffered-audio-synthetic-asr.flow.json")] + [InlineData("fixtures\\neo-hub-multichunk-audio-chat.flow.json")] + [InlineData("fixtures\\neo-hub-buffered-audio-pending.flow.json")] public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath) { var fixture = WebSocketFixtureLoader.Load(relativePath);