From 41fc9a1ef6553bbf4125e8ade6e7c8ea9b504b38 Mon Sep 17 00:00:00 2001 From: Jacob Dubin Date: Sat, 11 Apr 2026 22:11:08 -0500 Subject: [PATCH] another pass complete --- OpenJibo/docs/protocol-inventory.md | 16 +- OpenJibo/src/Jibo.Cloud/dotnet/README.md | 3 + .../Services/DefaultSttStrategySelector.cs | 19 ++ .../Services/JiboWebSocketService.cs | 140 +-------- .../Services/ProtocolToTurnContextMapper.cs | 28 +- .../SyntheticBufferedAudioSttStrategy.cs | 59 ++++ .../WebSocketTurnFinalizationService.cs | 274 ++++++++++++++++++ .../Jibo.Cloud.Domain/Models/CloudSession.cs | 4 + .../ServiceCollectionExtensions.cs | 3 + ...hub-buffered-audio-synthetic-asr.flow.json | 56 ++++ .../WebSockets/JiboWebSocketServiceTests.cs | 83 +++++- 11 files changed, 542 insertions(+), 143 deletions(-) create mode 100644 OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/DefaultSttStrategySelector.cs create mode 100644 OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs create mode 100644 OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs create mode 100644 OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-synthetic-asr.flow.json diff --git a/OpenJibo/docs/protocol-inventory.md b/OpenJibo/docs/protocol-inventory.md index f41c31f..6296c9b 100644 --- a/OpenJibo/docs/protocol-inventory.md +++ b/OpenJibo/docs/protocol-inventory.md @@ -64,21 +64,33 @@ Observed from `open-jibo-link.js`: The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior: - token/session tracking across websocket turns +- buffered audio accounting and turn-pending state - `LISTEN` message handling with synthetic `LISTEN` result payload shaping - `CONTEXT` capture for turn/session state - `CLIENT_NLU` turn completion using remembered listen/session metadata -- `CLIENT_ASR` text-driven turn completion +- `CLIENT_ASR` turn completion, including a synthetic STT seam for buffered-audio replay - `EOS` emission after completed turns - first richer vertical slice for joke/chat `SKILL_ACTION` playback This does not yet mean parity for: - real binary audio buffering and finalization -- external ASR lifecycle timing +- real STT provider integration and external ASR lifecycle timing - early-EOS behavior - multi-step skill lifecycles beyond the current synthetic playback response - broader interaction, animation, or ESML command families +### Internal ASR Direction + +The current .NET websocket layer now separates: + +- robot-facing websocket compatibility +- session and buffered-audio state +- transcript resolution / STT selection +- turn-to-response mapping + +That separation is intentional. The synthetic STT path currently exists only to support fixture-driven replay while parity work continues. It should be treated as an internal compatibility seam, not as the final production ASR design. + ## Upload Paths | Path | Purpose | Confidence | Current .NET status | diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/README.md b/OpenJibo/src/Jibo.Cloud/dotnet/README.md index f18cbb0..e560462 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/README.md +++ b/OpenJibo/src/Jibo.Cloud/dotnet/README.md @@ -69,6 +69,8 @@ Current websocket scope is still intentionally narrow: - token-backed socket sessions - synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR` +- buffered audio state tracking behind a dedicated turn-finalization layer +- synthetic STT strategy selection for fixture-driven audio turn completion - `CONTEXT` capture and follow-up turn state - `EOS` completion - first skill vertical for joke/chat `SKILL_ACTION` playback @@ -76,6 +78,7 @@ Current websocket scope is still intentionally narrow: Not yet covered: - real binary audio / ASR finalization parity +- provider-backed ASR integration - upstream Nimbus or broader skill lifecycle behavior - animation / expression command families - ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/DefaultSttStrategySelector.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/DefaultSttStrategySelector.cs new file mode 100644 index 0000000..ca5f8d0 --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/DefaultSttStrategySelector.cs @@ -0,0 +1,19 @@ +using Jibo.Runtime.Abstractions; + +namespace Jibo.Cloud.Application.Services; + +public sealed class DefaultSttStrategySelector(IEnumerable strategies) : ISttStrategySelector +{ + private readonly IReadOnlyList _strategies = strategies.ToArray(); + + public Task SelectAsync(TurnContext turn, CancellationToken cancellationToken = default) + { + var strategy = _strategies.FirstOrDefault(candidate => candidate.CanHandle(turn)); + if (strategy is null) + { + throw new InvalidOperationException("No STT strategy can handle the current turn."); + } + + return Task.FromResult(strategy); + } +} diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs index cf16976..4b9438e 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboWebSocketService.cs @@ -7,9 +7,7 @@ namespace Jibo.Cloud.Application.Services; public sealed class JiboWebSocketService( ICloudStateStore stateStore, - ProtocolToTurnContextMapper turnContextMapper, - IConversationBroker conversationBroker, - ResponsePlanToSocketMessagesMapper replyMapper) + WebSocketTurnFinalizationService turnFinalizationService) { public async Task> HandleMessageAsync(WebSocketMessageEnvelope envelope, CancellationToken cancellationToken = default) { @@ -19,23 +17,7 @@ public sealed class JiboWebSocketService( if (envelope.IsBinary) { - session.LastMessageType = "BINARY_AUDIO"; - session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0; - return - [ - new WebSocketReply - { - Text = JsonSerializer.Serialize(new - { - type = "OPENJIBO_AUDIO_RECEIVED", - data = new - { - bytes = envelope.Binary?.Length ?? 0, - sessionId = session.SessionId - } - }) - } - ]; + return turnFinalizationService.HandleBinaryAudio(session, envelope); } var parsedType = ReadMessageType(envelope.Text); @@ -48,64 +30,12 @@ public sealed class JiboWebSocketService( if (parsedType == "CONTEXT") { - session.Metadata["context"] = ExtractDataPayload(envelope.Text); - return - [ - new WebSocketReply - { - Text = JsonSerializer.Serialize(new - { - type = "OPENJIBO_CONTEXT_ACK", - data = new - { - sessionId = session.SessionId, - transID = session.LastTransId - } - }) - } - ]; + return turnFinalizationService.HandleContext(session, envelope.Text); } if (parsedType is "LISTEN" or "CLIENT_NLU" or "CLIENT_ASR") { - PersistTurnHints(session, envelope.Text, parsedType); - - var turn = turnContextMapper.MapListenMessage(envelope, session, parsedType); - if (string.IsNullOrWhiteSpace(turn.NormalizedTranscript) && - string.IsNullOrWhiteSpace(turn.RawTranscript)) - { - return - [ - new WebSocketReply - { - Text = JsonSerializer.Serialize(new - { - type = "OPENJIBO_ACK", - data = new - { - messageType = parsedType, - sessionId = session.SessionId, - transID = session.LastTransId - } - }) - } - ]; - } - - var plan = await conversationBroker.HandleTurnAsync(turn, cancellationToken); - var listenAction = plan.Actions.OfType().OrderBy(action => action.Sequence).LastOrDefault(); - session.LastTranscript = turn.NormalizedTranscript ?? turn.RawTranscript; - session.LastIntent = plan.IntentName; - session.LastListenType = listenAction?.Mode; - session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen - ? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout) - : null; - - var emitSkillActions = parsedType != "CLIENT_NLU"; - return replyMapper.Map(plan, turn, session, emitSkillActions).Select(text => new WebSocketReply - { - Text = text - }).ToArray(); + return await turnFinalizationService.HandleTurnAsync(session, envelope, parsedType, cancellationToken); } return @@ -125,45 +55,6 @@ public sealed class JiboWebSocketService( ]; } - private static void PersistTurnHints(CloudSession session, string? text, string messageType) - { - if (string.IsNullOrWhiteSpace(text)) - { - return; - } - - try - { - using var document = JsonDocument.Parse(text); - var root = document.RootElement; - - if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object) - { - if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array) - { - session.Metadata["listenRules"] = rules.EnumerateArray() - .Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString()) - .Where(rule => !string.IsNullOrWhiteSpace(rule)) - .ToArray(); - } - - if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String) - { - session.LastIntent = intent.GetString(); - } - - if (messageType == "CONTEXT") - { - session.Metadata["context"] = data.GetRawText(); - } - } - } - catch - { - // Keep the compatibility layer permissive while captures are still incomplete. - } - } - private static string ReadMessageType(string? text) { if (string.IsNullOrWhiteSpace(text)) @@ -209,27 +100,4 @@ public sealed class JiboWebSocketService( return null; } - - private static string? ExtractDataPayload(string? text) - { - if (string.IsNullOrWhiteSpace(text)) - { - return null; - } - - try - { - using var document = JsonDocument.Parse(text); - if (document.RootElement.TryGetProperty("data", out var data)) - { - return data.GetRawText(); - } - } - catch - { - return null; - } - - return null; - } } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs index cb0dce2..06128a3 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ProtocolToTurnContextMapper.cs @@ -30,6 +30,17 @@ public sealed class ProtocolToTurnContextMapper attributes["listenRules"] = listenRules; } + if (session.BufferedAudioBytes > 0) + { + attributes["bufferedAudioBytes"] = session.BufferedAudioBytes; + attributes["bufferedAudioChunks"] = session.BufferedAudioChunkCount; + } + + if (session.Metadata.TryGetValue("audioTranscriptHint", out var audioTranscriptHint)) + { + attributes["audioTranscriptHint"] = audioTranscriptHint; + } + return new TurnContext { SessionId = session.SessionId, @@ -68,17 +79,30 @@ public sealed class ProtocolToTurnContextMapper return transcript.GetString(); } + if (data.TryGetProperty("asr", out var asr) && + asr.ValueKind == JsonValueKind.Object && + asr.TryGetProperty("text", out var asrText) && + asrText.ValueKind == JsonValueKind.String) + { + return asrText.GetString(); + } + + if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String) + { + return transcriptHint.GetString(); + } + if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String) { return intent.GetString(); } } + + return null; } catch { return text; } - - return text; } } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs new file mode 100644 index 0000000..0061f7f --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs @@ -0,0 +1,59 @@ +using Jibo.Runtime.Abstractions; + +namespace Jibo.Cloud.Application.Services; + +public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy +{ + public string Name => "synthetic-buffered-audio"; + + public bool CanHandle(TurnContext turn) + { + return ReadBufferedAudioBytes(turn) > 0 && + !string.IsNullOrWhiteSpace(ReadTranscriptHint(turn)); + } + + public Task TranscribeAsync(TurnContext turn, CancellationToken cancellationToken = default) + { + var transcriptHint = ReadTranscriptHint(turn); + if (string.IsNullOrWhiteSpace(transcriptHint)) + { + throw new InvalidOperationException("Synthetic buffered audio STT requires an audio transcript hint."); + } + + return Task.FromResult(new SttResult + { + Text = transcriptHint.Trim(), + Provider = Name, + Confidence = 0.75f, + Locale = turn.Locale, + Metadata = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["bufferedAudioBytes"] = ReadBufferedAudioBytes(turn), + ["mode"] = "fixture-hint" + } + }); + } + + private static int ReadBufferedAudioBytes(TurnContext turn) + { + if (!turn.Attributes.TryGetValue("bufferedAudioBytes", out var bufferedAudioBytes)) + { + return 0; + } + + return bufferedAudioBytes switch + { + int value => value, + long value => (int)value, + string value when int.TryParse(value, out var parsed) => parsed, + _ => 0 + }; + } + + private static string? ReadTranscriptHint(TurnContext turn) + { + return turn.Attributes.TryGetValue("audioTranscriptHint", out var transcriptHint) + ? transcriptHint?.ToString() + : null; + } +} diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs new file mode 100644 index 0000000..77511da --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs @@ -0,0 +1,274 @@ +using System.Text.Json; +using Jibo.Cloud.Domain.Models; +using Jibo.Runtime.Abstractions; + +namespace Jibo.Cloud.Application.Services; + +public sealed class WebSocketTurnFinalizationService( + ProtocolToTurnContextMapper turnContextMapper, + IConversationBroker conversationBroker, + ResponsePlanToSocketMessagesMapper replyMapper, + ISttStrategySelector sttStrategySelector) +{ + public IReadOnlyList HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope) + { + session.LastMessageType = "BINARY_AUDIO"; + session.BufferedAudioChunkCount += 1; + session.BufferedAudioBytes += envelope.Binary?.Length ?? 0; + session.LastAudioReceivedUtc = DateTimeOffset.UtcNow; + session.AwaitingTurnCompletion = true; + session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0; + + return + [ + new WebSocketReply + { + Text = JsonSerializer.Serialize(new + { + type = "OPENJIBO_AUDIO_RECEIVED", + data = new + { + bytes = envelope.Binary?.Length ?? 0, + bufferedBytes = session.BufferedAudioBytes, + bufferedChunks = session.BufferedAudioChunkCount, + sessionId = session.SessionId + } + }) + } + ]; + } + + public IReadOnlyList HandleContext(CloudSession session, string? text) + { + session.Metadata["context"] = ExtractDataPayload(text); + + if (TryReadContextProperty(text, "audioTranscriptHint", out var transcriptHint) && + !string.IsNullOrWhiteSpace(transcriptHint)) + { + session.Metadata["audioTranscriptHint"] = transcriptHint; + } + + return + [ + new WebSocketReply + { + Text = JsonSerializer.Serialize(new + { + type = "OPENJIBO_CONTEXT_ACK", + data = new + { + sessionId = session.SessionId, + transID = session.LastTransId + } + }) + } + ]; + } + + public async Task> HandleTurnAsync( + CloudSession session, + WebSocketMessageEnvelope envelope, + string messageType, + CancellationToken cancellationToken = default) + { + PersistTurnHints(session, envelope.Text); + + var turn = turnContextMapper.MapListenMessage(envelope, session, messageType); + var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken); + if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) && + string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript)) + { + session.AwaitingTurnCompletion = true; + return + [ + new WebSocketReply + { + Text = JsonSerializer.Serialize(new + { + type = "OPENJIBO_TURN_PENDING", + data = new + { + sessionId = session.SessionId, + transID = session.LastTransId, + bufferedAudioBytes = session.BufferedAudioBytes, + bufferedAudioChunks = session.BufferedAudioChunkCount, + awaitingAudio = true + } + }) + } + ]; + } + + var plan = await conversationBroker.HandleTurnAsync(finalizedTurn, cancellationToken); + var listenAction = plan.Actions.OfType().OrderBy(action => action.Sequence).LastOrDefault(); + session.LastTranscript = finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript; + session.LastIntent = plan.IntentName; + session.LastListenType = listenAction?.Mode; + session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen + ? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout) + : null; + session.AwaitingTurnCompletion = false; + + var emitSkillActions = messageType != "CLIENT_NLU"; + var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply + { + Text = text + }).ToArray(); + + ResetBufferedAudio(session); + return replies; + } + + private async Task ResolveTranscriptAsync(TurnContext turn, CloudSession session, CancellationToken cancellationToken) + { + if (!string.IsNullOrWhiteSpace(turn.NormalizedTranscript) || !string.IsNullOrWhiteSpace(turn.RawTranscript)) + { + return turn; + } + + if (session.BufferedAudioBytes <= 0) + { + return turn; + } + + try + { + var strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken); + var sttResult = await strategy.TranscribeAsync(turn, cancellationToken); + + var attributes = new Dictionary(turn.Attributes, StringComparer.OrdinalIgnoreCase) + { + ["sttProvider"] = sttResult.Provider, + ["sttConfidence"] = sttResult.Confidence + }; + + foreach (var pair in sttResult.Metadata) + { + attributes[$"stt:{pair.Key}"] = pair.Value; + } + + return new TurnContext + { + TurnId = turn.TurnId, + SessionId = turn.SessionId, + TimestampUtc = turn.TimestampUtc, + InputMode = turn.InputMode, + SourceKind = turn.SourceKind, + WakePhrase = turn.WakePhrase, + RawTranscript = sttResult.Text, + NormalizedTranscript = sttResult.Text.Trim(), + DeviceId = turn.DeviceId, + HostName = turn.HostName, + RequestId = turn.RequestId, + ProtocolService = turn.ProtocolService, + ProtocolOperation = turn.ProtocolOperation, + FirmwareVersion = turn.FirmwareVersion, + ApplicationVersion = turn.ApplicationVersion, + Locale = sttResult.Locale ?? turn.Locale, + TimeZone = turn.TimeZone, + IsFollowUpEligible = turn.IsFollowUpEligible, + Attributes = attributes + }; + } + catch + { + return turn; + } + } + + private static void PersistTurnHints(CloudSession session, string? text) + { + if (string.IsNullOrWhiteSpace(text)) + { + return; + } + + try + { + using var document = JsonDocument.Parse(text); + var root = document.RootElement; + + if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object) + { + if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array) + { + session.Metadata["listenRules"] = rules.EnumerateArray() + .Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString()) + .Where(rule => !string.IsNullOrWhiteSpace(rule)) + .ToArray(); + } + + if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String) + { + session.LastIntent = intent.GetString(); + } + + if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String) + { + session.Metadata["audioTranscriptHint"] = transcriptHint.GetString(); + } + } + } + catch + { + // Keep the compatibility layer permissive while captures are still incomplete. + } + } + + private static void ResetBufferedAudio(CloudSession session) + { + session.BufferedAudioBytes = 0; + session.BufferedAudioChunkCount = 0; + session.Metadata.Remove("audioTranscriptHint"); + } + + private static string? ExtractDataPayload(string? text) + { + if (string.IsNullOrWhiteSpace(text)) + { + return null; + } + + try + { + using var document = JsonDocument.Parse(text); + if (document.RootElement.TryGetProperty("data", out var data)) + { + return data.GetRawText(); + } + } + catch + { + return null; + } + + return null; + } + + private static bool TryReadContextProperty(string? text, string propertyName, out string? value) + { + value = null; + if (string.IsNullOrWhiteSpace(text)) + { + return false; + } + + try + { + using var document = JsonDocument.Parse(text); + if (!document.RootElement.TryGetProperty("data", out var data) || + !data.TryGetProperty(propertyName, out var property) || + property.ValueKind != JsonValueKind.String) + { + return false; + } + + value = property.GetString(); + return !string.IsNullOrWhiteSpace(value); + } + catch + { + return false; + } + } +} diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs index 3ce0eaf..11d55a3 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Domain/Models/CloudSession.cs @@ -11,12 +11,16 @@ public sealed class CloudSession public string? Path { get; init; } public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow; public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow; + public DateTimeOffset? LastAudioReceivedUtc { get; set; } public DateTimeOffset? FollowUpExpiresUtc { get; set; } public string? LastMessageType { get; set; } public string? LastListenType { get; set; } public string? LastIntent { get; set; } public string? LastTranscript { get; set; } public string? LastTransId { get; set; } + public int BufferedAudioChunkCount { get; set; } + public int BufferedAudioBytes { get; set; } + public bool AwaitingTurnCompletion { get; set; } public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow; public IDictionary Metadata { get; init; } = new Dictionary(); } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs index 07251c5..5122a0e 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs @@ -12,8 +12,11 @@ public static class ServiceCollectionExtensions { services.AddSingleton(); services.AddSingleton(); + services.AddSingleton(); + services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); + services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); diff --git a/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-synthetic-asr.flow.json b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-synthetic-asr.flow.json new file mode 100644 index 0000000..7cd6471 --- /dev/null +++ b/OpenJibo/src/Jibo.Cloud/node/fixtures/websocket/neo-hub-buffered-audio-synthetic-asr.flow.json @@ -0,0 +1,56 @@ +{ + "name": "neo-hub buffered audio synthetic asr flow", + "session": { + "hostName": "neo-hub.jibo.com", + "path": "/listen", + "kind": "neo-hub-listen", + "token": "fixture-audio-token" + }, + "steps": [ + { + "text": { + "type": "LISTEN", + "transID": "fixture-trans-audio", + "data": { + "rules": [ + "wake-word" + ] + } + }, + "expectedReplyTypes": [ + "OPENJIBO_TURN_PENDING" + ] + }, + { + "text": { + "type": "CONTEXT", + "transID": "fixture-trans-audio", + "data": { + "topic": "conversation", + "audioTranscriptHint": "tell me a joke" + } + }, + "expectedReplyTypes": [ + "OPENJIBO_CONTEXT_ACK" + ] + }, + { + "binary": [1, 2, 3, 4, 5, 6], + "expectedReplyTypes": [ + "OPENJIBO_AUDIO_RECEIVED" + ] + }, + { + "text": { + "type": "CLIENT_ASR", + "transID": "fixture-trans-audio", + "data": { } + }, + "expectedReplyTypes": [ + "LISTEN", + "EOS", + "SKILL_ACTION" + ] + } + ] +} diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs index f1174d7..a76c643 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs @@ -14,11 +14,21 @@ public sealed class JiboWebSocketServiceTests public JiboWebSocketServiceTests() { _store = new InMemoryCloudStateStore(); + var turnContextMapper = new ProtocolToTurnContextMapper(); + var conversationBroker = new DemoConversationBroker(); + var replyMapper = new ResponsePlanToSocketMessagesMapper(); + var sttSelector = new DefaultSttStrategySelector( + [ + new SyntheticBufferedAudioSttStrategy() + ]); + _service = new JiboWebSocketService( _store, - new ProtocolToTurnContextMapper(), - new DemoConversationBroker(), - new ResponsePlanToSocketMessagesMapper()); + new WebSocketTurnFinalizationService( + turnContextMapper, + conversationBroker, + replyMapper, + sttSelector)); } [Fact] @@ -58,6 +68,8 @@ public sealed class JiboWebSocketServiceTests using var payload = JsonDocument.Parse(replies[0].Text!); Assert.Equal("OPENJIBO_AUDIO_RECEIVED", payload.RootElement.GetProperty("type").GetString()); Assert.Equal(4, payload.RootElement.GetProperty("data").GetProperty("bytes").GetInt32()); + Assert.Equal(4, payload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32()); + Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32()); } [Fact] @@ -104,9 +116,74 @@ public sealed class JiboWebSocketServiceTests Assert.Equal("trans-follow-up", session.LastTransId); } + [Fact] + public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam() + { + var listenReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-token", + Text = """{"type":"LISTEN","transID":"trans-audio","data":{"rules":["wake-word"]}}""" + }); + + Assert.Single(listenReplies); + Assert.Equal("OPENJIBO_TURN_PENDING", ReadReplyType(listenReplies[0])); + + var contextReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-token", + Text = """{"type":"CONTEXT","transID":"trans-audio","data":{"topic":"conversation","audioTranscriptHint":"tell me a joke"}}""" + }); + + Assert.Single(contextReplies); + Assert.Equal("OPENJIBO_CONTEXT_ACK", ReadReplyType(contextReplies[0])); + + var audioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-token", + Binary = [1, 2, 3, 4, 5, 6] + }); + + Assert.Single(audioReplies); + Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(audioReplies[0])); + + var finalizeReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-audio-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-audio","data":{}}""" + }); + + Assert.Equal(3, finalizeReplies.Count); + Assert.Equal("LISTEN", ReadReplyType(finalizeReplies[0])); + Assert.Equal("EOS", ReadReplyType(finalizeReplies[1])); + Assert.Equal("SKILL_ACTION", ReadReplyType(finalizeReplies[2])); + + using var listenPayload = JsonDocument.Parse(finalizeReplies[0].Text!); + Assert.Equal("tell me a joke", listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString()); + Assert.Equal("joke", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + + var session = _store.FindSessionByToken("hub-audio-token"); + Assert.NotNull(session); + Assert.Equal(0, session!.BufferedAudioBytes); + Assert.Equal(0, session.BufferedAudioChunkCount); + Assert.False(session.Metadata.ContainsKey("audioTranscriptHint")); + } + [Theory] [InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")] [InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")] + [InlineData("fixtures\\neo-hub-buffered-audio-synthetic-asr.flow.json")] public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath) { var fixture = WebSocketFixtureLoader.Load(relativePath);