next round of websocket fun
This commit is contained in:
@@ -19,6 +19,7 @@
|
|||||||
<File Path="scripts/cloud/README.md" />
|
<File Path="scripts/cloud/README.md" />
|
||||||
</Folder>
|
</Folder>
|
||||||
<Folder Name="/Solution Items/">
|
<Folder Name="/Solution Items/">
|
||||||
|
<File Path="NuGet.Config" />
|
||||||
<File Path="README.md" />
|
<File Path="README.md" />
|
||||||
</Folder>
|
</Folder>
|
||||||
<Folder Name="/src/">
|
<Folder Name="/src/">
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ Observed from `open-jibo-link.js`:
|
|||||||
The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior:
|
The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior:
|
||||||
|
|
||||||
- token/session tracking across websocket turns
|
- token/session tracking across websocket turns
|
||||||
|
- explicit per-turn state tracking for transID, rules, context, buffered audio, and finalize attempts
|
||||||
- buffered audio accounting and turn-pending state
|
- buffered audio accounting and turn-pending state
|
||||||
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
||||||
- `CONTEXT` capture for turn/session state
|
- `CONTEXT` capture for turn/session state
|
||||||
@@ -85,7 +86,8 @@ This does not yet mean parity for:
|
|||||||
The current .NET websocket layer now separates:
|
The current .NET websocket layer now separates:
|
||||||
|
|
||||||
- robot-facing websocket compatibility
|
- robot-facing websocket compatibility
|
||||||
- session and buffered-audio state
|
- long-lived cloud session state
|
||||||
|
- per-turn websocket state
|
||||||
- transcript resolution / STT selection
|
- transcript resolution / STT selection
|
||||||
- turn-to-response mapping
|
- turn-to-response mapping
|
||||||
|
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ The intent is to grow from a runnable dev monolith into the real Azure deploymen
|
|||||||
Current websocket scope is still intentionally narrow:
|
Current websocket scope is still intentionally narrow:
|
||||||
|
|
||||||
- token-backed socket sessions
|
- token-backed socket sessions
|
||||||
|
- explicit websocket turn-state tracking separate from long-lived cloud session state
|
||||||
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
||||||
- buffered audio state tracking behind a dedicated turn-finalization layer
|
- buffered audio state tracking behind a dedicated turn-finalization layer
|
||||||
- synthetic STT strategy selection for fixture-driven audio turn completion
|
- synthetic STT strategy selection for fixture-driven audio turn completion
|
||||||
@@ -79,6 +80,7 @@ Not yet covered:
|
|||||||
|
|
||||||
- real binary audio / ASR finalization parity
|
- real binary audio / ASR finalization parity
|
||||||
- provider-backed ASR integration
|
- provider-backed ASR integration
|
||||||
|
- timed finalize/fallback behavior matching richer Node turn-state semantics
|
||||||
- upstream Nimbus or broader skill lifecycle behavior
|
- upstream Nimbus or broader skill lifecycle behavior
|
||||||
- animation / expression command families
|
- animation / expression command families
|
||||||
- ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold
|
- ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ public sealed class JiboWebSocketService(
|
|||||||
if (!string.IsNullOrWhiteSpace(parsedTransId))
|
if (!string.IsNullOrWhiteSpace(parsedTransId))
|
||||||
{
|
{
|
||||||
session.LastTransId = parsedTransId;
|
session.LastTransId = parsedTransId;
|
||||||
|
session.TurnState.TransId = parsedTransId;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parsedType == "CONTEXT")
|
if (parsedType == "CONTEXT")
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ public sealed class ProtocolToTurnContextMapper
|
|||||||
{
|
{
|
||||||
public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session, string messageType)
|
public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session, string messageType)
|
||||||
{
|
{
|
||||||
|
var turnState = session.TurnState;
|
||||||
var text = ExtractTranscript(envelope.Text);
|
var text = ExtractTranscript(envelope.Text);
|
||||||
var protocolOperation = messageType.ToLowerInvariant();
|
var protocolOperation = messageType.ToLowerInvariant();
|
||||||
var attributes = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
|
var attributes = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
|
||||||
@@ -15,30 +16,35 @@ public sealed class ProtocolToTurnContextMapper
|
|||||||
["messageType"] = messageType
|
["messageType"] = messageType
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!string.IsNullOrWhiteSpace(session.LastTransId))
|
if (!string.IsNullOrWhiteSpace(turnState.TransId))
|
||||||
{
|
{
|
||||||
attributes["transID"] = session.LastTransId;
|
attributes["transID"] = turnState.TransId;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (session.Metadata.TryGetValue("context", out var context))
|
if (!string.IsNullOrWhiteSpace(turnState.ContextPayload))
|
||||||
{
|
{
|
||||||
attributes["context"] = context;
|
attributes["context"] = turnState.ContextPayload;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (session.Metadata.TryGetValue("listenRules", out var listenRules))
|
if (turnState.ListenRules.Count > 0)
|
||||||
{
|
{
|
||||||
attributes["listenRules"] = listenRules;
|
attributes["listenRules"] = turnState.ListenRules;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (session.BufferedAudioBytes > 0)
|
if (turnState.BufferedAudioBytes > 0)
|
||||||
{
|
{
|
||||||
attributes["bufferedAudioBytes"] = session.BufferedAudioBytes;
|
attributes["bufferedAudioBytes"] = turnState.BufferedAudioBytes;
|
||||||
attributes["bufferedAudioChunks"] = session.BufferedAudioChunkCount;
|
attributes["bufferedAudioChunks"] = turnState.BufferedAudioChunkCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (session.Metadata.TryGetValue("audioTranscriptHint", out var audioTranscriptHint))
|
if (!string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint))
|
||||||
{
|
{
|
||||||
attributes["audioTranscriptHint"] = audioTranscriptHint;
|
attributes["audioTranscriptHint"] = turnState.AudioTranscriptHint;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (turnState.FinalizeAttemptCount > 0)
|
||||||
|
{
|
||||||
|
attributes["finalizeAttemptCount"] = turnState.FinalizeAttemptCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new TurnContext
|
return new TurnContext
|
||||||
|
|||||||
@@ -12,11 +12,12 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
public IReadOnlyList<WebSocketReply> HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope)
|
public IReadOnlyList<WebSocketReply> HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope)
|
||||||
{
|
{
|
||||||
|
var turnState = session.TurnState;
|
||||||
session.LastMessageType = "BINARY_AUDIO";
|
session.LastMessageType = "BINARY_AUDIO";
|
||||||
session.BufferedAudioChunkCount += 1;
|
turnState.BufferedAudioChunkCount += 1;
|
||||||
session.BufferedAudioBytes += envelope.Binary?.Length ?? 0;
|
turnState.BufferedAudioBytes += envelope.Binary?.Length ?? 0;
|
||||||
session.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
|
turnState.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
|
||||||
session.AwaitingTurnCompletion = true;
|
turnState.AwaitingTurnCompletion = true;
|
||||||
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
|
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
|
||||||
|
|
||||||
return
|
return
|
||||||
@@ -29,8 +30,8 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
data = new
|
data = new
|
||||||
{
|
{
|
||||||
bytes = envelope.Binary?.Length ?? 0,
|
bytes = envelope.Binary?.Length ?? 0,
|
||||||
bufferedBytes = session.BufferedAudioBytes,
|
bufferedBytes = turnState.BufferedAudioBytes,
|
||||||
bufferedChunks = session.BufferedAudioChunkCount,
|
bufferedChunks = turnState.BufferedAudioChunkCount,
|
||||||
sessionId = session.SessionId
|
sessionId = session.SessionId
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -40,11 +41,14 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
|
|
||||||
public IReadOnlyList<WebSocketReply> HandleContext(CloudSession session, string? text)
|
public IReadOnlyList<WebSocketReply> HandleContext(CloudSession session, string? text)
|
||||||
{
|
{
|
||||||
session.Metadata["context"] = ExtractDataPayload(text);
|
var turnState = session.TurnState;
|
||||||
|
turnState.ContextPayload = ExtractDataPayload(text);
|
||||||
|
session.Metadata["context"] = turnState.ContextPayload;
|
||||||
|
|
||||||
if (TryReadContextProperty(text, "audioTranscriptHint", out var transcriptHint) &&
|
if (TryReadContextProperty(text, "audioTranscriptHint", out var transcriptHint) &&
|
||||||
!string.IsNullOrWhiteSpace(transcriptHint))
|
!string.IsNullOrWhiteSpace(transcriptHint))
|
||||||
{
|
{
|
||||||
|
turnState.AudioTranscriptHint = transcriptHint;
|
||||||
session.Metadata["audioTranscriptHint"] = transcriptHint;
|
session.Metadata["audioTranscriptHint"] = transcriptHint;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,10 +79,15 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
|
|
||||||
var turn = turnContextMapper.MapListenMessage(envelope, session, messageType);
|
var turn = turnContextMapper.MapListenMessage(envelope, session, messageType);
|
||||||
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
||||||
|
var turnState = session.TurnState;
|
||||||
if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
|
if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
|
||||||
string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
|
string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
|
||||||
{
|
{
|
||||||
session.AwaitingTurnCompletion = true;
|
turnState.AwaitingTurnCompletion = true;
|
||||||
|
if (turnState.BufferedAudioBytes > 0)
|
||||||
|
{
|
||||||
|
turnState.FinalizeAttemptCount += 1;
|
||||||
|
}
|
||||||
return
|
return
|
||||||
[
|
[
|
||||||
new WebSocketReply
|
new WebSocketReply
|
||||||
@@ -90,9 +99,11 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
sessionId = session.SessionId,
|
sessionId = session.SessionId,
|
||||||
transID = session.LastTransId,
|
transID = session.LastTransId,
|
||||||
bufferedAudioBytes = session.BufferedAudioBytes,
|
bufferedAudioBytes = turnState.BufferedAudioBytes,
|
||||||
bufferedAudioChunks = session.BufferedAudioChunkCount,
|
bufferedAudioChunks = turnState.BufferedAudioChunkCount,
|
||||||
awaitingAudio = true
|
awaitingAudio = turnState.BufferedAudioBytes == 0,
|
||||||
|
awaitingTranscriptHint = turnState.BufferedAudioBytes > 0 && string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint),
|
||||||
|
finalizeAttempts = turnState.FinalizeAttemptCount
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -107,7 +118,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
|
session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
|
||||||
? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
|
? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
|
||||||
: null;
|
: null;
|
||||||
session.AwaitingTurnCompletion = false;
|
turnState.AwaitingTurnCompletion = false;
|
||||||
|
|
||||||
var emitSkillActions = messageType != "CLIENT_NLU";
|
var emitSkillActions = messageType != "CLIENT_NLU";
|
||||||
var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply
|
var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply
|
||||||
@@ -126,7 +137,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
return turn;
|
return turn;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (session.BufferedAudioBytes <= 0)
|
if (session.TurnState.BufferedAudioBytes <= 0)
|
||||||
{
|
{
|
||||||
return turn;
|
return turn;
|
||||||
}
|
}
|
||||||
@@ -178,6 +189,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
|
|
||||||
private static void PersistTurnHints(CloudSession session, string? text)
|
private static void PersistTurnHints(CloudSession session, string? text)
|
||||||
{
|
{
|
||||||
|
var turnState = session.TurnState;
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
if (string.IsNullOrWhiteSpace(text))
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@@ -188,14 +200,26 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
using var document = JsonDocument.Parse(text);
|
using var document = JsonDocument.Parse(text);
|
||||||
var root = document.RootElement;
|
var root = document.RootElement;
|
||||||
|
|
||||||
|
if (root.TryGetProperty("transID", out var transId) && transId.ValueKind == JsonValueKind.String)
|
||||||
|
{
|
||||||
|
var nextTransId = transId.GetString();
|
||||||
|
if (!string.IsNullOrWhiteSpace(nextTransId) &&
|
||||||
|
!string.Equals(turnState.TransId, nextTransId, StringComparison.Ordinal))
|
||||||
|
{
|
||||||
|
ResetTurnState(turnState, nextTransId);
|
||||||
|
session.LastTransId = nextTransId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object)
|
if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object)
|
||||||
{
|
{
|
||||||
if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
|
if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
|
||||||
{
|
{
|
||||||
session.Metadata["listenRules"] = rules.EnumerateArray()
|
turnState.ListenRules = rules.EnumerateArray()
|
||||||
.Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString())
|
.Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString())
|
||||||
.Where(rule => !string.IsNullOrWhiteSpace(rule))
|
.Where(rule => !string.IsNullOrWhiteSpace(rule))
|
||||||
.ToArray();
|
.ToArray();
|
||||||
|
session.Metadata["listenRules"] = turnState.ListenRules;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
|
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
|
||||||
@@ -205,7 +229,8 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
|
|
||||||
if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String)
|
if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String)
|
||||||
{
|
{
|
||||||
session.Metadata["audioTranscriptHint"] = transcriptHint.GetString();
|
turnState.AudioTranscriptHint = transcriptHint.GetString();
|
||||||
|
session.Metadata["audioTranscriptHint"] = turnState.AudioTranscriptHint;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -217,11 +242,26 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
|
|
||||||
private static void ResetBufferedAudio(CloudSession session)
|
private static void ResetBufferedAudio(CloudSession session)
|
||||||
{
|
{
|
||||||
session.BufferedAudioBytes = 0;
|
session.TurnState.BufferedAudioBytes = 0;
|
||||||
session.BufferedAudioChunkCount = 0;
|
session.TurnState.BufferedAudioChunkCount = 0;
|
||||||
|
session.TurnState.LastAudioReceivedUtc = null;
|
||||||
|
session.TurnState.FinalizeAttemptCount = 0;
|
||||||
session.Metadata.Remove("audioTranscriptHint");
|
session.Metadata.Remove("audioTranscriptHint");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void ResetTurnState(WebSocketTurnState turnState, string? transId)
|
||||||
|
{
|
||||||
|
turnState.TransId = transId;
|
||||||
|
turnState.ContextPayload = null;
|
||||||
|
turnState.AudioTranscriptHint = null;
|
||||||
|
turnState.LastAudioReceivedUtc = null;
|
||||||
|
turnState.BufferedAudioChunkCount = 0;
|
||||||
|
turnState.BufferedAudioBytes = 0;
|
||||||
|
turnState.FinalizeAttemptCount = 0;
|
||||||
|
turnState.AwaitingTurnCompletion = false;
|
||||||
|
turnState.ListenRules = [];
|
||||||
|
}
|
||||||
|
|
||||||
private static string? ExtractDataPayload(string? text)
|
private static string? ExtractDataPayload(string? text)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
if (string.IsNullOrWhiteSpace(text))
|
||||||
|
|||||||
@@ -11,16 +11,13 @@ public sealed class CloudSession
|
|||||||
public string? Path { get; init; }
|
public string? Path { get; init; }
|
||||||
public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow;
|
public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow;
|
||||||
public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow;
|
public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow;
|
||||||
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
|
||||||
public DateTimeOffset? FollowUpExpiresUtc { get; set; }
|
public DateTimeOffset? FollowUpExpiresUtc { get; set; }
|
||||||
public string? LastMessageType { get; set; }
|
public string? LastMessageType { get; set; }
|
||||||
public string? LastListenType { get; set; }
|
public string? LastListenType { get; set; }
|
||||||
public string? LastIntent { get; set; }
|
public string? LastIntent { get; set; }
|
||||||
public string? LastTranscript { get; set; }
|
public string? LastTranscript { get; set; }
|
||||||
public string? LastTransId { get; set; }
|
public string? LastTransId { get; set; }
|
||||||
public int BufferedAudioChunkCount { get; set; }
|
|
||||||
public int BufferedAudioBytes { get; set; }
|
|
||||||
public bool AwaitingTurnCompletion { get; set; }
|
|
||||||
public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow;
|
public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow;
|
||||||
|
public WebSocketTurnState TurnState { get; } = new();
|
||||||
public IDictionary<string, object?> Metadata { get; init; } = new Dictionary<string, object?>();
|
public IDictionary<string, object?> Metadata { get; init; } = new Dictionary<string, object?>();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
namespace Jibo.Cloud.Domain.Models;
|
||||||
|
|
||||||
|
public sealed class WebSocketTurnState
|
||||||
|
{
|
||||||
|
public string? TransId { get; set; }
|
||||||
|
public string? ContextPayload { get; set; }
|
||||||
|
public string? AudioTranscriptHint { get; set; }
|
||||||
|
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
||||||
|
public int BufferedAudioChunkCount { get; set; }
|
||||||
|
public int BufferedAudioBytes { get; set; }
|
||||||
|
public int FinalizeAttemptCount { get; set; }
|
||||||
|
public bool AwaitingTurnCompletion { get; set; }
|
||||||
|
public IReadOnlyList<string> ListenRules { get; set; } = [];
|
||||||
|
}
|
||||||
@@ -7,6 +7,6 @@ Current fixture groups:
|
|||||||
- `http/`
|
- `http/`
|
||||||
Basic `X-Amz-Target` request and response examples for startup flows.
|
Basic `X-Amz-Target` request and response examples for startup flows.
|
||||||
- `websocket/`
|
- `websocket/`
|
||||||
Sanitized Neo-Hub turn-flow examples used to replay `LISTEN`, `CONTEXT`, `CLIENT_NLU`, `CLIENT_ASR`, and synthetic `EOS` / `SKILL_ACTION` behavior against the .NET implementation.
|
Sanitized Neo-Hub turn-flow examples used to replay `LISTEN`, `CONTEXT`, `CLIENT_NLU`, `CLIENT_ASR`, buffered-audio accumulation, pending/finalize states, and synthetic `EOS` / `SKILL_ACTION` behavior against the .NET implementation.
|
||||||
|
|
||||||
Expand this folder whenever new robot traffic is captured and cleaned.
|
Expand this folder whenever new robot traffic is captured and cleaned.
|
||||||
|
|||||||
@@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"name": "neo-hub buffered audio pending flow",
|
||||||
|
"session": {
|
||||||
|
"hostName": "neo-hub.jibo.com",
|
||||||
|
"path": "/listen",
|
||||||
|
"kind": "neo-hub-listen",
|
||||||
|
"token": "fixture-pending-audio-token"
|
||||||
|
},
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "LISTEN",
|
||||||
|
"transID": "fixture-trans-pending",
|
||||||
|
"data": {
|
||||||
|
"rules": [
|
||||||
|
"wake-word"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_TURN_PENDING"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"binary": [1, 2, 3, 4],
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_AUDIO_RECEIVED"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "CLIENT_ASR",
|
||||||
|
"transID": "fixture-trans-pending",
|
||||||
|
"data": { }
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_TURN_PENDING"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
{
|
||||||
|
"name": "neo-hub multichunk audio chat flow",
|
||||||
|
"session": {
|
||||||
|
"hostName": "neo-hub.jibo.com",
|
||||||
|
"path": "/listen",
|
||||||
|
"kind": "neo-hub-listen",
|
||||||
|
"token": "fixture-audio-chat-token"
|
||||||
|
},
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "LISTEN",
|
||||||
|
"transID": "fixture-trans-audio-chat",
|
||||||
|
"data": {
|
||||||
|
"rules": [
|
||||||
|
"wake-word"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_TURN_PENDING"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "CONTEXT",
|
||||||
|
"transID": "fixture-trans-audio-chat",
|
||||||
|
"data": {
|
||||||
|
"audioTranscriptHint": "hello from buffered audio"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_CONTEXT_ACK"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"binary": [1, 2, 3],
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_AUDIO_RECEIVED"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"binary": [4, 5, 6, 7],
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_AUDIO_RECEIVED"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "CLIENT_ASR",
|
||||||
|
"transID": "fixture-trans-audio-chat",
|
||||||
|
"data": { }
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"LISTEN",
|
||||||
|
"EOS",
|
||||||
|
"SKILL_ACTION"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -72,6 +72,48 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task MultiChunkAudio_AccumulatesBufferedStateAcrossMessages()
|
||||||
|
{
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-multichunk-token",
|
||||||
|
Text = """{"type":"LISTEN","transID":"trans-multi","data":{"rules":["wake-word"]}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
var firstAudioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-multichunk-token",
|
||||||
|
Binary = [1, 2, 3]
|
||||||
|
});
|
||||||
|
|
||||||
|
var secondAudioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-multichunk-token",
|
||||||
|
Binary = [4, 5, 6, 7]
|
||||||
|
});
|
||||||
|
|
||||||
|
using var firstPayload = JsonDocument.Parse(firstAudioReplies[0].Text!);
|
||||||
|
using var secondPayload = JsonDocument.Parse(secondAudioReplies[0].Text!);
|
||||||
|
Assert.Equal(3, firstPayload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32());
|
||||||
|
Assert.Equal(7, secondPayload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32());
|
||||||
|
Assert.Equal(2, secondPayload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
||||||
|
|
||||||
|
var session = _store.FindSessionByToken("hub-multichunk-token");
|
||||||
|
Assert.NotNull(session);
|
||||||
|
Assert.Equal(7, session!.TurnState.BufferedAudioBytes);
|
||||||
|
Assert.Equal(2, session.TurnState.BufferedAudioChunkCount);
|
||||||
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task ContextThenClientNlu_UsesFollowUpTurnStateAndSkipsSkillAction()
|
public async Task ContextThenClientNlu_UsesFollowUpTurnStateAndSkipsSkillAction()
|
||||||
{
|
{
|
||||||
@@ -175,15 +217,163 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
|
|
||||||
var session = _store.FindSessionByToken("hub-audio-token");
|
var session = _store.FindSessionByToken("hub-audio-token");
|
||||||
Assert.NotNull(session);
|
Assert.NotNull(session);
|
||||||
Assert.Equal(0, session!.BufferedAudioBytes);
|
Assert.Equal(0, session!.TurnState.BufferedAudioBytes);
|
||||||
Assert.Equal(0, session.BufferedAudioChunkCount);
|
Assert.Equal(0, session.TurnState.BufferedAudioChunkCount);
|
||||||
Assert.False(session.Metadata.ContainsKey("audioTranscriptHint"));
|
Assert.False(session.Metadata.ContainsKey("audioTranscriptHint"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task BufferedAudio_WithoutTranscriptHint_RemainsPending()
|
||||||
|
{
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-pending-token",
|
||||||
|
Text = """{"type":"LISTEN","transID":"trans-pending","data":{"rules":["wake-word"]}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-pending-token",
|
||||||
|
Binary = [1, 2, 3, 4]
|
||||||
|
});
|
||||||
|
|
||||||
|
var finalizeReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-pending-token",
|
||||||
|
Text = """{"type":"CLIENT_ASR","transID":"trans-pending","data":{}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
Assert.Single(finalizeReplies);
|
||||||
|
Assert.Equal("OPENJIBO_TURN_PENDING", ReadReplyType(finalizeReplies[0]));
|
||||||
|
|
||||||
|
using var payload = JsonDocument.Parse(finalizeReplies[0].Text!);
|
||||||
|
Assert.True(payload.RootElement.GetProperty("data").GetProperty("awaitingTranscriptHint").GetBoolean());
|
||||||
|
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("finalizeAttempts").GetInt32());
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task BufferedAudio_WithChatTranscriptHint_FinalizesAsChat()
|
||||||
|
{
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-audio-chat-token",
|
||||||
|
Text = """{"type":"LISTEN","transID":"trans-audio-chat","data":{"rules":["wake-word"]}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-audio-chat-token",
|
||||||
|
Text = """{"type":"CONTEXT","transID":"trans-audio-chat","data":{"audioTranscriptHint":"hello from buffered audio"}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-audio-chat-token",
|
||||||
|
Binary = [1, 2, 3, 4, 5]
|
||||||
|
});
|
||||||
|
|
||||||
|
var finalizeReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-audio-chat-token",
|
||||||
|
Text = """{"type":"CLIENT_ASR","transID":"trans-audio-chat","data":{}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
Assert.Equal(3, finalizeReplies.Count);
|
||||||
|
using var listenPayload = JsonDocument.Parse(finalizeReplies[0].Text!);
|
||||||
|
Assert.Equal("hello from buffered audio", listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString());
|
||||||
|
Assert.Equal("chat", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString());
|
||||||
|
|
||||||
|
using var skillPayload = JsonDocument.Parse(finalizeReplies[2].Text!);
|
||||||
|
Assert.Equal("chitchat-skill", skillPayload.RootElement.GetProperty("data").GetProperty("skill").GetProperty("id").GetString());
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task FollowUpTurn_UsesNewTurnStateWithoutLeakingBufferedAudio()
|
||||||
|
{
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-followup-audio-token",
|
||||||
|
Text = """{"type":"LISTEN","transID":"trans-first","data":{"rules":["wake-word"]}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-followup-audio-token",
|
||||||
|
Text = """{"type":"CONTEXT","transID":"trans-first","data":{"audioTranscriptHint":"tell me a joke"}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-followup-audio-token",
|
||||||
|
Binary = [1, 2, 3, 4]
|
||||||
|
});
|
||||||
|
|
||||||
|
await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-followup-audio-token",
|
||||||
|
Text = """{"type":"CLIENT_ASR","transID":"trans-first","data":{}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
var followUpReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
|
||||||
|
{
|
||||||
|
HostName = "neo-hub.jibo.com",
|
||||||
|
Path = "/listen",
|
||||||
|
Kind = "neo-hub-listen",
|
||||||
|
Token = "hub-followup-audio-token",
|
||||||
|
Text = """{"type":"LISTEN","transID":"trans-second","data":{"text":"what time is it","rules":["follow-up"]}}"""
|
||||||
|
});
|
||||||
|
|
||||||
|
Assert.Equal(3, followUpReplies.Count);
|
||||||
|
using var payload = JsonDocument.Parse(followUpReplies[0].Text!);
|
||||||
|
Assert.Equal("time", payload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString());
|
||||||
|
Assert.Equal("trans-second", payload.RootElement.GetProperty("transID").GetString());
|
||||||
|
|
||||||
|
var session = _store.FindSessionByToken("hub-followup-audio-token");
|
||||||
|
Assert.NotNull(session);
|
||||||
|
Assert.Equal("trans-second", session!.TurnState.TransId);
|
||||||
|
Assert.Equal(0, session.TurnState.BufferedAudioBytes);
|
||||||
|
Assert.Equal(0, session.TurnState.BufferedAudioChunkCount);
|
||||||
|
}
|
||||||
|
|
||||||
[Theory]
|
[Theory]
|
||||||
[InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")]
|
[InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")]
|
||||||
[InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")]
|
[InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")]
|
||||||
[InlineData("fixtures\\neo-hub-buffered-audio-synthetic-asr.flow.json")]
|
[InlineData("fixtures\\neo-hub-buffered-audio-synthetic-asr.flow.json")]
|
||||||
|
[InlineData("fixtures\\neo-hub-multichunk-audio-chat.flow.json")]
|
||||||
|
[InlineData("fixtures\\neo-hub-buffered-audio-pending.flow.json")]
|
||||||
public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath)
|
public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath)
|
||||||
{
|
{
|
||||||
var fixture = WebSocketFixtureLoader.Load(relativePath);
|
var fixture = WebSocketFixtureLoader.Load(relativePath);
|
||||||
|
|||||||
Reference in New Issue
Block a user