added a first pass at websocket IO

This commit is contained in:
Jacob Dubin
2026-04-11 21:50:26 -05:00
parent d7ea8eebab
commit e2da1cfcfe
13 changed files with 583 additions and 28 deletions

View File

@@ -57,9 +57,25 @@ The .NET implementation should:
- copy observed behavior where needed
- use fixtures captured from Node and real robots
- avoid speculative protocol design
- separate HTTP parity, websocket parity, and future discovery work so coverage stays honest
## Current State
This folder now contains the first hosted scaffold, not just a README.
The intent is to grow from a runnable dev monolith into the real Azure deployment target without abandoning the existing abstractions work.
Current websocket scope is still intentionally narrow:
- token-backed socket sessions
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
- `CONTEXT` capture and follow-up turn state
- `EOS` completion
- first skill vertical for joke/chat `SKILL_ACTION` playback
Not yet covered:
- real binary audio / ASR finalization parity
- upstream Nimbus or broader skill lifecycle behavior
- animation / expression command families
- ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold

View File

@@ -57,6 +57,19 @@ public sealed class DemoConversationBroker : IConversationBroker
}
};
if (string.Equals(plan.IntentName, "joke", StringComparison.OrdinalIgnoreCase))
{
plan.Actions.Add(new InvokeNativeSkillAction
{
Sequence = 2,
SkillName = "@be/joke",
Payload = new Dictionary<string, object?>
{
["replyType"] = "joke"
}
});
}
return Task.FromResult(plan);
}
}

View File

@@ -15,9 +15,12 @@ public sealed class JiboWebSocketService(
{
var session = stateStore.FindSessionByToken(envelope.Token ?? string.Empty) ??
stateStore.OpenSession(envelope.Kind, null, envelope.Token, envelope.HostName, envelope.Path);
session.LastSeenUtc = DateTimeOffset.UtcNow;
if (envelope.IsBinary)
{
session.LastMessageType = "BINARY_AUDIO";
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
return
[
new WebSocketReply
@@ -36,14 +39,70 @@ public sealed class JiboWebSocketService(
}
var parsedType = ReadMessageType(envelope.Text);
session.LastListenType = parsedType;
session.LastMessageType = parsedType;
var parsedTransId = ReadTransId(envelope.Text);
if (!string.IsNullOrWhiteSpace(parsedTransId))
{
session.LastTransId = parsedTransId;
}
if (parsedType == "CONTEXT")
{
session.Metadata["context"] = ExtractDataPayload(envelope.Text);
return
[
new WebSocketReply
{
Text = JsonSerializer.Serialize(new
{
type = "OPENJIBO_CONTEXT_ACK",
data = new
{
sessionId = session.SessionId,
transID = session.LastTransId
}
})
}
];
}
if (parsedType is "LISTEN" or "CLIENT_NLU" or "CLIENT_ASR")
{
var turn = turnContextMapper.MapListenMessage(envelope, session);
var plan = await conversationBroker.HandleTurnAsync(turn, cancellationToken);
PersistTurnHints(session, envelope.Text, parsedType);
return replyMapper.Map(plan).Select(text => new WebSocketReply
var turn = turnContextMapper.MapListenMessage(envelope, session, parsedType);
if (string.IsNullOrWhiteSpace(turn.NormalizedTranscript) &&
string.IsNullOrWhiteSpace(turn.RawTranscript))
{
return
[
new WebSocketReply
{
Text = JsonSerializer.Serialize(new
{
type = "OPENJIBO_ACK",
data = new
{
messageType = parsedType,
sessionId = session.SessionId,
transID = session.LastTransId
}
})
}
];
}
var plan = await conversationBroker.HandleTurnAsync(turn, cancellationToken);
var listenAction = plan.Actions.OfType<ListenAction>().OrderBy(action => action.Sequence).LastOrDefault();
session.LastTranscript = turn.NormalizedTranscript ?? turn.RawTranscript;
session.LastIntent = plan.IntentName;
session.LastListenType = listenAction?.Mode;
session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
: null;
var emitSkillActions = parsedType != "CLIENT_NLU";
return replyMapper.Map(plan, turn, session, emitSkillActions).Select(text => new WebSocketReply
{
Text = text
}).ToArray();
@@ -66,6 +125,45 @@ public sealed class JiboWebSocketService(
];
}
/// <summary>
/// Best-effort capture of hints from an incoming JSON message onto the session.
/// </summary>
/// <remarks>
/// Only the top-level <c>data</c> object is inspected:
/// a <c>rules</c> array is stored (blank entries dropped) under <c>Metadata["listenRules"]</c>,
/// a string <c>intent</c> is copied to <c>LastIntent</c>, and when <paramref name="messageType"/>
/// is <c>CONTEXT</c> the raw <c>data</c> JSON is stored under <c>Metadata["context"]</c>.
/// Blank text, malformed JSON, or a payload without an object-valued <c>data</c> leaves the session untouched.
/// </remarks>
/// <param name="session">Session whose metadata and last-intent fields are updated.</param>
/// <param name="text">Raw text frame received from the socket; may be null or blank.</param>
/// <param name="messageType">Message type already parsed from <paramref name="text"/>.</param>
private static void PersistTurnHints(CloudSession session, string? text, string messageType)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return;
    }

    try
    {
        using var document = JsonDocument.Parse(text);
        var root = document.RootElement;

        // TryGetProperty throws InvalidOperationException on non-object elements,
        // so the kind is checked up front instead of relying on a catch-all.
        if (root.ValueKind != JsonValueKind.Object ||
            !root.TryGetProperty("data", out var data) ||
            data.ValueKind != JsonValueKind.Object)
        {
            return;
        }

        if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
        {
            session.Metadata["listenRules"] = rules.EnumerateArray()
                .Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString())
                .Where(rule => !string.IsNullOrWhiteSpace(rule))
                .ToArray();
        }

        if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
        {
            session.LastIntent = intent.GetString();
        }

        if (messageType == "CONTEXT")
        {
            session.Metadata["context"] = data.GetRawText();
        }
    }
    catch (JsonException)
    {
        // Keep the compatibility layer permissive while captures are still incomplete:
        // malformed JSON is ignored, but unrelated failures are no longer hidden.
    }
}
private static string ReadMessageType(string? text)
{
if (string.IsNullOrWhiteSpace(text))
@@ -88,4 +186,50 @@ public sealed class JiboWebSocketService(
return "UNKNOWN";
}
/// <summary>
/// Reads the top-level string property <c>transID</c> from a JSON text frame.
/// </summary>
/// <param name="text">Raw text frame; may be null or blank.</param>
/// <returns>
/// The <c>transID</c> value, or <c>null</c> when the text is blank, is not valid JSON,
/// is not a JSON object, or has no string-valued <c>transID</c> property.
/// </returns>
private static string? ReadTransId(string? text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return null;
    }

    try
    {
        using var document = JsonDocument.Parse(text);
        var root = document.RootElement;

        // TryGetProperty throws on non-object roots (arrays, strings, numbers),
        // so the kind is checked explicitly rather than swallowed by a catch-all.
        if (root.ValueKind == JsonValueKind.Object &&
            root.TryGetProperty("transID", out var transId) &&
            transId.ValueKind == JsonValueKind.String)
        {
            return transId.GetString();
        }

        return null;
    }
    catch (JsonException)
    {
        // Malformed JSON is treated as "no transaction id".
        return null;
    }
}
/// <summary>
/// Extracts the raw JSON text of the top-level <c>data</c> property from a JSON text frame.
/// </summary>
/// <param name="text">Raw text frame; may be null or blank.</param>
/// <returns>
/// The raw JSON of <c>data</c> (whatever its kind), or <c>null</c> when the text is blank,
/// is not valid JSON, is not a JSON object, or has no <c>data</c> property.
/// </returns>
private static string? ExtractDataPayload(string? text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return null;
    }

    try
    {
        using var document = JsonDocument.Parse(text);
        var root = document.RootElement;

        // TryGetProperty throws on non-object roots, so the kind is checked explicitly
        // instead of depending on a catch-all to absorb the InvalidOperationException.
        if (root.ValueKind == JsonValueKind.Object && root.TryGetProperty("data", out var data))
        {
            return data.GetRawText();
        }

        return null;
    }
    catch (JsonException)
    {
        // Malformed JSON is treated as "no data payload".
        return null;
    }
}
}

View File

@@ -6,14 +6,34 @@ namespace Jibo.Cloud.Application.Services;
public sealed class ProtocolToTurnContextMapper
{
public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session)
public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session, string messageType)
{
var text = ExtractTranscript(envelope.Text);
var protocolOperation = messageType.ToLowerInvariant();
var attributes = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
{
["messageType"] = messageType
};
if (!string.IsNullOrWhiteSpace(session.LastTransId))
{
attributes["transID"] = session.LastTransId;
}
if (session.Metadata.TryGetValue("context", out var context))
{
attributes["context"] = context;
}
if (session.Metadata.TryGetValue("listenRules", out var listenRules))
{
attributes["listenRules"] = listenRules;
}
return new TurnContext
{
SessionId = session.SessionId,
InputMode = session.LastListenType == "follow-up" ? TurnInputMode.FollowUp : TurnInputMode.DirectText,
InputMode = session.FollowUpOpen ? TurnInputMode.FollowUp : TurnInputMode.DirectText,
SourceKind = TurnSourceKind.Api,
RawTranscript = text,
NormalizedTranscript = text?.Trim(),
@@ -21,10 +41,11 @@ public sealed class ProtocolToTurnContextMapper
HostName = envelope.HostName,
RequestId = envelope.ConnectionId,
ProtocolService = "neo-hub",
ProtocolOperation = "listen",
ProtocolOperation = protocolOperation,
FirmwareVersion = session.Metadata.TryGetValue("firmwareVersion", out var firmwareVersion) ? firmwareVersion as string : null,
ApplicationVersion = session.Metadata.TryGetValue("applicationVersion", out var applicationVersion) ? applicationVersion as string : null,
IsFollowUpEligible = true
IsFollowUpEligible = true,
Attributes = attributes
};
}

View File

@@ -1,39 +1,144 @@
using System.Text.Json;
using Jibo.Cloud.Domain.Models;
using Jibo.Runtime.Abstractions;
namespace Jibo.Cloud.Application.Services;
public sealed class ResponsePlanToSocketMessagesMapper
{
public IReadOnlyList<string> Map(ResponsePlan plan)
public IReadOnlyList<string> Map(ResponsePlan plan, TurnContext turn, CloudSession session, bool emitSkillActions)
{
var speak = plan.Actions.OfType<SpeakAction>().FirstOrDefault();
var skill = plan.Actions.OfType<InvokeNativeSkillAction>().FirstOrDefault();
var transId = turn.Attributes.TryGetValue("transID", out var transIdValue)
? transIdValue?.ToString() ?? string.Empty
: session.LastTransId ?? string.Empty;
var transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty;
var rules = ReadRules(turn);
var messages = new List<string>();
if (speak is not null)
messages.Add(JsonSerializer.Serialize(new
{
messages.Add(JsonSerializer.Serialize(new
type = "LISTEN",
transID = transId,
data = new
{
type = "OPENJIBO_RESPONSE",
data = new
asr = new
{
intent = plan.IntentName,
text = speak.Text,
followUpOpen = plan.FollowUp.KeepMicOpen,
timeoutMs = (int)plan.FollowUp.Timeout.TotalMilliseconds
confidence = 0.95,
final = true,
text = transcript
},
nlu = new
{
confidence = 0.95,
intent = plan.IntentName ?? "unknown",
rules,
entities = new Dictionary<string, object?>()
},
match = new
{
intent = plan.IntentName ?? "unknown",
rule = rules.FirstOrDefault() ?? string.Empty,
score = 0.95
}
}));
}
}
}));
messages.Add(JsonSerializer.Serialize(new
{
type = "EOS",
data = new
{
sessionId = plan.SessionId
sessionId = plan.SessionId,
transID = transId
}
}));
if (emitSkillActions && speak is not null)
{
messages.Add(JsonSerializer.Serialize(BuildSkillPayload(plan, turn, transId, speak, skill)));
}
return messages;
}
/// <summary>
/// Resolves the listen rules carried in the turn's <c>listenRules</c> attribute.
/// </summary>
/// <param name="turn">Turn whose attributes are inspected.</param>
/// <returns>
/// The attribute value itself when it is already an <see cref="IReadOnlyList{T}"/> of strings;
/// a blank-filtered array when it is some other string sequence; otherwise an empty list.
/// </returns>
private static IReadOnlyList<string> ReadRules(TurnContext turn)
{
    if (turn.Attributes.TryGetValue("listenRules", out var candidate))
    {
        // Lists (including arrays) are passed through untouched.
        if (candidate is IReadOnlyList<string> readyMade)
        {
            return readyMade;
        }

        // Any other string sequence is materialized with blank entries removed.
        if (candidate is IEnumerable<string> sequence)
        {
            return sequence.Where(entry => !string.IsNullOrWhiteSpace(entry)).ToArray();
        }
    }

    return [];
}
/// <summary>
/// Builds the <c>SKILL_ACTION</c> message object that wraps the spoken text in an ESML play command.
/// </summary>
/// <remarks>
/// A turn counts as a joke when the plan intent is <c>joke</c> or the native skill is <c>@be/joke</c>
/// (both compared case-insensitively). Jokes use the <c>happy</c> expression category, the
/// <c>@be/joke</c> skill id and the <c>runtime-joke</c> mim id; everything else uses <c>neutral</c>,
/// the native skill name (or <c>chitchat-skill</c> when none is present) and <c>runtime-chat</c>.
/// </remarks>
/// <param name="plan">Response plan supplying the intent name.</param>
/// <param name="turn">Turn supplying the transcript echoed in the play metadata.</param>
/// <param name="transId">Transaction id stamped on the message.</param>
/// <param name="speak">Speak action whose text is XML-escaped into the ESML body.</param>
/// <param name="skill">Optional native skill action used to pick the skill id.</param>
/// <returns>An anonymous object ready for JSON serialization.</returns>
private static object BuildSkillPayload(ResponsePlan plan, TurnContext turn, string transId, SpeakAction speak, InvokeNativeSkillAction? skill)
{
    var jokeRequested =
        string.Equals(plan.IntentName, "joke", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(skill?.SkillName, "@be/joke", StringComparison.OrdinalIgnoreCase);

    string skillId;
    string expressionCategory;
    string mimId;
    if (jokeRequested)
    {
        skillId = "@be/joke";
        expressionCategory = "happy";
        mimId = "runtime-joke";
    }
    else
    {
        skillId = skill?.SkillName ?? "chitchat-skill";
        expressionCategory = "neutral";
        mimId = "runtime-chat";
    }

    // Only the expression category differs between the joke and chat variants.
    var esml = $"<speak><es cat='{expressionCategory}' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>";

    var meta = new
    {
        prompt_id = "RUNTIME_PROMPT",
        prompt_sub_category = "AN",
        mim_id = mimId,
        mim_type = "announcement",
        intent = plan.IntentName ?? "unknown",
        transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty
    };

    var jcp = new
    {
        type = "SLIM",
        config = new
        {
            play = new { esml, meta }
        }
    };

    return new
    {
        type = "SKILL_ACTION",
        ts = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
        msgID = $"msg-{Guid.NewGuid():N}",
        transID = transId,
        data = new
        {
            skill = new { id = skillId },
            action = new
            {
                config = new { jcp }
            },
            analytics = new Dictionary<string, object?>(),
            final = true
        }
    };
}
/// <summary>
/// Replaces the five XML-reserved characters (<c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c>, <c>"</c>, <c>'</c>)
/// with their predefined entities.
/// </summary>
/// <param name="value">Text to embed in XML content or attribute values.</param>
/// <returns>The escaped text.</returns>
private static string EscapeXml(string value)
{
    // Order matters: the ampersand goes first so the ampersands introduced by
    // the later entities are not escaped a second time.
    (string Raw, string Entity)[] substitutions =
    [
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\"", "&quot;"),
        ("'", "&apos;")
    ];

    var escaped = value;
    foreach (var (raw, entity) in substitutions)
    {
        escaped = escaped.Replace(raw, entity, StringComparison.Ordinal);
    }

    return escaped;
}
}

View File

@@ -11,7 +11,12 @@ public sealed class CloudSession
public string? Path { get; init; }
public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow;
public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow;
public DateTimeOffset? FollowUpExpiresUtc { get; set; }
public string? LastMessageType { get; set; }
public string? LastListenType { get; set; }
public string? LastIntent { get; set; }
public string? LastTranscript { get; set; }
public string? LastTransId { get; set; }
/// <summary>
/// True while a follow-up expiry is set and still lies in the future relative to the current UTC time.
/// </summary>
public bool FollowUpOpen => FollowUpExpiresUtc is { } expiresUtc && expiresUtc > DateTimeOffset.UtcNow;
public IDictionary<string, object?> Metadata { get; init; } = new Dictionary<string, object?>();
}

View File

@@ -6,5 +6,7 @@ Current fixture groups:
- `http/`
Basic `X-Amz-Target` request and response examples for startup flows.
- `websocket/`
Sanitized Neo-Hub turn-flow examples used to replay `LISTEN`, `CONTEXT`, `CLIENT_NLU`, `CLIENT_ASR`, and synthetic `EOS` / `SKILL_ACTION` behavior against the .NET implementation.
Expand this folder whenever new robot traffic is captured and cleaned.

View File

@@ -0,0 +1,42 @@
{
"name": "neo-hub client asr joke flow",
"session": {
"hostName": "neo-hub.jibo.com",
"path": "/listen",
"kind": "neo-hub-listen",
"token": "fixture-joke-token"
},
"steps": [
{
"text": {
"type": "LISTEN",
"transID": "fixture-trans-joke",
"data": {
"text": "tell me a joke",
"rules": [
"wake-word"
]
}
},
"expectedReplyTypes": [
"LISTEN",
"EOS",
"SKILL_ACTION"
]
},
{
"text": {
"type": "CLIENT_ASR",
"transID": "fixture-trans-joke",
"data": {
"text": "tell me a joke"
}
},
"expectedReplyTypes": [
"LISTEN",
"EOS",
"SKILL_ACTION"
]
}
]
}

View File

@@ -0,0 +1,54 @@
{
"name": "neo-hub context client nlu flow",
"session": {
"hostName": "neo-hub.jibo.com",
"path": "/listen",
"kind": "neo-hub-listen",
"token": "fixture-nlu-token"
},
"steps": [
{
"text": {
"type": "LISTEN",
"transID": "fixture-trans-nlu",
"data": {
"text": "hello jibo",
"rules": [
"wake-word"
]
}
},
"expectedReplyTypes": [
"LISTEN",
"EOS",
"SKILL_ACTION"
]
},
{
"text": {
"type": "CONTEXT",
"transID": "fixture-trans-nlu",
"data": {
"topic": "conversation",
"screen": "home"
}
},
"expectedReplyTypes": [
"OPENJIBO_CONTEXT_ACK"
]
},
{
"text": {
"type": "CLIENT_NLU",
"transID": "fixture-trans-nlu",
"data": {
"intent": "joke"
}
},
"expectedReplyTypes": [
"LISTEN",
"EOS"
]
}
]
}