another pass complete
This commit is contained in:
@@ -64,21 +64,33 @@ Observed from `open-jibo-link.js`:
|
|||||||
The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior:
|
The current .NET pass covers only a narrow, explicitly synthetic subset of observed Neo-Hub behavior:
|
||||||
|
|
||||||
- token/session tracking across websocket turns
|
- token/session tracking across websocket turns
|
||||||
|
- buffered audio accounting and turn-pending state
|
||||||
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
||||||
- `CONTEXT` capture for turn/session state
|
- `CONTEXT` capture for turn/session state
|
||||||
- `CLIENT_NLU` turn completion using remembered listen/session metadata
|
- `CLIENT_NLU` turn completion using remembered listen/session metadata
|
||||||
- `CLIENT_ASR` text-driven turn completion
|
- `CLIENT_ASR` turn completion, including a synthetic STT seam for buffered-audio replay
|
||||||
- `EOS` emission after completed turns
|
- `EOS` emission after completed turns
|
||||||
- first richer vertical slice for joke/chat `SKILL_ACTION` playback
|
- first richer vertical slice for joke/chat `SKILL_ACTION` playback
|
||||||
|
|
||||||
This does not yet mean parity for:
|
This does not yet mean parity for:
|
||||||
|
|
||||||
- real binary audio buffering and finalization
|
- real binary audio buffering and finalization
|
||||||
- external ASR lifecycle timing
|
- real STT provider integration and external ASR lifecycle timing
|
||||||
- early-EOS behavior
|
- early-EOS behavior
|
||||||
- multi-step skill lifecycles beyond the current synthetic playback response
|
- multi-step skill lifecycles beyond the current synthetic playback response
|
||||||
- broader interaction, animation, or ESML command families
|
- broader interaction, animation, or ESML command families
|
||||||
|
|
||||||
|
### Internal ASR Direction
|
||||||
|
|
||||||
|
The current .NET websocket layer now separates:
|
||||||
|
|
||||||
|
- robot-facing websocket compatibility
|
||||||
|
- session and buffered-audio state
|
||||||
|
- transcript resolution / STT selection
|
||||||
|
- turn-to-response mapping
|
||||||
|
|
||||||
|
That separation is intentional. The synthetic STT path currently exists only to support fixture-driven replay while parity work continues. It should be treated as an internal compatibility seam, not as the final production ASR design.
|
||||||
|
|
||||||
## Upload Paths
|
## Upload Paths
|
||||||
|
|
||||||
| Path | Purpose | Confidence | Current .NET status |
|
| Path | Purpose | Confidence | Current .NET status |
|
||||||
|
|||||||
@@ -69,6 +69,8 @@ Current websocket scope is still intentionally narrow:
|
|||||||
|
|
||||||
- token-backed socket sessions
|
- token-backed socket sessions
|
||||||
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
||||||
|
- buffered audio state tracking behind a dedicated turn-finalization layer
|
||||||
|
- synthetic STT strategy selection for fixture-driven audio turn completion
|
||||||
- `CONTEXT` capture and follow-up turn state
|
- `CONTEXT` capture and follow-up turn state
|
||||||
- `EOS` completion
|
- `EOS` completion
|
||||||
- first skill vertical for joke/chat `SKILL_ACTION` playback
|
- first skill vertical for joke/chat `SKILL_ACTION` playback
|
||||||
@@ -76,6 +78,7 @@ Current websocket scope is still intentionally narrow:
|
|||||||
Not yet covered:
|
Not yet covered:
|
||||||
|
|
||||||
- real binary audio / ASR finalization parity
|
- real binary audio / ASR finalization parity
|
||||||
|
- provider-backed ASR integration
|
||||||
- upstream Nimbus or broader skill lifecycle behavior
|
- upstream Nimbus or broader skill lifecycle behavior
|
||||||
- animation / expression command families
|
- animation / expression command families
|
||||||
- ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold
|
- ESML feature parity beyond the narrow synthetic playback payloads used in the current scaffold
|
||||||
|
|||||||
@@ -0,0 +1,19 @@
|
|||||||
|
using Jibo.Runtime.Abstractions;
|
||||||
|
|
||||||
|
namespace Jibo.Cloud.Application.Services;
|
||||||
|
|
||||||
|
/// <summary>
/// Picks the first registered <see cref="ISttStrategy"/> whose <c>CanHandle</c>
/// accepts the turn. Strategies are consulted in the order they were supplied.
/// </summary>
public sealed class DefaultSttStrategySelector(IEnumerable<ISttStrategy> strategies) : ISttStrategySelector
{
    // Snapshot the injected sequence once so it is not re-enumerated on every selection.
    private readonly IReadOnlyList<ISttStrategy> _strategies = strategies.ToArray();

    /// <summary>
    /// Returns the first strategy that reports it can handle <paramref name="turn"/>.
    /// </summary>
    /// <param name="turn">The turn that needs a transcript.</param>
    /// <param name="cancellationToken">Accepted for interface compatibility; selection completes synchronously.</param>
    /// <returns>A completed task carrying the chosen strategy.</returns>
    /// <exception cref="InvalidOperationException">No registered strategy accepts the turn.</exception>
    public Task<ISttStrategy> SelectAsync(TurnContext turn, CancellationToken cancellationToken = default)
    {
        foreach (var candidate in _strategies)
        {
            if (candidate.CanHandle(turn))
            {
                return Task.FromResult(candidate);
            }
        }

        throw new InvalidOperationException("No STT strategy can handle the current turn.");
    }
}
|
||||||
@@ -7,9 +7,7 @@ namespace Jibo.Cloud.Application.Services;
|
|||||||
|
|
||||||
public sealed class JiboWebSocketService(
|
public sealed class JiboWebSocketService(
|
||||||
ICloudStateStore stateStore,
|
ICloudStateStore stateStore,
|
||||||
ProtocolToTurnContextMapper turnContextMapper,
|
WebSocketTurnFinalizationService turnFinalizationService)
|
||||||
IConversationBroker conversationBroker,
|
|
||||||
ResponsePlanToSocketMessagesMapper replyMapper)
|
|
||||||
{
|
{
|
||||||
public async Task<IReadOnlyList<WebSocketReply>> HandleMessageAsync(WebSocketMessageEnvelope envelope, CancellationToken cancellationToken = default)
|
public async Task<IReadOnlyList<WebSocketReply>> HandleMessageAsync(WebSocketMessageEnvelope envelope, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
@@ -19,23 +17,7 @@ public sealed class JiboWebSocketService(
|
|||||||
|
|
||||||
if (envelope.IsBinary)
|
if (envelope.IsBinary)
|
||||||
{
|
{
|
||||||
session.LastMessageType = "BINARY_AUDIO";
|
return turnFinalizationService.HandleBinaryAudio(session, envelope);
|
||||||
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
|
|
||||||
return
|
|
||||||
[
|
|
||||||
new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = JsonSerializer.Serialize(new
|
|
||||||
{
|
|
||||||
type = "OPENJIBO_AUDIO_RECEIVED",
|
|
||||||
data = new
|
|
||||||
{
|
|
||||||
bytes = envelope.Binary?.Length ?? 0,
|
|
||||||
sessionId = session.SessionId
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var parsedType = ReadMessageType(envelope.Text);
|
var parsedType = ReadMessageType(envelope.Text);
|
||||||
@@ -48,64 +30,12 @@ public sealed class JiboWebSocketService(
|
|||||||
|
|
||||||
if (parsedType == "CONTEXT")
|
if (parsedType == "CONTEXT")
|
||||||
{
|
{
|
||||||
session.Metadata["context"] = ExtractDataPayload(envelope.Text);
|
return turnFinalizationService.HandleContext(session, envelope.Text);
|
||||||
return
|
|
||||||
[
|
|
||||||
new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = JsonSerializer.Serialize(new
|
|
||||||
{
|
|
||||||
type = "OPENJIBO_CONTEXT_ACK",
|
|
||||||
data = new
|
|
||||||
{
|
|
||||||
sessionId = session.SessionId,
|
|
||||||
transID = session.LastTransId
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parsedType is "LISTEN" or "CLIENT_NLU" or "CLIENT_ASR")
|
if (parsedType is "LISTEN" or "CLIENT_NLU" or "CLIENT_ASR")
|
||||||
{
|
{
|
||||||
PersistTurnHints(session, envelope.Text, parsedType);
|
return await turnFinalizationService.HandleTurnAsync(session, envelope, parsedType, cancellationToken);
|
||||||
|
|
||||||
var turn = turnContextMapper.MapListenMessage(envelope, session, parsedType);
|
|
||||||
if (string.IsNullOrWhiteSpace(turn.NormalizedTranscript) &&
|
|
||||||
string.IsNullOrWhiteSpace(turn.RawTranscript))
|
|
||||||
{
|
|
||||||
return
|
|
||||||
[
|
|
||||||
new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = JsonSerializer.Serialize(new
|
|
||||||
{
|
|
||||||
type = "OPENJIBO_ACK",
|
|
||||||
data = new
|
|
||||||
{
|
|
||||||
messageType = parsedType,
|
|
||||||
sessionId = session.SessionId,
|
|
||||||
transID = session.LastTransId
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
var plan = await conversationBroker.HandleTurnAsync(turn, cancellationToken);
|
|
||||||
var listenAction = plan.Actions.OfType<ListenAction>().OrderBy(action => action.Sequence).LastOrDefault();
|
|
||||||
session.LastTranscript = turn.NormalizedTranscript ?? turn.RawTranscript;
|
|
||||||
session.LastIntent = plan.IntentName;
|
|
||||||
session.LastListenType = listenAction?.Mode;
|
|
||||||
session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
|
|
||||||
? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
|
|
||||||
: null;
|
|
||||||
|
|
||||||
var emitSkillActions = parsedType != "CLIENT_NLU";
|
|
||||||
return replyMapper.Map(plan, turn, session, emitSkillActions).Select(text => new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = text
|
|
||||||
}).ToArray();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
@@ -125,45 +55,6 @@ public sealed class JiboWebSocketService(
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void PersistTurnHints(CloudSession session, string? text, string messageType)
|
|
||||||
{
|
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
using var document = JsonDocument.Parse(text);
|
|
||||||
var root = document.RootElement;
|
|
||||||
|
|
||||||
if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object)
|
|
||||||
{
|
|
||||||
if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
|
|
||||||
{
|
|
||||||
session.Metadata["listenRules"] = rules.EnumerateArray()
|
|
||||||
.Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString())
|
|
||||||
.Where(rule => !string.IsNullOrWhiteSpace(rule))
|
|
||||||
.ToArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
|
|
||||||
{
|
|
||||||
session.LastIntent = intent.GetString();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (messageType == "CONTEXT")
|
|
||||||
{
|
|
||||||
session.Metadata["context"] = data.GetRawText();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch
|
|
||||||
{
|
|
||||||
// Keep the compatibility layer permissive while captures are still incomplete.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static string ReadMessageType(string? text)
|
private static string ReadMessageType(string? text)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
if (string.IsNullOrWhiteSpace(text))
|
||||||
@@ -209,27 +100,4 @@ public sealed class JiboWebSocketService(
|
|||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string? ExtractDataPayload(string? text)
|
|
||||||
{
|
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
using var document = JsonDocument.Parse(text);
|
|
||||||
if (document.RootElement.TryGetProperty("data", out var data))
|
|
||||||
{
|
|
||||||
return data.GetRawText();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,6 +30,17 @@ public sealed class ProtocolToTurnContextMapper
|
|||||||
attributes["listenRules"] = listenRules;
|
attributes["listenRules"] = listenRules;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (session.BufferedAudioBytes > 0)
|
||||||
|
{
|
||||||
|
attributes["bufferedAudioBytes"] = session.BufferedAudioBytes;
|
||||||
|
attributes["bufferedAudioChunks"] = session.BufferedAudioChunkCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (session.Metadata.TryGetValue("audioTranscriptHint", out var audioTranscriptHint))
|
||||||
|
{
|
||||||
|
attributes["audioTranscriptHint"] = audioTranscriptHint;
|
||||||
|
}
|
||||||
|
|
||||||
return new TurnContext
|
return new TurnContext
|
||||||
{
|
{
|
||||||
SessionId = session.SessionId,
|
SessionId = session.SessionId,
|
||||||
@@ -68,17 +79,30 @@ public sealed class ProtocolToTurnContextMapper
|
|||||||
return transcript.GetString();
|
return transcript.GetString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (data.TryGetProperty("asr", out var asr) &&
|
||||||
|
asr.ValueKind == JsonValueKind.Object &&
|
||||||
|
asr.TryGetProperty("text", out var asrText) &&
|
||||||
|
asrText.ValueKind == JsonValueKind.String)
|
||||||
|
{
|
||||||
|
return asrText.GetString();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String)
|
||||||
|
{
|
||||||
|
return transcriptHint.GetString();
|
||||||
|
}
|
||||||
|
|
||||||
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
|
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
|
||||||
{
|
{
|
||||||
return intent.GetString();
|
return intent.GetString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
catch
|
catch
|
||||||
{
|
{
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
return text;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,59 @@
|
|||||||
|
using Jibo.Runtime.Abstractions;
|
||||||
|
|
||||||
|
namespace Jibo.Cloud.Application.Services;
|
||||||
|
|
||||||
|
/// <summary>
/// Synthetic STT strategy: it does not decode audio. When a turn reports buffered audio
/// and carries an <c>audioTranscriptHint</c> attribute, the hint text is returned as the transcript.
/// </summary>
public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
{
    private const string BufferedAudioBytesKey = "bufferedAudioBytes";
    private const string TranscriptHintKey = "audioTranscriptHint";

    // Fixed confidence reported for hint-derived transcripts.
    private const float FixtureHintConfidence = 0.75f;

    /// <summary>Provider name reported on every <see cref="SttResult"/> this strategy produces.</summary>
    public string Name => "synthetic-buffered-audio";

    /// <summary>
    /// True when the turn reports a positive buffered-audio byte count and a non-blank transcript hint.
    /// </summary>
    public bool CanHandle(TurnContext turn)
    {
        return ReadBufferedAudioBytes(turn) > 0 &&
               !string.IsNullOrWhiteSpace(ReadTranscriptHint(turn));
    }

    /// <summary>
    /// Produces an <see cref="SttResult"/> whose text is the trimmed transcript hint.
    /// </summary>
    /// <param name="turn">Turn whose attributes carry the hint and the buffered byte count.</param>
    /// <param name="cancellationToken">Accepted for interface compatibility; the work completes synchronously.</param>
    /// <exception cref="InvalidOperationException">The turn has no usable transcript hint.</exception>
    public Task<SttResult> TranscribeAsync(TurnContext turn, CancellationToken cancellationToken = default)
    {
        var transcriptHint = ReadTranscriptHint(turn);
        if (string.IsNullOrWhiteSpace(transcriptHint))
        {
            throw new InvalidOperationException("Synthetic buffered audio STT requires an audio transcript hint.");
        }

        return Task.FromResult(new SttResult
        {
            Text = transcriptHint.Trim(),
            Provider = Name,
            Confidence = FixtureHintConfidence,
            Locale = turn.Locale,
            Metadata = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
            {
                ["bufferedAudioBytes"] = ReadBufferedAudioBytes(turn),
                ["mode"] = "fixture-hint"
            }
        });
    }

    /// <summary>
    /// Reads the buffered byte count from the turn attributes, tolerating int, long, or string values.
    /// Any other shape (or a missing attribute) yields 0.
    /// </summary>
    private static int ReadBufferedAudioBytes(TurnContext turn)
    {
        if (!turn.Attributes.TryGetValue(BufferedAudioBytesKey, out var bufferedAudioBytes))
        {
            return 0;
        }

        return bufferedAudioBytes switch
        {
            int value => value,
            // Saturate instead of an unchecked narrowing cast, which could wrap a large
            // count into a negative or unrelated value.
            long value => (int)Math.Clamp(value, int.MinValue, int.MaxValue),
            // Machine-readable number: parse culture-invariantly.
            string value when int.TryParse(
                value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var parsed) => parsed,
            _ => 0
        };
    }

    /// <summary>Returns the transcript hint attribute as a string, or null when it is absent.</summary>
    private static string? ReadTranscriptHint(TurnContext turn)
    {
        return turn.Attributes.TryGetValue(TranscriptHintKey, out var transcriptHint)
            ? transcriptHint?.ToString()
            : null;
    }
}
|
||||||
@@ -0,0 +1,274 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
using Jibo.Cloud.Domain.Models;
|
||||||
|
using Jibo.Runtime.Abstractions;
|
||||||
|
|
||||||
|
namespace Jibo.Cloud.Application.Services;
|
||||||
|
|
||||||
|
/// <summary>
/// Handles the per-session turn lifecycle for websocket messages: buffered-audio accounting,
/// <c>CONTEXT</c> capture, transcript resolution through the STT strategy selector, and mapping
/// the conversation broker's plan back into socket replies.
/// </summary>
public sealed class WebSocketTurnFinalizationService(
    ProtocolToTurnContextMapper turnContextMapper,
    IConversationBroker conversationBroker,
    ResponsePlanToSocketMessagesMapper replyMapper,
    ISttStrategySelector sttStrategySelector)
{
    // Used both as the JSON property name read from incoming payloads and as the session metadata key.
    private const string AudioTranscriptHintKey = "audioTranscriptHint";

    /// <summary>
    /// Records one binary audio frame against the session and acknowledges it.
    /// </summary>
    /// <param name="session">Session whose buffered-audio counters are updated.</param>
    /// <param name="envelope">Incoming frame; a missing binary payload counts as zero bytes.</param>
    /// <returns>A single <c>OPENJIBO_AUDIO_RECEIVED</c> reply carrying the running totals.</returns>
    public IReadOnlyList<WebSocketReply> HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope)
    {
        var chunkBytes = envelope.Binary?.Length ?? 0;

        session.LastMessageType = "BINARY_AUDIO";
        session.BufferedAudioChunkCount += 1;
        session.BufferedAudioBytes += chunkBytes;
        session.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
        session.AwaitingTurnCompletion = true;
        session.Metadata["lastAudioBytes"] = chunkBytes;

        return
        [
            new WebSocketReply
            {
                Text = JsonSerializer.Serialize(new
                {
                    type = "OPENJIBO_AUDIO_RECEIVED",
                    data = new
                    {
                        bytes = chunkBytes,
                        bufferedBytes = session.BufferedAudioBytes,
                        bufferedChunks = session.BufferedAudioChunkCount,
                        sessionId = session.SessionId
                    }
                })
            }
        ];
    }

    /// <summary>
    /// Stores the raw <c>data</c> payload of a <c>CONTEXT</c> message on the session and, when present,
    /// remembers its <c>audioTranscriptHint</c> for a later audio turn.
    /// </summary>
    /// <param name="session">Session that receives the context metadata.</param>
    /// <param name="text">Raw message text; unparseable or empty text stores a null context.</param>
    /// <returns>A single <c>OPENJIBO_CONTEXT_ACK</c> reply.</returns>
    public IReadOnlyList<WebSocketReply> HandleContext(CloudSession session, string? text)
    {
        session.Metadata["context"] = ExtractDataPayload(text);

        // TryReadContextProperty only succeeds for a non-blank string, so no further check is needed.
        if (TryReadContextProperty(text, AudioTranscriptHintKey, out var transcriptHint))
        {
            session.Metadata[AudioTranscriptHintKey] = transcriptHint;
        }

        return
        [
            new WebSocketReply
            {
                Text = JsonSerializer.Serialize(new
                {
                    type = "OPENJIBO_CONTEXT_ACK",
                    data = new
                    {
                        sessionId = session.SessionId,
                        transID = session.LastTransId
                    }
                })
            }
        ];
    }

    /// <summary>
    /// Processes a <c>LISTEN</c>, <c>CLIENT_NLU</c>, or <c>CLIENT_ASR</c> message. If no transcript can be
    /// resolved the turn stays pending; otherwise the turn is sent to the conversation broker, the
    /// session is updated from the resulting plan, and the buffered-audio state is cleared.
    /// </summary>
    /// <param name="session">Session the turn belongs to.</param>
    /// <param name="envelope">Incoming message.</param>
    /// <param name="messageType">Parsed message type; <c>CLIENT_NLU</c> suppresses skill-action replies.</param>
    /// <param name="cancellationToken">Propagated to STT resolution and the conversation broker.</param>
    /// <returns>
    /// Either a single <c>OPENJIBO_TURN_PENDING</c> reply or the replies mapped from the response plan.
    /// </returns>
    public async Task<IReadOnlyList<WebSocketReply>> HandleTurnAsync(
        CloudSession session,
        WebSocketMessageEnvelope envelope,
        string messageType,
        CancellationToken cancellationToken = default)
    {
        PersistTurnHints(session, envelope.Text);

        var turn = turnContextMapper.MapListenMessage(envelope, session, messageType);
        var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);

        if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
            string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
        {
            session.AwaitingTurnCompletion = true;
            return
            [
                new WebSocketReply
                {
                    Text = JsonSerializer.Serialize(new
                    {
                        type = "OPENJIBO_TURN_PENDING",
                        data = new
                        {
                            sessionId = session.SessionId,
                            transID = session.LastTransId,
                            bufferedAudioBytes = session.BufferedAudioBytes,
                            bufferedAudioChunks = session.BufferedAudioChunkCount,
                            awaitingAudio = true
                        }
                    })
                }
            ];
        }

        var plan = await conversationBroker.HandleTurnAsync(finalizedTurn, cancellationToken);
        var listenAction = plan.Actions.OfType<ListenAction>().OrderBy(action => action.Sequence).LastOrDefault();

        session.LastTranscript = finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript;
        session.LastIntent = plan.IntentName;
        session.LastListenType = listenAction?.Mode;
        session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
            ? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
            : null;
        session.AwaitingTurnCompletion = false;

        var emitSkillActions = messageType != "CLIENT_NLU";
        var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply
        {
            Text = text
        }).ToArray();

        // Clear only after the replies are materialized, so the mapper still sees the session state.
        ResetBufferedAudio(session);
        return replies;
    }

    /// <summary>
    /// Returns the turn unchanged when it already has a transcript or no audio is buffered. Otherwise
    /// asks the selected STT strategy for a transcript and returns a copy of the turn carrying it,
    /// with provider details added to the attributes.
    /// </summary>
    /// <remarks>
    /// Strategy failures are deliberately tolerated (the turn is returned unchanged and stays pending).
    /// Cancellation requested by the caller is not treated as such a failure and is rethrown.
    /// </remarks>
    private async Task<TurnContext> ResolveTranscriptAsync(TurnContext turn, CloudSession session, CancellationToken cancellationToken)
    {
        if (!string.IsNullOrWhiteSpace(turn.NormalizedTranscript) || !string.IsNullOrWhiteSpace(turn.RawTranscript))
        {
            return turn;
        }

        if (session.BufferedAudioBytes <= 0)
        {
            return turn;
        }

        try
        {
            var strategy = await sttStrategySelector.SelectAsync(turn, cancellationToken);
            var sttResult = await strategy.TranscribeAsync(turn, cancellationToken);

            var attributes = new Dictionary<string, object?>(turn.Attributes, StringComparer.OrdinalIgnoreCase)
            {
                ["sttProvider"] = sttResult.Provider,
                ["sttConfidence"] = sttResult.Confidence
            };

            // Prefix provider metadata so it cannot collide with protocol attributes.
            foreach (var pair in sttResult.Metadata)
            {
                attributes[$"stt:{pair.Key}"] = pair.Value;
            }

            return new TurnContext
            {
                TurnId = turn.TurnId,
                SessionId = turn.SessionId,
                TimestampUtc = turn.TimestampUtc,
                InputMode = turn.InputMode,
                SourceKind = turn.SourceKind,
                WakePhrase = turn.WakePhrase,
                RawTranscript = sttResult.Text,
                NormalizedTranscript = sttResult.Text.Trim(),
                DeviceId = turn.DeviceId,
                HostName = turn.HostName,
                RequestId = turn.RequestId,
                ProtocolService = turn.ProtocolService,
                ProtocolOperation = turn.ProtocolOperation,
                FirmwareVersion = turn.FirmwareVersion,
                ApplicationVersion = turn.ApplicationVersion,
                Locale = sttResult.Locale ?? turn.Locale,
                TimeZone = turn.TimeZone,
                IsFollowUpEligible = turn.IsFollowUpEligible,
                Attributes = attributes
            };
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller cancelled: do not report this as a pending turn.
            throw;
        }
        catch
        {
            // Best effort: no strategy matched or transcription failed, so leave the turn pending.
            return turn;
        }
    }

    /// <summary>
    /// Copies turn hints from the message's <c>data</c> object onto the session: <c>rules</c> into the
    /// <c>listenRules</c> metadata, <c>intent</c> into <c>LastIntent</c>, and <c>transcriptHint</c> into
    /// the <c>audioTranscriptHint</c> metadata. Unparseable text is ignored.
    /// </summary>
    private static void PersistTurnHints(CloudSession session, string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        try
        {
            using var document = JsonDocument.Parse(text);
            var root = document.RootElement;

            if (root.TryGetProperty("data", out var data) && data.ValueKind == JsonValueKind.Object)
            {
                if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
                {
                    session.Metadata["listenRules"] = rules.EnumerateArray()
                        .Select(item => item.ValueKind == JsonValueKind.String ? item.GetString() ?? string.Empty : item.ToString())
                        .Where(rule => !string.IsNullOrWhiteSpace(rule))
                        .ToArray();
                }

                if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
                {
                    session.LastIntent = intent.GetString();
                }

                if (data.TryGetProperty("transcriptHint", out var transcriptHint) && transcriptHint.ValueKind == JsonValueKind.String)
                {
                    session.Metadata[AudioTranscriptHintKey] = transcriptHint.GetString();
                }
            }
        }
        catch
        {
            // Keep the compatibility layer permissive while captures are still incomplete.
        }
    }

    /// <summary>Zeroes the buffered-audio counters and drops any remembered transcript hint.</summary>
    private static void ResetBufferedAudio(CloudSession session)
    {
        session.BufferedAudioBytes = 0;
        session.BufferedAudioChunkCount = 0;
        session.Metadata.Remove(AudioTranscriptHintKey);
    }

    /// <summary>
    /// Returns the raw JSON text of the message's top-level <c>data</c> property, or null when the text
    /// is empty, cannot be parsed, or has no such property.
    /// </summary>
    private static string? ExtractDataPayload(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return null;
        }

        try
        {
            using var document = JsonDocument.Parse(text);
            if (document.RootElement.TryGetProperty("data", out var data))
            {
                return data.GetRawText();
            }
        }
        catch
        {
            return null;
        }

        return null;
    }

    /// <summary>
    /// Reads a string property from the message's <c>data</c> element.
    /// </summary>
    /// <returns>
    /// True only when the property exists, is a JSON string, and is not blank. Any parsing or
    /// structural error yields false.
    /// </returns>
    private static bool TryReadContextProperty(string? text, string propertyName, out string? value)
    {
        value = null;
        if (string.IsNullOrWhiteSpace(text))
        {
            return false;
        }

        try
        {
            using var document = JsonDocument.Parse(text);
            if (!document.RootElement.TryGetProperty("data", out var data) ||
                !data.TryGetProperty(propertyName, out var property) ||
                property.ValueKind != JsonValueKind.String)
            {
                return false;
            }

            value = property.GetString();
            return !string.IsNullOrWhiteSpace(value);
        }
        catch
        {
            return false;
        }
    }
}
|
||||||
@@ -11,12 +11,16 @@ public sealed class CloudSession
|
|||||||
public string? Path { get; init; }
|
public string? Path { get; init; }
|
||||||
public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow;
|
public DateTimeOffset CreatedUtc { get; init; } = DateTimeOffset.UtcNow;
|
||||||
public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow;
|
public DateTimeOffset LastSeenUtc { get; set; } = DateTimeOffset.UtcNow;
|
||||||
|
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
||||||
public DateTimeOffset? FollowUpExpiresUtc { get; set; }
|
public DateTimeOffset? FollowUpExpiresUtc { get; set; }
|
||||||
public string? LastMessageType { get; set; }
|
public string? LastMessageType { get; set; }
|
||||||
public string? LastListenType { get; set; }
|
public string? LastListenType { get; set; }
|
||||||
public string? LastIntent { get; set; }
|
public string? LastIntent { get; set; }
|
||||||
public string? LastTranscript { get; set; }
|
public string? LastTranscript { get; set; }
|
||||||
public string? LastTransId { get; set; }
|
public string? LastTransId { get; set; }
|
||||||
|
public int BufferedAudioChunkCount { get; set; }
|
||||||
|
public int BufferedAudioBytes { get; set; }
|
||||||
|
public bool AwaitingTurnCompletion { get; set; }
|
||||||
public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow;
|
public bool FollowUpOpen => FollowUpExpiresUtc.HasValue && FollowUpExpiresUtc > DateTimeOffset.UtcNow;
|
||||||
public IDictionary<string, object?> Metadata { get; init; } = new Dictionary<string, object?>();
|
public IDictionary<string, object?> Metadata { get; init; } = new Dictionary<string, object?>();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,8 +12,11 @@ public static class ServiceCollectionExtensions
|
|||||||
{
|
{
|
||||||
services.AddSingleton<ICloudStateStore, InMemoryCloudStateStore>();
|
services.AddSingleton<ICloudStateStore, InMemoryCloudStateStore>();
|
||||||
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
|
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
|
||||||
|
services.AddSingleton<ISttStrategy, SyntheticBufferedAudioSttStrategy>();
|
||||||
|
services.AddSingleton<ISttStrategySelector, DefaultSttStrategySelector>();
|
||||||
services.AddSingleton<ProtocolToTurnContextMapper>();
|
services.AddSingleton<ProtocolToTurnContextMapper>();
|
||||||
services.AddSingleton<ResponsePlanToSocketMessagesMapper>();
|
services.AddSingleton<ResponsePlanToSocketMessagesMapper>();
|
||||||
|
services.AddSingleton<WebSocketTurnFinalizationService>();
|
||||||
services.AddSingleton<JiboCloudProtocolService>();
|
services.AddSingleton<JiboCloudProtocolService>();
|
||||||
services.AddSingleton<JiboWebSocketService>();
|
services.AddSingleton<JiboWebSocketService>();
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,56 @@
|
|||||||
|
{
|
||||||
|
"name": "neo-hub buffered audio synthetic asr flow",
|
||||||
|
"session": {
|
||||||
|
"hostName": "neo-hub.jibo.com",
|
||||||
|
"path": "/listen",
|
||||||
|
"kind": "neo-hub-listen",
|
||||||
|
"token": "fixture-audio-token"
|
||||||
|
},
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "LISTEN",
|
||||||
|
"transID": "fixture-trans-audio",
|
||||||
|
"data": {
|
||||||
|
"rules": [
|
||||||
|
"wake-word"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_TURN_PENDING"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "CONTEXT",
|
||||||
|
"transID": "fixture-trans-audio",
|
||||||
|
"data": {
|
||||||
|
"topic": "conversation",
|
||||||
|
"audioTranscriptHint": "tell me a joke"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_CONTEXT_ACK"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"binary": [1, 2, 3, 4, 5, 6],
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"OPENJIBO_AUDIO_RECEIVED"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": {
|
||||||
|
"type": "CLIENT_ASR",
|
||||||
|
"transID": "fixture-trans-audio",
|
||||||
|
"data": { }
|
||||||
|
},
|
||||||
|
"expectedReplyTypes": [
|
||||||
|
"LISTEN",
|
||||||
|
"EOS",
|
||||||
|
"SKILL_ACTION"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -14,11 +14,21 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
public JiboWebSocketServiceTests()
|
public JiboWebSocketServiceTests()
|
||||||
{
|
{
|
||||||
_store = new InMemoryCloudStateStore();
|
_store = new InMemoryCloudStateStore();
|
||||||
|
var turnContextMapper = new ProtocolToTurnContextMapper();
|
||||||
|
var conversationBroker = new DemoConversationBroker();
|
||||||
|
var replyMapper = new ResponsePlanToSocketMessagesMapper();
|
||||||
|
var sttSelector = new DefaultSttStrategySelector(
|
||||||
|
[
|
||||||
|
new SyntheticBufferedAudioSttStrategy()
|
||||||
|
]);
|
||||||
|
|
||||||
_service = new JiboWebSocketService(
|
_service = new JiboWebSocketService(
|
||||||
_store,
|
_store,
|
||||||
new ProtocolToTurnContextMapper(),
|
new WebSocketTurnFinalizationService(
|
||||||
new DemoConversationBroker(),
|
turnContextMapper,
|
||||||
new ResponsePlanToSocketMessagesMapper());
|
conversationBroker,
|
||||||
|
replyMapper,
|
||||||
|
sttSelector));
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
@@ -58,6 +68,8 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
using var payload = JsonDocument.Parse(replies[0].Text!);
|
using var payload = JsonDocument.Parse(replies[0].Text!);
|
||||||
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", payload.RootElement.GetProperty("type").GetString());
|
Assert.Equal("OPENJIBO_AUDIO_RECEIVED", payload.RootElement.GetProperty("type").GetString());
|
||||||
Assert.Equal(4, payload.RootElement.GetProperty("data").GetProperty("bytes").GetInt32());
|
Assert.Equal(4, payload.RootElement.GetProperty("data").GetProperty("bytes").GetInt32());
|
||||||
|
Assert.Equal(4, payload.RootElement.GetProperty("data").GetProperty("bufferedBytes").GetInt32());
|
||||||
|
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
@@ -104,9 +116,74 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.Equal("trans-follow-up", session.LastTransId);
|
Assert.Equal("trans-follow-up", session.LastTransId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
{
    // All text frames in this flow share the same socket identity; only the payload differs.
    WebSocketMessageEnvelope TextFrame(string json)
    {
        return new WebSocketMessageEnvelope
        {
            HostName = "neo-hub.jibo.com",
            Path = "/listen",
            Kind = "neo-hub-listen",
            Token = "hub-audio-token",
            Text = json
        };
    }

    // 1. LISTEN without a transcript leaves the turn pending.
    var listenReplies = await _service.HandleMessageAsync(
        TextFrame("""{"type":"LISTEN","transID":"trans-audio","data":{"rules":["wake-word"]}}"""));

    Assert.Single(listenReplies);
    Assert.Equal("OPENJIBO_TURN_PENDING", ReadReplyType(listenReplies[0]));

    // 2. CONTEXT supplies the synthetic transcript hint.
    var contextReplies = await _service.HandleMessageAsync(
        TextFrame("""{"type":"CONTEXT","transID":"trans-audio","data":{"topic":"conversation","audioTranscriptHint":"tell me a joke"}}"""));

    Assert.Single(contextReplies);
    Assert.Equal("OPENJIBO_CONTEXT_ACK", ReadReplyType(contextReplies[0]));

    // 3. A binary frame is buffered and acknowledged.
    var audioReplies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-audio-token",
        Binary = [1, 2, 3, 4, 5, 6]
    });

    Assert.Single(audioReplies);
    Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(audioReplies[0]));

    // 4. An empty CLIENT_ASR finalizes the turn using the hint.
    var finalizeReplies = await _service.HandleMessageAsync(
        TextFrame("""{"type":"CLIENT_ASR","transID":"trans-audio","data":{}}"""));

    Assert.Equal(3, finalizeReplies.Count);
    Assert.Equal("LISTEN", ReadReplyType(finalizeReplies[0]));
    Assert.Equal("EOS", ReadReplyType(finalizeReplies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(finalizeReplies[2]));

    using var listenPayload = JsonDocument.Parse(finalizeReplies[0].Text!);
    var listenData = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("tell me a joke", listenData.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("joke", listenData.GetProperty("nlu").GetProperty("intent").GetString());

    // Finalization must leave no buffered-audio state behind on the session.
    var session = _store.FindSessionByToken("hub-audio-token");
    Assert.NotNull(session);
    Assert.Equal(0, session!.BufferedAudioBytes);
    Assert.Equal(0, session.BufferedAudioChunkCount);
    Assert.False(session.Metadata.ContainsKey("audioTranscriptHint"));
}
|
||||||
|
|
||||||
[Theory]
|
[Theory]
|
||||||
[InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")]
|
[InlineData("fixtures\\neo-hub-client-asr-joke.flow.json")]
|
||||||
[InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")]
|
[InlineData("fixtures\\neo-hub-context-client-nlu.flow.json")]
|
||||||
|
[InlineData("fixtures\\neo-hub-buffered-audio-synthetic-asr.flow.json")]
|
||||||
public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath)
|
public async Task WebSocketFixture_ReplaysSuccessfully(string relativePath)
|
||||||
{
|
{
|
||||||
var fixture = WebSocketFixtureLoader.Load(relativePath);
|
var fixture = WebSocketFixtureLoader.Load(relativePath);
|
||||||
|
|||||||
Reference in New Issue
Block a user