fixes for next round of testing
This commit is contained in:
@@ -99,6 +99,12 @@ What remains unresolved:
|
|||||||
- the next post-`api-socket` startup requests and timing seen in successful Node runs
|
- the next post-`api-socket` startup requests and timing seen in successful Node runs
|
||||||
- broader live websocket behavior on a real robot beyond the current synthetic parity slice
|
- broader live websocket behavior on a real robot beyond the current synthetic parity slice
|
||||||
|
|
||||||
|
The current websocket bridge now also includes server-driven raw-audio turn completion:
|
||||||
|
|
||||||
|
- enough buffered audio plus `CONTEXT` can now trigger auto-finalize on the server side
|
||||||
|
- `EOS` is emitted on that auto-finalize path so turns do not remain open indefinitely
|
||||||
|
- transcript-less raw-audio turns still fall back to a synthetic compatibility response, not real ASR
|
||||||
|
|
||||||
## Important Docs
|
## Important Docs
|
||||||
|
|
||||||
- [Cloud overview](/src/Jibo.Cloud/README.md)
|
- [Cloud overview](/src/Jibo.Cloud/README.md)
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ The current .NET pass covers only a narrow, explicitly synthetic subset of obser
|
|||||||
- token/session tracking across websocket turns
|
- token/session tracking across websocket turns
|
||||||
- explicit per-turn state tracking for transID, rules, context, buffered audio, and finalize attempts
|
- explicit per-turn state tracking for transID, rules, context, buffered audio, and finalize attempts
|
||||||
- buffered audio accounting and turn-pending state
|
- buffered audio accounting and turn-pending state
|
||||||
|
- auto-finalize triggering for raw audio once `LISTEN`, `CONTEXT`, and minimum buffered-audio thresholds are present
|
||||||
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
- `LISTEN` message handling with synthetic `LISTEN` result payload shaping
|
||||||
- `CONTEXT` capture for turn/session state
|
- `CONTEXT` capture for turn/session state
|
||||||
- `CLIENT_NLU` turn completion using remembered listen/session metadata
|
- `CLIENT_NLU` turn completion using remembered listen/session metadata
|
||||||
@@ -81,6 +82,12 @@ This does not yet mean parity for:
|
|||||||
- multi-step skill lifecycles beyond the current synthetic playback response
|
- multi-step skill lifecycles beyond the current synthetic playback response
|
||||||
- broader interaction, animation, or ESML command families
|
- broader interaction, animation, or ESML command families
|
||||||
|
|
||||||
|
Current raw-audio fallback behavior remains explicitly synthetic:
|
||||||
|
|
||||||
|
- when a buffered-audio turn can be resolved through the synthetic transcript-hint seam, `.NET` now auto-finalizes and emits `LISTEN` + `EOS` + `SKILL_ACTION`
|
||||||
|
- when the turn crosses the finalize threshold without a usable transcript, `.NET` now emits a fallback `LISTEN` + `EOS` + generic `SKILL_ACTION` rather than leaving the robot hanging on an unfinished turn
|
||||||
|
- that fallback is a compatibility measure inspired by the Node oracle, not a claim of real ASR understanding
|
||||||
|
|
||||||
### Internal ASR Direction
|
### Internal ASR Direction
|
||||||
|
|
||||||
The current .NET websocket layer now separates:
|
The current .NET websocket layer now separates:
|
||||||
|
|||||||
@@ -71,6 +71,7 @@ Current websocket scope is still intentionally narrow:
|
|||||||
- explicit websocket turn-state tracking separate from long-lived cloud session state
|
- explicit websocket turn-state tracking separate from long-lived cloud session state
|
||||||
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
- synthetic `LISTEN` result shaping for `LISTEN`, `CLIENT_NLU`, and `CLIENT_ASR`
|
||||||
- buffered audio state tracking behind a dedicated turn-finalization layer
|
- buffered audio state tracking behind a dedicated turn-finalization layer
|
||||||
|
- raw audio auto-finalization once `LISTEN` + `CONTEXT` + minimum buffered audio thresholds are present
|
||||||
- synthetic STT strategy selection for fixture-driven audio turn completion
|
- synthetic STT strategy selection for fixture-driven audio turn completion
|
||||||
- structured websocket telemetry and live-run fixture export
|
- structured websocket telemetry and live-run fixture export
|
||||||
- `CONTEXT` capture and follow-up turn state
|
- `CONTEXT` capture and follow-up turn state
|
||||||
@@ -100,3 +101,9 @@ It has not yet confirmed:
|
|||||||
- full startup parity with the successful Node run cadence
|
- full startup parity with the successful Node run cadence
|
||||||
- consistent eye-open / wake completion on the robot
|
- consistent eye-open / wake completion on the robot
|
||||||
- the later health/log upload sequence currently seen in the working Node run
|
- the later health/log upload sequence currently seen in the working Node run
|
||||||
|
|
||||||
|
Current raw-audio behavior is still a compatibility bridge:
|
||||||
|
|
||||||
|
- if buffered audio has a synthetic transcript hint, the server now auto-finalizes the turn and emits `LISTEN` + `EOS` + `SKILL_ACTION`
|
||||||
|
- if buffered audio crosses the finalize threshold without a usable transcript, the server now emits a Node-style fallback completion with `EOS` instead of hanging the turn forever
|
||||||
|
- this is intentionally not a claim of real ASR parity
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ public sealed class JiboWebSocketService(
|
|||||||
|
|
||||||
if (envelope.IsBinary)
|
if (envelope.IsBinary)
|
||||||
{
|
{
|
||||||
var replies = turnFinalizationService.HandleBinaryAudio(session, envelope);
|
var replies = await turnFinalizationService.HandleBinaryAudioAsync(session, envelope, cancellationToken);
|
||||||
await telemetrySink.RecordTurnEventAsync(envelope, session, "binary_audio_received", new Dictionary<string, object?>
|
await telemetrySink.RecordTurnEventAsync(envelope, session, "binary_audio_received", new Dictionary<string, object?>
|
||||||
{
|
{
|
||||||
["bytes"] = envelope.Binary?.Length ?? 0
|
["bytes"] = envelope.Binary?.Length ?? 0
|
||||||
@@ -42,7 +42,7 @@ public sealed class JiboWebSocketService(
|
|||||||
|
|
||||||
if (parsedType == "CONTEXT")
|
if (parsedType == "CONTEXT")
|
||||||
{
|
{
|
||||||
var replies = turnFinalizationService.HandleContext(session, envelope.Text);
|
var replies = await turnFinalizationService.HandleContextAsync(session, envelope, cancellationToken);
|
||||||
await telemetrySink.RecordTurnEventAsync(envelope, session, "context_received", new Dictionary<string, object?>
|
await telemetrySink.RecordTurnEventAsync(envelope, session, "context_received", new Dictionary<string, object?>
|
||||||
{
|
{
|
||||||
["transID"] = session.TurnState.TransId
|
["transID"] = session.TurnState.TransId
|
||||||
|
|||||||
@@ -63,6 +63,50 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
|||||||
return messages;
|
return messages;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the synthetic reply sequence for a turn that is completed without a usable transcript.
/// </summary>
/// <param name="session">Session whose <c>SessionId</c> is echoed in the <c>EOS</c> message.</param>
/// <param name="transId">Transaction id written into each of the three messages.</param>
/// <param name="rules">
/// Listen rules echoed in the <c>nlu</c> section; the first entry (or an empty string when there is none)
/// is reported as the matched rule.
/// </param>
/// <returns>
/// Three serialized JSON messages, in order: <c>LISTEN</c>, <c>EOS</c>, and a generic <c>SKILL_ACTION</c>.
/// </returns>
public IReadOnlyList<string> MapFallback(CloudSession session, string transId, IReadOnlyList<string> rules)
{
    // Fixed placeholder values: the transcript is empty and the intent is always "heyJibo".
    var syntheticConfidence = 0.95;
    var syntheticIntent = "heyJibo";
    var matchedRule = rules.FirstOrDefault() ?? string.Empty;

    var asr = new
    {
        confidence = syntheticConfidence,
        final = true,
        text = string.Empty
    };
    var nlu = new
    {
        confidence = syntheticConfidence,
        intent = syntheticIntent,
        rules,
        entities = new Dictionary<string, object?>()
    };
    var match = new
    {
        intent = syntheticIntent,
        rule = matchedRule,
        score = syntheticConfidence
    };

    var listenJson = JsonSerializer.Serialize(new
    {
        type = "LISTEN",
        transID = transId,
        data = new { asr, nlu, match }
    });

    var eosJson = JsonSerializer.Serialize(new
    {
        type = "EOS",
        data = new
        {
            sessionId = session.SessionId,
            transID = transId
        }
    });

    var skillActionJson = JsonSerializer.Serialize(BuildGenericFallbackSkillPayload(transId));

    return [listenJson, eosJson, skillActionJson];
}
|
||||||
|
|
||||||
private static IReadOnlyList<string> ReadRules(TurnContext turn)
|
private static IReadOnlyList<string> ReadRules(TurnContext turn)
|
||||||
{
|
{
|
||||||
if (!turn.Attributes.TryGetValue("listenRules", out var value))
|
if (!turn.Attributes.TryGetValue("listenRules", out var value))
|
||||||
@@ -132,6 +176,52 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Creates the generic <c>SKILL_ACTION</c> payload used by the fallback reply sequence.
/// </summary>
/// <param name="transId">Transaction id copied into the payload's <c>transID</c> field.</param>
/// <returns>
/// An anonymous object carrying the current Unix-millisecond timestamp, a fresh <c>msg-</c> prefixed GUID,
/// and a final <c>chitchat-skill</c> action whose SLIM play config speaks a fixed "I heard you." line.
/// </returns>
private static object BuildGenericFallbackSkillPayload(string transId)
{
    // Assemble the nested structure from the innermost node outwards.
    var meta = new
    {
        prompt_id = "RUNTIME_PROMPT",
        prompt_sub_category = "AN",
        mim_id = "runtime-chat",
        mim_type = "announcement",
        intent = "unknown",
        transcript = string.Empty
    };

    var play = new
    {
        esml = "<speak><es cat='neutral' filter='!ssa-only, !sfx-only' endNeutral='true'>I heard you.</es></speak>",
        meta
    };

    var jcp = new
    {
        type = "SLIM",
        config = new { play }
    };

    var action = new
    {
        config = new { jcp }
    };

    var data = new
    {
        skill = new { id = "chitchat-skill" },
        action,
        analytics = new Dictionary<string, object?>(),
        final = true
    };

    return new
    {
        type = "SKILL_ACTION",
        ts = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
        msgID = $"msg-{Guid.NewGuid():N}",
        transID = transId,
        data
    };
}
|
||||||
|
|
||||||
private static string EscapeXml(string value)
|
private static string EscapeXml(string value)
|
||||||
{
|
{
|
||||||
return value
|
return value
|
||||||
|
|||||||
@@ -10,16 +10,28 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
ResponsePlanToSocketMessagesMapper replyMapper,
|
ResponsePlanToSocketMessagesMapper replyMapper,
|
||||||
ISttStrategySelector sttStrategySelector)
|
ISttStrategySelector sttStrategySelector)
|
||||||
{
|
{
|
||||||
public IReadOnlyList<WebSocketReply> HandleBinaryAudio(CloudSession session, WebSocketMessageEnvelope envelope)
|
private const int AutoFinalizeMinBufferedAudioBytes = 12000;
|
||||||
|
private const int AutoFinalizeMinBufferedAudioChunks = 5;
|
||||||
|
|
||||||
|
public async Task<IReadOnlyList<WebSocketReply>> HandleBinaryAudioAsync(
|
||||||
|
CloudSession session,
|
||||||
|
WebSocketMessageEnvelope envelope,
|
||||||
|
CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
var turnState = session.TurnState;
|
var turnState = session.TurnState;
|
||||||
session.LastMessageType = "BINARY_AUDIO";
|
session.LastMessageType = "BINARY_AUDIO";
|
||||||
|
turnState.FirstAudioReceivedUtc ??= DateTimeOffset.UtcNow;
|
||||||
turnState.BufferedAudioChunkCount += 1;
|
turnState.BufferedAudioChunkCount += 1;
|
||||||
turnState.BufferedAudioBytes += envelope.Binary?.Length ?? 0;
|
turnState.BufferedAudioBytes += envelope.Binary?.Length ?? 0;
|
||||||
turnState.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
|
turnState.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
|
||||||
turnState.AwaitingTurnCompletion = true;
|
turnState.AwaitingTurnCompletion = true;
|
||||||
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
|
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
|
||||||
|
|
||||||
|
if (ShouldAutoFinalize(session))
|
||||||
|
{
|
||||||
|
return await FinalizeTurnAsync(session, envelope, "AUTO_FINALIZE", allowFallbackOnMissingTranscript: true, cancellationToken);
|
||||||
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
[
|
[
|
||||||
new WebSocketReply
|
new WebSocketReply
|
||||||
@@ -39,19 +51,28 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
public IReadOnlyList<WebSocketReply> HandleContext(CloudSession session, string? text)
|
public async Task<IReadOnlyList<WebSocketReply>> HandleContextAsync(
|
||||||
|
CloudSession session,
|
||||||
|
WebSocketMessageEnvelope envelope,
|
||||||
|
CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
var turnState = session.TurnState;
|
var turnState = session.TurnState;
|
||||||
turnState.ContextPayload = ExtractDataPayload(text);
|
turnState.SawContext = true;
|
||||||
|
turnState.ContextPayload = ExtractDataPayload(envelope.Text);
|
||||||
session.Metadata["context"] = turnState.ContextPayload;
|
session.Metadata["context"] = turnState.ContextPayload;
|
||||||
|
|
||||||
if (TryReadContextProperty(text, "audioTranscriptHint", out var transcriptHint) &&
|
if (TryReadContextProperty(envelope.Text, "audioTranscriptHint", out var transcriptHint) &&
|
||||||
!string.IsNullOrWhiteSpace(transcriptHint))
|
!string.IsNullOrWhiteSpace(transcriptHint))
|
||||||
{
|
{
|
||||||
turnState.AudioTranscriptHint = transcriptHint;
|
turnState.AudioTranscriptHint = transcriptHint;
|
||||||
session.Metadata["audioTranscriptHint"] = transcriptHint;
|
session.Metadata["audioTranscriptHint"] = transcriptHint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ShouldAutoFinalize(session))
|
||||||
|
{
|
||||||
|
return await FinalizeTurnAsync(session, envelope, "AUTO_FINALIZE", allowFallbackOnMissingTranscript: true, cancellationToken);
|
||||||
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
[
|
[
|
||||||
new WebSocketReply
|
new WebSocketReply
|
||||||
@@ -76,58 +97,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
CancellationToken cancellationToken = default)
|
CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
PersistTurnHints(session, envelope.Text);
|
PersistTurnHints(session, envelope.Text);
|
||||||
|
return await FinalizeTurnAsync(session, envelope, messageType, allowFallbackOnMissingTranscript: false, cancellationToken);
|
||||||
var turn = turnContextMapper.MapListenMessage(envelope, session, messageType);
|
|
||||||
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
|
||||||
var turnState = session.TurnState;
|
|
||||||
if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
|
|
||||||
string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
|
|
||||||
{
|
|
||||||
turnState.AwaitingTurnCompletion = true;
|
|
||||||
if (turnState.BufferedAudioBytes > 0)
|
|
||||||
{
|
|
||||||
turnState.FinalizeAttemptCount += 1;
|
|
||||||
}
|
|
||||||
return
|
|
||||||
[
|
|
||||||
new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = JsonSerializer.Serialize(new
|
|
||||||
{
|
|
||||||
type = "OPENJIBO_TURN_PENDING",
|
|
||||||
data = new
|
|
||||||
{
|
|
||||||
sessionId = session.SessionId,
|
|
||||||
transID = session.LastTransId,
|
|
||||||
bufferedAudioBytes = turnState.BufferedAudioBytes,
|
|
||||||
bufferedAudioChunks = turnState.BufferedAudioChunkCount,
|
|
||||||
awaitingAudio = turnState.BufferedAudioBytes == 0,
|
|
||||||
awaitingTranscriptHint = turnState.BufferedAudioBytes > 0 && string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint),
|
|
||||||
finalizeAttempts = turnState.FinalizeAttemptCount
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
var plan = await conversationBroker.HandleTurnAsync(finalizedTurn, cancellationToken);
|
|
||||||
var listenAction = plan.Actions.OfType<ListenAction>().OrderBy(action => action.Sequence).LastOrDefault();
|
|
||||||
session.LastTranscript = finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript;
|
|
||||||
session.LastIntent = plan.IntentName;
|
|
||||||
session.LastListenType = listenAction?.Mode;
|
|
||||||
session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
|
|
||||||
? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
|
|
||||||
: null;
|
|
||||||
turnState.AwaitingTurnCompletion = false;
|
|
||||||
|
|
||||||
var emitSkillActions = messageType != "CLIENT_NLU";
|
|
||||||
var replies = replyMapper.Map(plan, finalizedTurn, session, emitSkillActions).Select(text => new WebSocketReply
|
|
||||||
{
|
|
||||||
Text = text
|
|
||||||
}).ToArray();
|
|
||||||
|
|
||||||
ResetBufferedAudio(session);
|
|
||||||
return replies;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private async Task<TurnContext> ResolveTranscriptAsync(TurnContext turn, CloudSession session, CancellationToken cancellationToken)
|
private async Task<TurnContext> ResolveTranscriptAsync(TurnContext turn, CloudSession session, CancellationToken cancellationToken)
|
||||||
@@ -200,6 +170,13 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
using var document = JsonDocument.Parse(text);
|
using var document = JsonDocument.Parse(text);
|
||||||
var root = document.RootElement;
|
var root = document.RootElement;
|
||||||
|
|
||||||
|
if (root.TryGetProperty("type", out var type) &&
|
||||||
|
type.ValueKind == JsonValueKind.String &&
|
||||||
|
string.Equals(type.GetString(), "LISTEN", StringComparison.OrdinalIgnoreCase))
|
||||||
|
{
|
||||||
|
turnState.SawListen = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (root.TryGetProperty("transID", out var transId) && transId.ValueKind == JsonValueKind.String)
|
if (root.TryGetProperty("transID", out var transId) && transId.ValueKind == JsonValueKind.String)
|
||||||
{
|
{
|
||||||
var nextTransId = transId.GetString();
|
var nextTransId = transId.GetString();
|
||||||
@@ -244,6 +221,7 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
session.TurnState.BufferedAudioBytes = 0;
|
session.TurnState.BufferedAudioBytes = 0;
|
||||||
session.TurnState.BufferedAudioChunkCount = 0;
|
session.TurnState.BufferedAudioChunkCount = 0;
|
||||||
|
session.TurnState.FirstAudioReceivedUtc = null;
|
||||||
session.TurnState.LastAudioReceivedUtc = null;
|
session.TurnState.LastAudioReceivedUtc = null;
|
||||||
session.TurnState.FinalizeAttemptCount = 0;
|
session.TurnState.FinalizeAttemptCount = 0;
|
||||||
session.Metadata.Remove("audioTranscriptHint");
|
session.Metadata.Remove("audioTranscriptHint");
|
||||||
@@ -254,14 +232,101 @@ public sealed class WebSocketTurnFinalizationService(
|
|||||||
turnState.TransId = transId;
|
turnState.TransId = transId;
|
||||||
turnState.ContextPayload = null;
|
turnState.ContextPayload = null;
|
||||||
turnState.AudioTranscriptHint = null;
|
turnState.AudioTranscriptHint = null;
|
||||||
|
turnState.FirstAudioReceivedUtc = null;
|
||||||
turnState.LastAudioReceivedUtc = null;
|
turnState.LastAudioReceivedUtc = null;
|
||||||
turnState.BufferedAudioChunkCount = 0;
|
turnState.BufferedAudioChunkCount = 0;
|
||||||
turnState.BufferedAudioBytes = 0;
|
turnState.BufferedAudioBytes = 0;
|
||||||
turnState.FinalizeAttemptCount = 0;
|
turnState.FinalizeAttemptCount = 0;
|
||||||
turnState.AwaitingTurnCompletion = false;
|
turnState.AwaitingTurnCompletion = false;
|
||||||
|
turnState.SawListen = false;
|
||||||
|
turnState.SawContext = false;
|
||||||
turnState.ListenRules = [];
|
turnState.ListenRules = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Attempts to complete the current turn and produces the replies to send back.
/// </summary>
/// <param name="session">Session whose turn state and last-turn fields are read and updated.</param>
/// <param name="envelope">Incoming message mapped into a turn context.</param>
/// <param name="messageType">
/// Message type passed to the turn mapper; <c>CLIENT_NLU</c> suppresses skill actions in the mapped replies.
/// </param>
/// <param name="allowFallbackOnMissingTranscript">
/// When true, a turn with no transcript but at least <c>AutoFinalizeMinBufferedAudioBytes</c> of buffered
/// audio is completed with the synthetic fallback replies instead of being left pending.
/// </param>
/// <param name="cancellationToken">Token forwarded to transcript resolution and the conversation broker.</param>
/// <returns>
/// The mapped plan replies when a transcript is available; otherwise the fallback replies or a single
/// <c>OPENJIBO_TURN_PENDING</c> reply.
/// </returns>
private async Task<IReadOnlyList<WebSocketReply>> FinalizeTurnAsync(
    CloudSession session,
    WebSocketMessageEnvelope envelope,
    string messageType,
    bool allowFallbackOnMissingTranscript,
    CancellationToken cancellationToken)
{
    var mappedTurn = turnContextMapper.MapListenMessage(envelope, session, messageType);
    var resolvedTurn = await ResolveTranscriptAsync(mappedTurn, session, cancellationToken);
    var state = session.TurnState;

    var hasTranscript = !string.IsNullOrWhiteSpace(resolvedTurn.NormalizedTranscript) ||
                        !string.IsNullOrWhiteSpace(resolvedTurn.RawTranscript);

    if (hasTranscript)
    {
        // Normal completion: let the broker plan the response and record the outcome on the session.
        var plan = await conversationBroker.HandleTurnAsync(resolvedTurn, cancellationToken);
        var lastListenAction = plan.Actions
            .OfType<ListenAction>()
            .OrderBy(action => action.Sequence)
            .LastOrDefault();

        session.LastTranscript = resolvedTurn.NormalizedTranscript ?? resolvedTurn.RawTranscript;
        session.LastIntent = plan.IntentName;
        session.LastListenType = lastListenAction?.Mode;
        session.FollowUpExpiresUtc = plan.FollowUp.KeepMicOpen
            ? DateTimeOffset.UtcNow.Add(plan.FollowUp.Timeout)
            : null;
        state.AwaitingTurnCompletion = false;

        var includeSkillActions = messageType != "CLIENT_NLU";
        var planReplies = replyMapper
            .Map(plan, resolvedTurn, session, includeSkillActions)
            .Select(text => new WebSocketReply { Text = text })
            .ToArray();

        ResetBufferedAudio(session);
        return planReplies;
    }

    // No transcript: the turn stays open unless the fallback below applies.
    state.AwaitingTurnCompletion = true;
    if (state.BufferedAudioBytes > 0)
    {
        state.FinalizeAttemptCount += 1;
    }

    var fallbackApplies = allowFallbackOnMissingTranscript &&
                          state.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes;

    if (!fallbackApplies)
    {
        // Report the buffered-audio state so the caller can see why the turn is still pending.
        var pendingPayload = new
        {
            type = "OPENJIBO_TURN_PENDING",
            data = new
            {
                sessionId = session.SessionId,
                transID = session.LastTransId,
                bufferedAudioBytes = state.BufferedAudioBytes,
                bufferedAudioChunks = state.BufferedAudioChunkCount,
                awaitingAudio = state.BufferedAudioBytes == 0,
                awaitingTranscriptHint = state.BufferedAudioBytes > 0 && string.IsNullOrWhiteSpace(state.AudioTranscriptHint),
                finalizeAttempts = state.FinalizeAttemptCount
            }
        };

        return
        [
            new WebSocketReply
            {
                Text = JsonSerializer.Serialize(pendingPayload)
            }
        ];
    }

    // Fallback completion: close the turn with the synthetic reply sequence.
    state.AwaitingTurnCompletion = false;
    session.LastTranscript = string.Empty;
    session.LastIntent = "heyJibo";
    session.LastListenType = "fallback";

    var fallbackTransId = state.TransId ?? session.LastTransId ?? string.Empty;
    var fallbackReplies = replyMapper
        .MapFallback(session, fallbackTransId, state.ListenRules)
        .Select(text => new WebSocketReply { Text = text })
        .ToArray();

    ResetBufferedAudio(session);
    return fallbackReplies;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Decides whether the session's current turn meets every condition for automatic finalization.
/// </summary>
/// <param name="session">Session whose <c>TurnState</c> is inspected.</param>
/// <returns>
/// <c>true</c> only when the turn is awaiting completion, both <c>LISTEN</c> and <c>CONTEXT</c> have been
/// seen, and the buffered chunk count and byte count have each reached their minimum.
/// </returns>
private static bool ShouldAutoFinalize(CloudSession session)
{
    var state = session.TurnState;

    // All three flags must be set before the audio thresholds are considered.
    if (!state.AwaitingTurnCompletion || !state.SawListen || !state.SawContext)
    {
        return false;
    }

    if (state.BufferedAudioChunkCount < AutoFinalizeMinBufferedAudioChunks)
    {
        return false;
    }

    return state.BufferedAudioBytes >= AutoFinalizeMinBufferedAudioBytes;
}
|
||||||
|
|
||||||
private static string? ExtractDataPayload(string? text)
|
private static string? ExtractDataPayload(string? text)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(text))
|
if (string.IsNullOrWhiteSpace(text))
|
||||||
|
|||||||
@@ -5,10 +5,13 @@ public sealed class WebSocketTurnState
|
|||||||
public string? TransId { get; set; }
|
public string? TransId { get; set; }
|
||||||
public string? ContextPayload { get; set; }
|
public string? ContextPayload { get; set; }
|
||||||
public string? AudioTranscriptHint { get; set; }
|
public string? AudioTranscriptHint { get; set; }
|
||||||
|
public DateTimeOffset? FirstAudioReceivedUtc { get; set; }
|
||||||
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
|
||||||
public int BufferedAudioChunkCount { get; set; }
|
public int BufferedAudioChunkCount { get; set; }
|
||||||
public int BufferedAudioBytes { get; set; }
|
public int BufferedAudioBytes { get; set; }
|
||||||
public int FinalizeAttemptCount { get; set; }
|
public int FinalizeAttemptCount { get; set; }
|
||||||
public bool AwaitingTurnCompletion { get; set; }
|
public bool AwaitingTurnCompletion { get; set; }
|
||||||
|
public bool SawListen { get; set; }
|
||||||
|
public bool SawContext { get; set; }
|
||||||
public IReadOnlyList<string> ListenRules { get; set; } = [];
|
public IReadOnlyList<string> ListenRules { get; set; } = [];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -74,6 +74,118 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
Assert.Equal(1, payload.RootElement.GetProperty("data").GetProperty("bufferedChunks").GetInt32());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
public async Task BufferedAudio_WithContextAndTranscriptHint_AutoFinalizesAfterThreshold()
{
    // Every message in this scenario is sent with the same routing fields and token.
    WebSocketMessageEnvelope TextMessage(string json) => new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-auto-finalize-token",
        Text = json
    };

    WebSocketMessageEnvelope AudioChunk() => new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-auto-finalize-token",
        Binary = new byte[3000]
    };

    // Open the turn, then provide context that carries a transcript hint.
    await _service.HandleMessageAsync(TextMessage("""{"type":"LISTEN","transID":"trans-auto","data":{"rules":["launch"]}}"""));
    await _service.HandleMessageAsync(TextMessage("""{"type":"CONTEXT","transID":"trans-auto","data":{"audioTranscriptHint":"tell me a joke"}}"""));

    // The first four chunks are only acknowledged.
    for (var sent = 1; sent <= 4; sent++)
    {
        IReadOnlyList<WebSocketReply> acknowledgement = await _service.HandleMessageAsync(AudioChunk());

        var onlyReply = Assert.Single(acknowledgement);
        Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(onlyReply));
    }

    // The fifth chunk completes the turn.
    IReadOnlyList<WebSocketReply> finalReplies = await _service.HandleMessageAsync(AudioChunk());

    Assert.Equal(3, finalReplies.Count);
    Assert.Equal("LISTEN", ReadReplyType(finalReplies[0]));
    Assert.Equal("EOS", ReadReplyType(finalReplies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(finalReplies[2]));

    using var listenPayload = JsonDocument.Parse(finalReplies[0].Text!);
    var listenData = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("tell me a joke", listenData.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("joke", listenData.GetProperty("nlu").GetProperty("intent").GetString());
}
|
||||||
|
|
||||||
|
[Fact]
public async Task BufferedAudio_WithoutTranscriptHint_AutoFinalizesWithFallbackAndEos()
{
    // Every message in this scenario is sent with the same routing fields and token.
    WebSocketMessageEnvelope TextMessage(string json) => new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-auto-fallback-token",
        Text = json
    };

    WebSocketMessageEnvelope AudioChunk() => new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-auto-fallback-token",
        Binary = new byte[3000]
    };

    // Open the turn, then provide context that has no transcript hint.
    await _service.HandleMessageAsync(TextMessage("""{"type":"LISTEN","transID":"trans-auto-fallback","data":{"rules":["launch"]}}"""));
    await _service.HandleMessageAsync(TextMessage("""{"type":"CONTEXT","transID":"trans-auto-fallback","data":{"topic":"conversation"}}"""));

    // The first four chunks are only acknowledged.
    for (var sent = 1; sent <= 4; sent++)
    {
        IReadOnlyList<WebSocketReply> acknowledgement = await _service.HandleMessageAsync(AudioChunk());

        var onlyReply = Assert.Single(acknowledgement);
        Assert.Equal("OPENJIBO_AUDIO_RECEIVED", ReadReplyType(onlyReply));
    }

    // The fifth chunk completes the turn through the fallback sequence.
    IReadOnlyList<WebSocketReply> finalReplies = await _service.HandleMessageAsync(AudioChunk());

    Assert.Equal(3, finalReplies.Count);
    Assert.Equal("LISTEN", ReadReplyType(finalReplies[0]));
    Assert.Equal("EOS", ReadReplyType(finalReplies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(finalReplies[2]));

    using var listenPayload = JsonDocument.Parse(finalReplies[0].Text!);
    var listenData = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("heyJibo", listenData.GetProperty("nlu").GetProperty("intent").GetString());
    Assert.Equal(string.Empty, listenData.GetProperty("asr").GetProperty("text").GetString());
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task MultiChunkAudio_AccumulatesBufferedStateAcrossMessages()
|
public async Task MultiChunkAudio_AccumulatesBufferedStateAcrossMessages()
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user