enhanced skill and yes/no routing

This commit is contained in:
Jacob Dubin
2026-04-18 16:29:27 -05:00
parent faf021eb89
commit 83a9350a9d
13 changed files with 455 additions and 29 deletions

View File

@@ -108,6 +108,9 @@ Current raw-audio behavior is still a compatibility bridge:
- if buffered audio has a synthetic transcript hint, the server now auto-finalizes the turn and emits `LISTEN` + `EOS` + `SKILL_ACTION`
- if buffered audio crosses the finalize threshold without a usable transcript, the server now emits a Node-style fallback completion with `EOS` instead of hanging the turn forever
- this is intentionally not a claim of real ASR parity
- follow-up turns now preserve enough constraint state to distinguish yes/no-style replies from ordinary free-form chat
- create-flow yes/no turns now preserve `create/is_it_a_keeper` and `domain=create` in the outbound synthetic `LISTEN` payload
- phrase matching has been widened slightly for known test prompts such as joke, dance, surprise, weather, calendar, commute, and news variants
## Buffered Audio STT
@@ -138,6 +141,13 @@ Configuration lives under `OpenJibo:Stt`:
This is not yet a claim of production-ready onboard ASR. It is a `.NET` discovery seam that keeps us compatible with the Node oracle while we evaluate longer-term options such as Azure-hosted STT or a managed decode/transcribe stack.
Latest live-capture guidance after the `2026-04-18` round:
- prefer synthetic transcript hints when they are present in the observed turn
- only use local `whisper.cpp` when the configured tool paths are real and the decode chain is behaving
- treat `ffmpeg` decode failures on normalized Ogg captures as evidence that the local audio path still needs more hardening before it can be the default live-test expectation
- keep the Node implementation as the oracle for yes/no turn semantics and audio preprocessing details until the `.NET` port catches up
## Current Interaction Paths
The working cloud model currently looks like three main paths:

View File

@@ -1,5 +1,6 @@
using Jibo.Cloud.Application.Abstractions;
using Jibo.Runtime.Abstractions;
using System.Text.Json;
namespace Jibo.Cloud.Application.Services;
@@ -15,8 +16,9 @@ public sealed class JiboInteractionService(
var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent)
? rawClientIntent?.ToString()
: null;
var isYesNoTurn = IsYesNoTurn(turn);
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent);
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent, isYesNoTurn);
return semanticIntent switch
{
"joke" => BuildJokeDecision(catalog),
@@ -25,6 +27,8 @@ public sealed class JiboInteractionService(
"date" => new JiboInteractionDecision("date", $"Today is {DateTime.Now:dddd, MMMM d}."),
"hello" => new JiboInteractionDecision("hello", randomizer.Choose(catalog.GreetingReplies)),
"how_are_you" => new JiboInteractionDecision("how_are_you", randomizer.Choose(catalog.HowAreYouReplies)),
"yes" => new JiboInteractionDecision("yes", "Yes."),
"no" => new JiboInteractionDecision("no", "No."),
"surprise" => new JiboInteractionDecision("surprise", randomizer.Choose(catalog.SurpriseReplies)),
"personal_report" => new JiboInteractionDecision("personal_report", randomizer.Choose(catalog.PersonalReportReplies)),
"weather" => new JiboInteractionDecision("weather", randomizer.Choose(catalog.WeatherReplies)),
@@ -86,7 +90,7 @@ public sealed class JiboInteractionService(
.Replace("{transcript}", transcript, StringComparison.Ordinal);
}
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent)
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent, bool isYesNoTurn)
{
if (string.Equals(clientIntent, "askForTime", StringComparison.OrdinalIgnoreCase))
{
@@ -98,72 +102,112 @@ public sealed class JiboInteractionService(
return "date";
}
if (loweredTranscript.Contains("joke", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "joke", "funny", "make me laugh"))
{
return "joke";
}
if (loweredTranscript.Contains("dance", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "dance", "boogie"))
{
return "dance";
}
if (loweredTranscript.Contains("surprise", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "surprise", "surprise me", "show me something fun"))
{
return "surprise";
}
if (loweredTranscript.Contains("personal report", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "personal report", "my report", "daily report", "my update"))
{
return "personal_report";
}
if (loweredTranscript.Contains("weather", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "weather", "forecast", "weather report", "is it raining"))
{
return "weather";
}
if (loweredTranscript.Contains("calendar", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "calendar", "schedule", "what's on my calendar", "what is on my calendar"))
{
return "calendar";
}
if (loweredTranscript.Contains("commute", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "commute", "traffic", "drive to work", "how long to work"))
{
return "commute";
}
if (loweredTranscript.Contains("news", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "news", "headlines", "news update", "tell me the news"))
{
return "news";
}
if (loweredTranscript.Contains("how are you", StringComparison.Ordinal) ||
loweredTranscript.Contains("what's up", StringComparison.Ordinal) ||
loweredTranscript.Contains("what s up", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "how are you", "what's up", "what s up", "what up"))
{
return "how_are_you";
}
if (loweredTranscript.Contains("hello", StringComparison.Ordinal) ||
loweredTranscript.Contains("hi", StringComparison.Ordinal) ||
loweredTranscript.Contains("hey", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "hello", "hi", "hey"))
{
return "hello";
}
if (loweredTranscript.Contains("time", StringComparison.Ordinal))
if (isYesNoTurn && MatchesAny(loweredTranscript, "yes", "yeah", "yup", "sure", "uh huh"))
{
return "yes";
}
if (isYesNoTurn && MatchesAny(loweredTranscript, "no", "nope", "nah"))
{
return "no";
}
if (MatchesAny(loweredTranscript, "what time is it", "current time", "the time", "time is it") ||
loweredTranscript.Contains("time", StringComparison.Ordinal))
{
return "time";
}
if (loweredTranscript.Contains("date", StringComparison.Ordinal) || loweredTranscript.Contains("day", StringComparison.Ordinal))
if (MatchesAny(loweredTranscript, "what day is it", "what is the date", "today s date", "today's date") ||
loweredTranscript.Contains("date", StringComparison.Ordinal) ||
loweredTranscript.Contains("day", StringComparison.Ordinal))
{
return "date";
}
return "chat";
}
/// <summary>
/// Reports whether the turn carries a rule that marks it as a yes/no prompt.
/// </summary>
/// <param name="turn">Turn whose "listenRules" and "clientRules" attributes are inspected.</param>
/// <returns>
/// <c>true</c> when either attribute contains "$YESNO" or "create/is_it_a_keeper"
/// (compared case-insensitively); otherwise <c>false</c>.
/// </returns>
private static bool IsYesNoTurn(TurnContext turn)
{
    var listenRules = ReadRules(turn, "listenRules");
    var clientRules = ReadRules(turn, "clientRules");

    foreach (var rule in listenRules.Concat(clientRules))
    {
        if (string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        if (string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Reads a list of rule names stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name to look up.</param>
/// <returns>
/// The stored sequence when it is already a sequence of strings; the string items of a JSON
/// array when the value is a <see cref="JsonElement"/> array (non-string items are skipped);
/// an empty sequence when the key is missing, the value is null, or it has any other shape.
/// </returns>
private static IEnumerable<string> ReadRules(TurnContext turn, string key)
{
    if (turn.Attributes.TryGetValue(key, out var raw))
    {
        switch (raw)
        {
            // Covers IReadOnlyList<string> as well as any other string sequence.
            case IEnumerable<string> ruleNames:
                return ruleNames;

            case JsonElement { ValueKind: JsonValueKind.Array } array:
                return array.EnumerateArray()
                    .Where(static element => element.ValueKind == JsonValueKind.String)
                    .Select(static element => element.GetString() ?? string.Empty);
        }
    }

    // Missing key, null value, or an unsupported value shape.
    return [];
}
/// <summary>
/// Reports whether the transcript contains at least one of the candidate phrases.
/// </summary>
/// <param name="loweredTranscript">Transcript text; compared ordinally, so callers pass it already lower-cased.</param>
/// <param name="candidates">Phrases to look for.</param>
/// <returns><c>true</c> if any candidate occurs as a substring of the transcript.</returns>
/// <remarks>
/// Matching is plain ordinal substring search, so a short candidate such as "hi" also
/// matches inside longer words (for example "this").
/// </remarks>
private static bool MatchesAny(string loweredTranscript, params string[] candidates)
{
    return Array.Exists(
        candidates,
        phrase => loweredTranscript.Contains(phrase, StringComparison.Ordinal));
}
}
public sealed record JiboInteractionDecision(

View File

@@ -17,13 +17,20 @@ public sealed class ResponsePlanToSocketMessagesMapper
var transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty;
var clientIntent = ReadAttribute(turn, "clientIntent");
var rules = ReadRules(turn, messageType);
var yesNoCreateRule = ReadYesNoCreateRule(turn);
var isYesNoTurn = !string.IsNullOrWhiteSpace(yesNoCreateRule);
var isYesNoIntent = string.Equals(plan.IntentName, "yes", StringComparison.OrdinalIgnoreCase) ||
string.Equals(plan.IntentName, "no", StringComparison.OrdinalIgnoreCase);
var outboundIntent = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
? clientIntent
: plan.IntentName ?? "unknown";
var outboundAsrText = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
? clientIntent
: transcript;
var entities = ReadEntities(turn, messageType);
var outboundAsrText = isYesNoTurn && isYesNoIntent
? transcript
: string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
? clientIntent
: transcript;
var outboundRules = isYesNoTurn && isYesNoIntent ? [yesNoCreateRule!] : rules;
var entities = ReadEntities(turn, messageType, isYesNoTurn && isYesNoIntent);
var messages = new List<SocketReplyPlan>
{
new(JsonSerializer.Serialize(new
@@ -42,13 +49,13 @@ public sealed class ResponsePlanToSocketMessagesMapper
{
confidence = 0.95,
intent = outboundIntent,
rules,
rules = outboundRules,
entities
},
match = new
{
intent = outboundIntent,
rule = rules.FirstOrDefault() ?? string.Empty,
rule = outboundRules.FirstOrDefault() ?? string.Empty,
score = 0.95
}
}
@@ -135,8 +142,16 @@ public sealed class ResponsePlanToSocketMessagesMapper
};
}
private static object ReadEntities(TurnContext turn, string? messageType)
private static object ReadEntities(TurnContext turn, string? messageType, bool yesNoCreateTurn)
{
if (yesNoCreateTurn)
{
return new Dictionary<string, object?>
{
["domain"] = "create"
};
}
if (!string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase))
{
return new Dictionary<string, object?>();
@@ -155,6 +170,35 @@ public sealed class ResponsePlanToSocketMessagesMapper
};
}
/// <summary>
/// Finds the "create/is_it_a_keeper" rule among the rule values attached to the turn.
/// </summary>
/// <param name="turn">Turn whose rule attributes are searched.</param>
/// <returns>
/// The first rule value equal to "create/is_it_a_keeper" (case-insensitive), returned as it
/// appears in the attributes, or <c>null</c> when no such rule is present.
/// </returns>
private static string? ReadYesNoCreateRule(TurnContext turn)
{
    foreach (var candidate in ReadRuleValues(turn))
    {
        if (string.Equals(candidate, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase))
        {
            return candidate;
        }
    }

    return null;
}
/// <summary>
/// Returns the turn's "listenRules" values followed by its "clientRules" values.
/// </summary>
/// <param name="turn">Turn whose rule attributes are read.</param>
/// <returns>A single sequence with listen rules first, then client rules.</returns>
private static IEnumerable<string> ReadRuleValues(TurnContext turn)
{
    var listenRules = ReadRuleValues(turn, "listenRules");
    var clientRules = ReadRuleValues(turn, "clientRules");
    return listenRules.Concat(clientRules);
}
/// <summary>
/// Reads the rule values stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name to look up.</param>
/// <returns>
/// The stored string sequence, or the string items of a JSON array value (non-string items
/// are skipped), or an empty sequence when the key is absent, null, or of another shape.
/// </returns>
private static IEnumerable<string> ReadRuleValues(TurnContext turn, string key)
{
    if (!turn.Attributes.TryGetValue(key, out var stored))
    {
        return [];
    }

    // Any string sequence (including IReadOnlyList<string>) is passed through unchanged.
    if (stored is IEnumerable<string> ruleValues)
    {
        return ruleValues;
    }

    if (stored is JsonElement { ValueKind: JsonValueKind.Array } jsonArray)
    {
        return jsonArray.EnumerateArray()
            .Where(static entry => entry.ValueKind == JsonValueKind.String)
            .Select(static entry => entry.GetString() ?? string.Empty);
    }

    // Null or unsupported value shape.
    return [];
}
private static string? ReadAttribute(TurnContext turn, string key)
{
return turn.Attributes.TryGetValue(key, out var value)

View File

@@ -2,6 +2,7 @@ using System.Text.Json;
using Jibo.Cloud.Application.Abstractions;
using Jibo.Cloud.Domain.Models;
using Jibo.Runtime.Abstractions;
using System.Text.RegularExpressions;
namespace Jibo.Cloud.Application.Services;
@@ -302,6 +303,32 @@ public sealed class WebSocketTurnFinalizationService(
{
var turn = ProtocolToTurnContextMapper.MapListenMessage(envelope, session, messageType);
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
if (!IsTranscriptUsable(finalizedTurn))
{
finalizedTurn = new TurnContext
{
TurnId = finalizedTurn.TurnId,
SessionId = finalizedTurn.SessionId,
TimestampUtc = finalizedTurn.TimestampUtc,
InputMode = finalizedTurn.InputMode,
SourceKind = finalizedTurn.SourceKind,
WakePhrase = finalizedTurn.WakePhrase,
RawTranscript = null,
NormalizedTranscript = null,
DeviceId = finalizedTurn.DeviceId,
HostName = finalizedTurn.HostName,
RequestId = finalizedTurn.RequestId,
ProtocolService = finalizedTurn.ProtocolService,
ProtocolOperation = finalizedTurn.ProtocolOperation,
FirmwareVersion = finalizedTurn.FirmwareVersion,
ApplicationVersion = finalizedTurn.ApplicationVersion,
Locale = finalizedTurn.Locale,
TimeZone = finalizedTurn.TimeZone,
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
Attributes = finalizedTurn.Attributes
};
}
var turnState = session.TurnState;
if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
@@ -460,4 +487,63 @@ public sealed class WebSocketTurnFinalizationService(
return false;
}
}
/// <summary>
/// Decides whether the turn's transcript is worth routing.
/// </summary>
/// <param name="turn">
/// Turn to inspect; the normalized transcript is used when it is non-null, otherwise the raw one.
/// </param>
/// <returns>
/// <c>false</c> for a blank transcript; <c>true</c> when the normalized text has six or more
/// characters, is one of the known short commands, or is a short yes/no reply on a turn that
/// <c>IsYesNoTurn</c> accepts; otherwise <c>false</c>.
/// </returns>
private static bool IsTranscriptUsable(TurnContext turn)
{
    var text = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    // Anything of six or more characters is accepted without further checks.
    const int minimumFreeFormLength = 6;
    if (text.Length >= minimumFreeFormLength)
    {
        return true;
    }

    // Short transcripts are accepted only when they are an exact known phrase.
    var isShortCommand = text is "joke" or "dance" or "time" or "date" or "today" or "day" or "hello" or "hi" or "hey";
    if (isShortCommand)
    {
        return true;
    }

    // Short confirmations count only when the turn is a yes/no prompt.
    var isShortReply = text is "yes" or "no" or "sure" or "nope" or "yup" or "uh huh" or "yeah" or "nah";
    return isShortReply && IsYesNoTurn(turn);
}
/// <summary>
/// Reports whether the turn carries a rule that marks it as a yes/no prompt.
/// </summary>
/// <param name="turn">Turn whose "listenRules" and "clientRules" attributes are inspected.</param>
/// <returns>
/// <c>true</c> when any rule equals "$YESNO" or "create/is_it_a_keeper", ignoring case.
/// </returns>
private static bool IsYesNoTurn(TurnContext turn)
{
    var attachedRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules"));

    return attachedRules.Any(static rule => IsYesNoRule(rule));

    // The two rule names that identify a yes/no prompt.
    static bool IsYesNoRule(string rule) =>
        string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Reads a list of rule names stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name to look up.</param>
/// <returns>
/// The stored string sequence; or, for a <see cref="JsonElement"/> array, its string items
/// (other item kinds are dropped); or an empty sequence for a missing key, a null value,
/// or any other value type.
/// </returns>
private static IEnumerable<string> ReadRules(TurnContext turn, string key)
{
    if (!turn.Attributes.TryGetValue(key, out var attribute))
    {
        return [];
    }

    // Already-typed string sequences (IReadOnlyList<string> included) are returned as-is.
    if (attribute is IEnumerable<string> typedRules)
    {
        return typedRules;
    }

    if (attribute is JsonElement { ValueKind: JsonValueKind.Array } jsonRules)
    {
        return jsonRules.EnumerateArray()
            .Where(static node => node.ValueKind == JsonValueKind.String)
            .Select(static node => node.GetString() ?? string.Empty);
    }

    return [];
}
/// <summary>
/// Produces a lower-cased, punctuation-free form of a transcript with single spaces between words.
/// </summary>
/// <param name="transcript">Transcript text; may be null or blank.</param>
/// <returns>
/// An empty string for null or blank input; otherwise the invariant-lower-cased text with every
/// character that is neither a word character nor whitespace replaced by a space, runs of
/// whitespace collapsed to one space, and leading/trailing whitespace removed.
/// </returns>
private static string NormalizeTranscript(string? transcript)
{
    if (string.IsNullOrWhiteSpace(transcript))
    {
        return string.Empty;
    }

    var withoutPunctuation = Regex.Replace(transcript.ToLowerInvariant(), @"[^\w\s]", " ");

    // Replacing punctuation with spaces can leave runs of several spaces ("yes... no").
    // Collapse every whitespace run so exact-phrase comparisons on the result are reliable.
    return Regex.Replace(withoutPunctuation, @"\s+", " ").Trim();
}
}

View File

@@ -12,9 +12,9 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
public bool CanHandle(TurnContext turn)
{
return options.EnableLocalWhisperCpp &&
!string.IsNullOrWhiteSpace(options.FfmpegPath) &&
!string.IsNullOrWhiteSpace(options.WhisperCliPath) &&
!string.IsNullOrWhiteSpace(options.WhisperModelPath) &&
IsConfiguredPathAvailable(options.FfmpegPath, checkFileExists: false) &&
IsConfiguredPathAvailable(options.WhisperCliPath, checkFileExists: true) &&
IsConfiguredPathAvailable(options.WhisperModelPath, checkFileExists: true) &&
ReadBufferedAudioFrames(turn).Count > 0;
}
@@ -148,4 +148,19 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
// Best-effort cleanup only.
}
}
/// <summary>
/// Checks whether a configured tool or model path looks usable.
/// </summary>
/// <param name="path">Configured path; null or blank means "not configured".</param>
/// <param name="checkFileExists">
/// When <c>true</c>, a rooted path must point at an existing file.
/// </param>
/// <returns>
/// <c>false</c> for a null or blank path; <c>true</c> for any non-rooted path (no file-system
/// check is made for those); for a rooted path, the result of <see cref="File.Exists(string)"/>
/// when <paramref name="checkFileExists"/> is set, otherwise <c>true</c>.
/// </returns>
private static bool IsConfiguredPathAvailable(string? path, bool checkFileExists) =>
    !string.IsNullOrWhiteSpace(path) &&
    (!Path.IsPathRooted(path) || !checkFileExists || File.Exists(path));
}

View File

@@ -31,8 +31,8 @@ public static class ServiceCollectionExtensions
services.AddSingleton<JiboInteractionService>();
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
services.AddSingleton<IExternalProcessRunner, ExternalProcessRunner>();
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();
services.AddSingleton<ISttStrategy, SyntheticBufferedAudioSttStrategy>();
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();
services.AddSingleton<ISttStrategySelector, DefaultSttStrategySelector>();
services.AddSingleton<IWebSocketTelemetrySink, FileWebSocketTelemetrySink>();
services.AddSingleton<IProtocolTelemetrySink, FileProtocolTelemetrySink>();