enhanced skill and yes/no routing
This commit is contained in:
@@ -108,6 +108,9 @@ Current raw-audio behavior is still a compatibility bridge:
|
||||
- if buffered audio has a synthetic transcript hint, the server now auto-finalizes the turn and emits `LISTEN` + `EOS` + `SKILL_ACTION`
|
||||
- if buffered audio crosses the finalize threshold without a usable transcript, the server now emits a Node-style fallback completion with `EOS` instead of hanging the turn forever
|
||||
- this is intentionally not a claim of real ASR parity
|
||||
- follow-up turns now preserve enough constraint state to distinguish yes/no-style replies from ordinary free-form chat
|
||||
- create-flow yes/no turns now preserve `create/is_it_a_keeper` and `domain=create` in the outbound synthetic `LISTEN` payload
|
||||
- phrase matching has been widened slightly for known test prompts such as joke, dance, surprise, weather, calendar, commute, and news variants
|
||||
|
||||
## Buffered Audio STT
|
||||
|
||||
@@ -138,6 +141,13 @@ Configuration lives under `OpenJibo:Stt`:
|
||||
|
||||
This is not yet a claim of production-ready onboard ASR. It is a `.NET` discovery seam that keeps us compatible with the Node oracle while we evaluate longer-term options such as Azure-hosted STT or a managed decode/transcribe stack.
|
||||
|
||||
Latest live-capture guidance after the `2026-04-18` round:
|
||||
|
||||
- prefer synthetic transcript hints when they are present in the observed turn
|
||||
- only use local `whisper.cpp` when the configured tool paths are real and the decode chain is behaving
|
||||
- treat `ffmpeg` decode failures on normalized Ogg captures as evidence that the local audio path still needs more hardening before it can be the default live-test expectation
|
||||
- keep the Node implementation as the oracle for yes/no turn semantics and audio preprocessing details until the `.NET` port catches up
|
||||
|
||||
## Current Interaction Paths
|
||||
|
||||
The working cloud model currently looks like three main paths:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using Jibo.Cloud.Application.Abstractions;
|
||||
using Jibo.Runtime.Abstractions;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace Jibo.Cloud.Application.Services;
|
||||
|
||||
@@ -15,8 +16,9 @@ public sealed class JiboInteractionService(
|
||||
var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent)
|
||||
? rawClientIntent?.ToString()
|
||||
: null;
|
||||
var isYesNoTurn = IsYesNoTurn(turn);
|
||||
|
||||
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent);
|
||||
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent, isYesNoTurn);
|
||||
return semanticIntent switch
|
||||
{
|
||||
"joke" => BuildJokeDecision(catalog),
|
||||
@@ -25,6 +27,8 @@ public sealed class JiboInteractionService(
|
||||
"date" => new JiboInteractionDecision("date", $"Today is {DateTime.Now:dddd, MMMM d}."),
|
||||
"hello" => new JiboInteractionDecision("hello", randomizer.Choose(catalog.GreetingReplies)),
|
||||
"how_are_you" => new JiboInteractionDecision("how_are_you", randomizer.Choose(catalog.HowAreYouReplies)),
|
||||
"yes" => new JiboInteractionDecision("yes", "Yes."),
|
||||
"no" => new JiboInteractionDecision("no", "No."),
|
||||
"surprise" => new JiboInteractionDecision("surprise", randomizer.Choose(catalog.SurpriseReplies)),
|
||||
"personal_report" => new JiboInteractionDecision("personal_report", randomizer.Choose(catalog.PersonalReportReplies)),
|
||||
"weather" => new JiboInteractionDecision("weather", randomizer.Choose(catalog.WeatherReplies)),
|
||||
@@ -86,7 +90,7 @@ public sealed class JiboInteractionService(
|
||||
.Replace("{transcript}", transcript, StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent)
|
||||
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent, bool isYesNoTurn)
|
||||
{
|
||||
if (string.Equals(clientIntent, "askForTime", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
@@ -98,72 +102,112 @@ public sealed class JiboInteractionService(
|
||||
return "date";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("joke", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "joke", "funny", "make me laugh"))
|
||||
{
|
||||
return "joke";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("dance", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "dance", "boogie"))
|
||||
{
|
||||
return "dance";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("surprise", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "surprise", "surprise me", "show me something fun"))
|
||||
{
|
||||
return "surprise";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("personal report", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "personal report", "my report", "daily report", "my update"))
|
||||
{
|
||||
return "personal_report";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("weather", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "weather", "forecast", "weather report", "is it raining"))
|
||||
{
|
||||
return "weather";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("calendar", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "calendar", "schedule", "what's on my calendar", "what is on my calendar"))
|
||||
{
|
||||
return "calendar";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("commute", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "commute", "traffic", "drive to work", "how long to work"))
|
||||
{
|
||||
return "commute";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("news", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "news", "headlines", "news update", "tell me the news"))
|
||||
{
|
||||
return "news";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("how are you", StringComparison.Ordinal) ||
|
||||
loweredTranscript.Contains("what's up", StringComparison.Ordinal) ||
|
||||
loweredTranscript.Contains("what s up", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "how are you", "what's up", "what s up", "what up"))
|
||||
{
|
||||
return "how_are_you";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("hello", StringComparison.Ordinal) ||
|
||||
loweredTranscript.Contains("hi", StringComparison.Ordinal) ||
|
||||
loweredTranscript.Contains("hey", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "hello", "hi", "hey"))
|
||||
{
|
||||
return "hello";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("time", StringComparison.Ordinal))
|
||||
if (isYesNoTurn && MatchesAny(loweredTranscript, "yes", "yeah", "yup", "sure", "uh huh"))
|
||||
{
|
||||
return "yes";
|
||||
}
|
||||
|
||||
if (isYesNoTurn && MatchesAny(loweredTranscript, "no", "nope", "nah"))
|
||||
{
|
||||
return "no";
|
||||
}
|
||||
|
||||
if (MatchesAny(loweredTranscript, "what time is it", "current time", "the time", "time is it") ||
|
||||
loweredTranscript.Contains("time", StringComparison.Ordinal))
|
||||
{
|
||||
return "time";
|
||||
}
|
||||
|
||||
if (loweredTranscript.Contains("date", StringComparison.Ordinal) || loweredTranscript.Contains("day", StringComparison.Ordinal))
|
||||
if (MatchesAny(loweredTranscript, "what day is it", "what is the date", "today s date", "today's date") ||
|
||||
loweredTranscript.Contains("date", StringComparison.Ordinal) ||
|
||||
loweredTranscript.Contains("day", StringComparison.Ordinal))
|
||||
{
|
||||
return "date";
|
||||
}
|
||||
|
||||
return "chat";
|
||||
}
|
||||
|
||||
/// <summary>
/// Reports whether the turn is constrained to a yes/no style answer.
/// </summary>
/// <param name="turn">Turn whose "listenRules" and "clientRules" attributes are inspected.</param>
/// <returns>
/// <c>true</c> when any rule from either attribute equals "$YESNO" or
/// "create/is_it_a_keeper" (case-insensitive); otherwise <c>false</c>.
/// </returns>
private static bool IsYesNoTurn(TurnContext turn)
{
    // Both rule lists are read up front, then scanned in listen-then-client order.
    var listenRules = ReadRules(turn, "listenRules");
    var clientRules = ReadRules(turn, "clientRules");

    foreach (var rule in listenRules.Concat(clientRules))
    {
        if (string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Reads a rule list stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name, e.g. "listenRules" or "clientRules".</param>
/// <returns>
/// The stored string sequence, or the string elements of a JSON array; an empty
/// sequence when the attribute is missing, null, or of any other shape.
/// </returns>
private static IEnumerable<string> ReadRules(TurnContext turn, string key)
{
    if (turn.Attributes.TryGetValue(key, out var raw) && raw is not null)
    {
        switch (raw)
        {
            // Covers IReadOnlyList<string> as well, since it is itself an IEnumerable<string>.
            case IEnumerable<string> strings:
                return strings;

            // Non-string JSON elements are skipped rather than stringified.
            case JsonElement { ValueKind: JsonValueKind.Array } array:
                return array.EnumerateArray()
                    .Where(static element => element.ValueKind == JsonValueKind.String)
                    .Select(static element => element.GetString() ?? string.Empty);
        }
    }

    return [];
}
|
||||
|
||||
/// <summary>
/// Tests whether the transcript contains at least one of the candidate phrases.
/// </summary>
/// <param name="loweredTranscript">Transcript text; compared ordinally, so callers pass it already lower-cased.</param>
/// <param name="candidates">Phrases to look for.</param>
/// <returns><c>true</c> if any candidate occurs as a substring; <c>false</c> otherwise (including when no candidates are given).</returns>
/// <remarks>
/// Matching is plain substring containment, not word matching: a candidate such as "hi"
/// also matches inside "this".
/// </remarks>
private static bool MatchesAny(string loweredTranscript, params string[] candidates)
{
    return Array.Exists(
        candidates,
        phrase => loweredTranscript.Contains(phrase, StringComparison.Ordinal));
}
|
||||
}
|
||||
|
||||
public sealed record JiboInteractionDecision(
|
||||
|
||||
@@ -17,13 +17,20 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
||||
var transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty;
|
||||
var clientIntent = ReadAttribute(turn, "clientIntent");
|
||||
var rules = ReadRules(turn, messageType);
|
||||
var yesNoCreateRule = ReadYesNoCreateRule(turn);
|
||||
var isYesNoTurn = !string.IsNullOrWhiteSpace(yesNoCreateRule);
|
||||
var isYesNoIntent = string.Equals(plan.IntentName, "yes", StringComparison.OrdinalIgnoreCase) ||
|
||||
string.Equals(plan.IntentName, "no", StringComparison.OrdinalIgnoreCase);
|
||||
var outboundIntent = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
|
||||
? clientIntent
|
||||
: plan.IntentName ?? "unknown";
|
||||
var outboundAsrText = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
|
||||
? clientIntent
|
||||
: transcript;
|
||||
var entities = ReadEntities(turn, messageType);
|
||||
var outboundAsrText = isYesNoTurn && isYesNoIntent
|
||||
? transcript
|
||||
: string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
|
||||
? clientIntent
|
||||
: transcript;
|
||||
var outboundRules = isYesNoTurn && isYesNoIntent ? [yesNoCreateRule!] : rules;
|
||||
var entities = ReadEntities(turn, messageType, isYesNoTurn && isYesNoIntent);
|
||||
var messages = new List<SocketReplyPlan>
|
||||
{
|
||||
new(JsonSerializer.Serialize(new
|
||||
@@ -42,13 +49,13 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
||||
{
|
||||
confidence = 0.95,
|
||||
intent = outboundIntent,
|
||||
rules,
|
||||
rules = outboundRules,
|
||||
entities
|
||||
},
|
||||
match = new
|
||||
{
|
||||
intent = outboundIntent,
|
||||
rule = rules.FirstOrDefault() ?? string.Empty,
|
||||
rule = outboundRules.FirstOrDefault() ?? string.Empty,
|
||||
score = 0.95
|
||||
}
|
||||
}
|
||||
@@ -135,8 +142,16 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
||||
};
|
||||
}
|
||||
|
||||
private static object ReadEntities(TurnContext turn, string? messageType)
|
||||
private static object ReadEntities(TurnContext turn, string? messageType, bool yesNoCreateTurn)
|
||||
{
|
||||
if (yesNoCreateTurn)
|
||||
{
|
||||
return new Dictionary<string, object?>
|
||||
{
|
||||
["domain"] = "create"
|
||||
};
|
||||
}
|
||||
|
||||
if (!string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return new Dictionary<string, object?>();
|
||||
@@ -155,6 +170,35 @@ public sealed class ResponsePlanToSocketMessagesMapper
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Finds the create-flow keeper rule among the turn's rule values.
/// </summary>
/// <param name="turn">Turn whose rule attributes are searched.</param>
/// <returns>
/// The first rule equal to "create/is_it_a_keeper" (case-insensitive), exactly as it
/// was stored; <c>null</c> when no such rule is present.
/// </returns>
private static string? ReadYesNoCreateRule(TurnContext turn)
{
    foreach (var candidate in ReadRuleValues(turn))
    {
        if (string.Equals(candidate, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase))
        {
            return candidate;
        }
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Returns every rule attached to the turn: the "listenRules" values followed by the
/// "clientRules" values.
/// </summary>
/// <param name="turn">Turn whose rule attributes are read.</param>
private static IEnumerable<string> ReadRuleValues(TurnContext turn)
{
    var listenRules = ReadRuleValues(turn, "listenRules");
    var clientRules = ReadRuleValues(turn, "clientRules");
    return listenRules.Concat(clientRules);
}
|
||||
|
||||
/// <summary>
/// Reads the rule list stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name to look up.</param>
/// <returns>
/// The stored string sequence, or the string elements of a JSON array; an empty
/// sequence when the attribute is absent, null, or has any other shape.
/// </returns>
private static IEnumerable<string> ReadRuleValues(TurnContext turn, string key)
{
    if (!turn.Attributes.TryGetValue(key, out var stored) || stored is null)
    {
        return [];
    }

    // Any string sequence (including IReadOnlyList<string>) is passed through untouched.
    if (stored is IEnumerable<string> sequence)
    {
        return sequence;
    }

    // JSON arrays contribute only their string elements.
    if (stored is JsonElement { ValueKind: JsonValueKind.Array } jsonArray)
    {
        return jsonArray.EnumerateArray()
            .Where(static entry => entry.ValueKind == JsonValueKind.String)
            .Select(static entry => entry.GetString() ?? string.Empty);
    }

    return [];
}
|
||||
|
||||
private static string? ReadAttribute(TurnContext turn, string key)
|
||||
{
|
||||
return turn.Attributes.TryGetValue(key, out var value)
|
||||
|
||||
@@ -2,6 +2,7 @@ using System.Text.Json;
|
||||
using Jibo.Cloud.Application.Abstractions;
|
||||
using Jibo.Cloud.Domain.Models;
|
||||
using Jibo.Runtime.Abstractions;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Jibo.Cloud.Application.Services;
|
||||
|
||||
@@ -302,6 +303,32 @@ public sealed class WebSocketTurnFinalizationService(
|
||||
{
|
||||
var turn = ProtocolToTurnContextMapper.MapListenMessage(envelope, session, messageType);
|
||||
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
||||
if (!IsTranscriptUsable(finalizedTurn))
|
||||
{
|
||||
finalizedTurn = new TurnContext
|
||||
{
|
||||
TurnId = finalizedTurn.TurnId,
|
||||
SessionId = finalizedTurn.SessionId,
|
||||
TimestampUtc = finalizedTurn.TimestampUtc,
|
||||
InputMode = finalizedTurn.InputMode,
|
||||
SourceKind = finalizedTurn.SourceKind,
|
||||
WakePhrase = finalizedTurn.WakePhrase,
|
||||
RawTranscript = null,
|
||||
NormalizedTranscript = null,
|
||||
DeviceId = finalizedTurn.DeviceId,
|
||||
HostName = finalizedTurn.HostName,
|
||||
RequestId = finalizedTurn.RequestId,
|
||||
ProtocolService = finalizedTurn.ProtocolService,
|
||||
ProtocolOperation = finalizedTurn.ProtocolOperation,
|
||||
FirmwareVersion = finalizedTurn.FirmwareVersion,
|
||||
ApplicationVersion = finalizedTurn.ApplicationVersion,
|
||||
Locale = finalizedTurn.Locale,
|
||||
TimeZone = finalizedTurn.TimeZone,
|
||||
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
|
||||
Attributes = finalizedTurn.Attributes
|
||||
};
|
||||
}
|
||||
|
||||
var turnState = session.TurnState;
|
||||
if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) &&
|
||||
string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript))
|
||||
@@ -460,4 +487,63 @@ public sealed class WebSocketTurnFinalizationService(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Decides whether the turn's transcript is substantial enough to act on.
/// </summary>
/// <param name="turn">Turn providing the normalized (or, failing that, raw) transcript.</param>
/// <returns>
/// <c>false</c> for a blank transcript; <c>true</c> when the normalized text has six or
/// more characters; for shorter text, <c>true</c> only for a known keyword, or for a
/// yes/no word when the turn is a yes/no turn.
/// </returns>
private static bool IsTranscriptUsable(TurnContext turn)
{
    var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
    if (string.IsNullOrWhiteSpace(normalized))
    {
        return false;
    }

    if (normalized.Length >= 6)
    {
        return true;
    }

    // Evaluated before the word check, matching the original evaluation order.
    var yesNoTurn = IsYesNoTurn(turn);

    return normalized switch
    {
        // Affirmations/negations only count when the turn is constrained to yes/no.
        "yes" or "no" or "sure" or "nope" or "yup" or "uh huh" or "yeah" or "nah" => yesNoTurn,
        "joke" or "dance" or "time" or "date" or "today" or "day" or "hello" or "hi" or "hey" => true,
        _ => false
    };
}
|
||||
|
||||
/// <summary>
/// Reports whether the turn is constrained to a yes/no style answer.
/// </summary>
/// <param name="turn">Turn whose "listenRules" and "clientRules" attributes are inspected.</param>
/// <returns>
/// <c>true</c> when any rule from either attribute equals "$YESNO" or
/// "create/is_it_a_keeper" (case-insensitive); otherwise <c>false</c>.
/// </returns>
private static bool IsYesNoTurn(TurnContext turn)
{
    var allRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules"));
    return allRules.Any(IsYesNoRule);

    // A rule marks a yes/no turn when it is either of the two known constraint names.
    static bool IsYesNoRule(string rule) =>
        string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Reads a rule list stored under <paramref name="key"/> in the turn attributes.
/// </summary>
/// <param name="turn">Turn whose attribute bag is queried.</param>
/// <param name="key">Attribute name, e.g. "listenRules" or "clientRules".</param>
/// <returns>
/// The stored string sequence, or the string elements of a JSON array; an empty
/// sequence when the attribute is missing, null, or of any other shape.
/// </returns>
private static IEnumerable<string> ReadRules(TurnContext turn, string key)
{
    if (!turn.Attributes.TryGetValue(key, out var stored) || stored is null)
    {
        return [];
    }

    // IReadOnlyList<string> is also an IEnumerable<string>, so one check covers both.
    if (stored is IEnumerable<string> sequence)
    {
        return sequence;
    }

    if (stored is JsonElement { ValueKind: JsonValueKind.Array } jsonArray)
    {
        // Only string-valued array elements are treated as rules.
        return jsonArray.EnumerateArray()
            .Where(static entry => entry.ValueKind == JsonValueKind.String)
            .Select(static entry => entry.GetString() ?? string.Empty);
    }

    return [];
}
|
||||
|
||||
/// <summary>
/// Produces a comparison-friendly form of a transcript: lower-cased, punctuation
/// replaced by spaces, every run of whitespace collapsed to a single space, and
/// leading/trailing whitespace removed.
/// </summary>
/// <param name="transcript">Transcript text; may be null or blank.</param>
/// <returns>The normalized text, or <see cref="string.Empty"/> for null/blank input.</returns>
private static string NormalizeTranscript(string? transcript)
{
    if (string.IsNullOrWhiteSpace(transcript))
    {
        return string.Empty;
    }

    // Anything that is neither a word character nor whitespace becomes a space.
    var withoutPunctuation = Regex.Replace(transcript.ToLowerInvariant(), @"[^\w\s]", " ");

    // Punctuation replacement can leave runs of spaces ("uh... huh" -> "uh    huh").
    // Collapse every whitespace run so short-phrase comparisons such as "uh huh" match.
    return Regex.Replace(withoutPunctuation, @"\s+", " ").Trim();
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
|
||||
/// <summary>
/// Indicates whether this strategy can process the given turn.
/// </summary>
/// <param name="turn">Turn whose buffered audio frames are inspected.</param>
/// <returns>
/// <c>true</c> only when local whisper.cpp is enabled, the ffmpeg, whisper CLI and
/// whisper model paths all pass <c>IsConfiguredPathAvailable</c>, and the turn has at
/// least one buffered audio frame.
/// </returns>
public bool CanHandle(TurnContext turn)
{
    if (!options.EnableLocalWhisperCpp)
    {
        return false;
    }

    // IsConfiguredPathAvailable already rejects null/blank paths, so no separate
    // blank checks are needed. Only the CLI and model paths are verified on disk.
    var toolchainConfigured =
        IsConfiguredPathAvailable(options.FfmpegPath, checkFileExists: false) &&
        IsConfiguredPathAvailable(options.WhisperCliPath, checkFileExists: true) &&
        IsConfiguredPathAvailable(options.WhisperModelPath, checkFileExists: true);

    return toolchainConfigured && ReadBufferedAudioFrames(turn).Count > 0;
}
|
||||
|
||||
@@ -148,4 +148,19 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
|
||||
// Best-effort cleanup only.
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks whether a configured tool or model path looks usable.
/// </summary>
/// <param name="path">Configured path; may be null or blank.</param>
/// <param name="checkFileExists">When <c>true</c>, a rooted path must point at an existing file.</param>
/// <returns>
/// <c>false</c> for a null/blank path; <c>true</c> for any non-rooted path (no disk
/// check is made for it); for a rooted path, the result of <see cref="File.Exists(string)"/>
/// when <paramref name="checkFileExists"/> is set, otherwise <c>true</c>.
/// </returns>
private static bool IsConfiguredPathAvailable(string? path, bool checkFileExists)
{
    if (string.IsNullOrWhiteSpace(path))
    {
        return false;
    }

    // Only rooted paths are ever verified on disk; relative names are accepted as-is.
    var mustExistOnDisk = Path.IsPathRooted(path) && checkFileExists;
    return !mustExistOnDisk || File.Exists(path);
}
|
||||
}
|
||||
|
||||
@@ -31,8 +31,8 @@ public static class ServiceCollectionExtensions
|
||||
services.AddSingleton<JiboInteractionService>();
|
||||
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
|
||||
services.AddSingleton<IExternalProcessRunner, ExternalProcessRunner>();
|
||||
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();
|
||||
services.AddSingleton<ISttStrategy, SyntheticBufferedAudioSttStrategy>();
|
||||
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();
|
||||
services.AddSingleton<ISttStrategySelector, DefaultSttStrategySelector>();
|
||||
services.AddSingleton<IWebSocketTelemetrySink, FileWebSocketTelemetrySink>();
|
||||
services.AddSingleton<IProtocolTelemetrySink, FileProtocolTelemetrySink>();
|
||||
|
||||
Reference in New Issue
Block a user