From 83a9350a9d92b8e56285a9b0c7ce8f3f00729fdd Mon Sep 17 00:00:00 2001 From: Jacob Dubin Date: Sat, 18 Apr 2026 16:29:27 -0500 Subject: [PATCH] enhanced skill and yes/no routing --- OpenJibo/docs/development-plan.md | 21 +++++ OpenJibo/docs/live-jibo-test-runbook.md | 17 ++++ .../prompts/cloud-deploy-and-jibo-rcm-path.md | 54 ++++++++++++ OpenJibo/docs/prompts/stt-upgrade-path.md | 47 ++++++++++ OpenJibo/src/Jibo.Cloud/dotnet/README.md | 10 +++ .../Services/JiboInteractionService.cs | 80 +++++++++++++---- .../ResponsePlanToSocketMessagesMapper.cs | 58 +++++++++++-- .../WebSocketTurnFinalizationService.cs | 86 +++++++++++++++++++ ...LocalWhisperCppBufferedAudioSttStrategy.cs | 21 ++++- .../ServiceCollectionExtensions.cs | 2 +- .../WebSockets/JiboInteractionServiceTests.cs | 33 +++++++ .../WebSockets/JiboWebSocketServiceTests.cs | 31 +++++++ ...WhisperCppBufferedAudioSttStrategyTests.cs | 24 ++++++ 13 files changed, 455 insertions(+), 29 deletions(-) create mode 100644 OpenJibo/docs/prompts/cloud-deploy-and-jibo-rcm-path.md create mode 100644 OpenJibo/docs/prompts/stt-upgrade-path.md diff --git a/OpenJibo/docs/development-plan.md b/OpenJibo/docs/development-plan.md index 6062a69..f2db553 100644 --- a/OpenJibo/docs/development-plan.md +++ b/OpenJibo/docs/development-plan.md @@ -69,6 +69,27 @@ Near-term ASR work should stay staged: That keeps Node as the reverse-engineering oracle while letting the long-term `.NET` cloud gain real STT seams without pretending they are finished. +## Latest Capture Findings + +The latest live test round tightened up three priorities: + +- yes/no turns need explicit constrained follow-up handling instead of generic chat routing +- skill invocation still depends too much on narrow phrase matching and is vulnerable to STT drift +- local buffered-audio STT in `.NET` is useful for discovery, but it is not yet stable enough to be the default live-test assumption + +Evidence from the latest `2026-04-18` captures: + +- several buffered-audio turns never produced a usable transcript because the local `whisper.cpp` path was missing or the temporary normalized Ogg file was rejected by `ffmpeg` +- some recognized phrases fell into placeholder provider replies because the intent was recognized but the feature path behind it is still a stub +- short yes/no responses need the same session-aware treatment already prototyped in Node, especially for create-flow style follow-ups + +Near-term interaction work should now prioritize: + +1. preserve and interpret yes/no turn constraints from observed listen rules +2. broaden phrase-to-intent matching for the small set of known working skills before moving to larger NLU ambitions +3. keep synthetic transcript hints as the most reliable parity path when captures already provide them +4. continue evaluating whether local preprocessing is worth further investment or whether managed STT should replace it for the next serious testing phase + ## Working Cloud Framework The current evidence in captures, fixtures, and Node behavior supports three main cloud interaction paths: diff --git a/OpenJibo/docs/live-jibo-test-runbook.md b/OpenJibo/docs/live-jibo-test-runbook.md index 3e66532..4287e52 100644 --- a/OpenJibo/docs/live-jibo-test-runbook.md +++ b/OpenJibo/docs/live-jibo-test-runbook.md @@ -130,6 +130,23 @@ python3 ./scripts/cloud/import-websocket-capture-fixture.py \ - whether EOS timing matched expectations - whether any unexpected message families appeared +## Latest Test Notes To Carry Forward + +The most recent live round showed that startup and some Q-and-A paths are progressing, but audio-turn reliability is still uneven. + +Carry these expectations into the next run: + +- constrained yes/no replies should be tested intentionally because they need special handling and are easy to miss if STT drifts +- phrases intended to trigger known skills should be repeated using a small, documented wording set so we can separate routing issues from Whisper errors +- provider-backed placeholder answers are still expected for weather, commute, calendar, news, and similar routes unless that feature path is explicitly implemented + +For STT during live testing: + +- prefer runs where `audioTranscriptHint` or other synthetic replay cues are available +- do not assume local `whisper.cpp` success means the audio pipeline is stable overall +- if many turns stay pending or `ffmpeg` rejects normalized Ogg files, treat that as a speech-pipeline issue first, not an intent-mapping issue +- keep the Node server available as the comparison path for yes/no and audio-preprocessing behavior + ## What To Do If The Test Fails If the robot does not connect or the first turn fails: diff --git a/OpenJibo/docs/prompts/cloud-deploy-and-jibo-rcm-path.md b/OpenJibo/docs/prompts/cloud-deploy-and-jibo-rcm-path.md new file mode 100644 index 0000000..96489e5 --- /dev/null +++ b/OpenJibo/docs/prompts/cloud-deploy-and-jibo-rcm-path.md @@ -0,0 +1,54 @@ +# Cloud Deploy And Jibo RCM Path Prompt + +Prepare OpenJibo for a lightweight v1 cloud deployment and the cleanest practical Jibo configuration path for group testing. + +Current repo context: + +- workspace root: `C:\Projects\JiboExperiments\OpenJibo` +- the current `.NET` cloud is the target runtime +- the Node server remains a discovery oracle and fallback +- latest live-test guidance is in: + - `docs/live-jibo-test-runbook.md` + - `docs/live-jibo-capture.md` + - `docs/device-bootstrap.md` + - `docs/development-plan.md` + - `src/Jibo.Cloud/dotnet/README.md` + +What we need from this workstream: + +1. define the smallest, cleanest, easiest-to-repeat deployment path for a v1 hosted OpenJibo cloud +2. define the lightest reliable way to configure Jibo devices to use that cloud, with as few manual error-prone steps as possible +3. produce scripts and docs that make it realistic for additional revival-group testers to get connected quickly + +Important goals: + +- prefer a path that is easy for non-experts in the revival group to follow +- minimize hand-edited device changes and confusing setup steps +- preserve a clear fallback path when a deployment or routing change fails +- keep the deployment practical for a small testing cohort first; enterprise polish can come later + +Areas to review: + +- current API host and routing logic in `src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Api/Program.cs` +- existing scripts under: + - `scripts/cloud/` + - `scripts/bootstrap/` +- docs around routing and bootstrap in: + - `docs/device-bootstrap.md` + - `docs/live-jibo-test-runbook.md` + - `docs/live-jibo-capture.md` + +Deliverables: + +- a concrete v1 deployment recommendation +- any needed deployment scripts or setup helpers +- a clean Jibo configuration / routing / RCM procedure with the fewest practical steps +- validation steps that clearly distinguish cloud issues from robot/network issues +- doc updates aimed at making group adoption fast and low-risk + +Constraints: + +- do not over-design for full production scale yet +- avoid adding multiple competing deployment paths unless there is a strong reason +- optimize for reliability, repeatability, and low support burden for the next round of testers +- keep the Node oracle available as a troubleshooting fallback until `.NET` parity is clearly strong enough diff --git a/OpenJibo/docs/prompts/stt-upgrade-path.md b/OpenJibo/docs/prompts/stt-upgrade-path.md new file mode 100644 index 0000000..0b04835 --- /dev/null +++ b/OpenJibo/docs/prompts/stt-upgrade-path.md @@ -0,0 +1,47 @@ +# STT Upgrade Path Prompt + +Improve the OpenJibo `.NET` speech-to-text path for live robot testing. + +Current repo context: + +- workspace root: `C:\Projects\JiboExperiments\OpenJibo` +- current live captures from `2026-04-18` showed that some turns succeeded, but many buffered-audio turns failed before producing a usable transcript +- the current local `.NET` STT path is in: + - `src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs` + - `src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/OggOpusAudioNormalizer.cs` + - `src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs` + - `src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/DefaultSttStrategySelector.cs` +- Node remains the oracle for current behavior: + - `src/Jibo.Cloud/node/open-jibo-link.js` +- live test evidence and guidance are documented in: + - `docs/development-plan.md` + - `docs/live-jibo-test-runbook.md` + - `src/Jibo.Cloud/dotnet/README.md` + +Observed problems to ground the work: + +- one captured run could not find `whisper-cli` at the configured rooted path +- many buffered-audio turns failed because `ffmpeg` rejected the normalized Ogg output +- we need a more reliable path for testing than the current partially working local whisper chain + +Goals: + +1. review the current `.NET` STT seam and compare it against the Node preprocessing flow +2. recommend and implement the best next STT path for testing, preferring reliability and simplicity over novelty +3. keep the STT integration behind the existing abstractions so we can swap providers later +4. preserve or improve telemetry so failed turns clearly show whether the problem is decode, tool lookup, provider failure, or unusable transcript quality +5. update tests and docs to match the chosen direction + +Constraints: + +- do not remove the synthetic transcript-hint path; it is still valuable for fixture replay and parity +- do not assume Azure-hosted STT is automatically the answer unless the codebase and testing needs support that choice +- prefer an implementation that is easy for other revival-group testers to run consistently +- avoid large speculative architecture changes that are not needed for a near-term v1 testable cloud + +Deliverables: + +- code changes for the improved STT path +- tests covering strategy selection, success, and failure handling +- doc updates with exact setup guidance and a recommendation on whether local whisper remains optional, fallback-only, or deprecated for testing +- a short summary of the tradeoffs and why the chosen path is the best next step diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/README.md b/OpenJibo/src/Jibo.Cloud/dotnet/README.md index 542a694..e15d0c0 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/README.md +++ b/OpenJibo/src/Jibo.Cloud/dotnet/README.md @@ -108,6 +108,9 @@ Current raw-audio behavior is still a compatibility bridge: - if buffered audio has a synthetic transcript hint, the server now auto-finalizes the turn and emits `LISTEN` + `EOS` + `SKILL_ACTION` - if buffered audio crosses the finalize threshold without a usable transcript, the server now emits a Node-style fallback completion with `EOS` instead of hanging the turn forever - this is intentionally not a claim of real ASR parity +- follow-up turns now preserve enough constraint state to distinguish yes/no-style replies from ordinary free-form chat +- create-flow yes/no turns now preserve `create/is_it_a_keeper` and `domain=create` in the outbound synthetic `LISTEN` payload +- phrase matching has been widened slightly for known test prompts such as joke, dance, surprise, weather, calendar, commute, and news variants ## Buffered Audio STT @@ -138,6 +141,13 @@ Configuration lives under `OpenJibo:Stt`: This is not yet a claim of production-ready onboard ASR. It is a `.NET` discovery seam that keeps us compatible with the Node oracle while we evaluate longer-term options such as Azure-hosted STT or a managed decode/transcribe stack. +Latest live-capture guidance after the `2026-04-18` round: + +- prefer synthetic transcript hints when they are present in the observed turn +- only use local `whisper.cpp` when the configured tool paths are real and the decode chain is behaving +- treat `ffmpeg` decode failures on normalized Ogg captures as evidence that the local audio path still needs more hardening before it can be the default live-test expectation +- keep the Node implementation as the oracle for yes/no turn semantics and audio preprocessing details until the `.NET` port catches up + ## Current Interaction Paths The working cloud model currently looks like three main paths: diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs index 2bab65d..ea42c6e 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs @@ -1,5 +1,6 @@ using Jibo.Cloud.Application.Abstractions; using Jibo.Runtime.Abstractions; +using System.Text.Json; namespace Jibo.Cloud.Application.Services; @@ -15,8 +16,9 @@ public sealed class JiboInteractionService( var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent) ? rawClientIntent?.ToString() : null; + var isYesNoTurn = IsYesNoTurn(turn); - var semanticIntent = ResolveSemanticIntent(lowered, clientIntent); + var semanticIntent = ResolveSemanticIntent(lowered, clientIntent, isYesNoTurn); return semanticIntent switch { "joke" => BuildJokeDecision(catalog), @@ -25,6 +27,8 @@ public sealed class JiboInteractionService( "date" => new JiboInteractionDecision("date", $"Today is {DateTime.Now:dddd, MMMM d}."), "hello" => new JiboInteractionDecision("hello", randomizer.Choose(catalog.GreetingReplies)), "how_are_you" => new JiboInteractionDecision("how_are_you", randomizer.Choose(catalog.HowAreYouReplies)), + "yes" => new JiboInteractionDecision("yes", "Yes."), + "no" => new JiboInteractionDecision("no", "No."), "surprise" => new JiboInteractionDecision("surprise", randomizer.Choose(catalog.SurpriseReplies)), "personal_report" => new JiboInteractionDecision("personal_report", randomizer.Choose(catalog.PersonalReportReplies)), "weather" => new JiboInteractionDecision("weather", randomizer.Choose(catalog.WeatherReplies)), @@ -86,7 +90,7 @@ public sealed class JiboInteractionService( .Replace("{transcript}", transcript, StringComparison.Ordinal); } - private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent) + private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent, bool isYesNoTurn) { if (string.Equals(clientIntent, "askForTime", StringComparison.OrdinalIgnoreCase)) { @@ -98,72 +102,112 @@ public sealed class JiboInteractionService( return "date"; } - if (loweredTranscript.Contains("joke", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "joke", "funny", "make me laugh")) { return "joke"; } - if (loweredTranscript.Contains("dance", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "dance", "boogie")) { return "dance"; } - if (loweredTranscript.Contains("surprise", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "surprise", "surprise me", "show me something fun")) { return "surprise"; } - if (loweredTranscript.Contains("personal report", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "personal report", "my report", "daily report", "my update")) { return "personal_report"; } - if (loweredTranscript.Contains("weather", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "weather", "forecast", "weather report", "is it raining")) { return "weather"; } - if (loweredTranscript.Contains("calendar", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "calendar", "schedule", "what's on my calendar", "what is on my calendar")) { return "calendar"; } - if (loweredTranscript.Contains("commute", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "commute", "traffic", "drive to work", "how long to work")) { return "commute"; } - if (loweredTranscript.Contains("news", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "news", "headlines", "news update", "tell me the news")) { return "news"; } - if (loweredTranscript.Contains("how are you", StringComparison.Ordinal) || - loweredTranscript.Contains("what's up", StringComparison.Ordinal) || - loweredTranscript.Contains("what s up", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "how are you", "what's up", "what s up", "what up")) { return "how_are_you"; } - if (loweredTranscript.Contains("hello", StringComparison.Ordinal) || - loweredTranscript.Contains("hi", StringComparison.Ordinal) || - loweredTranscript.Contains("hey", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "hello", "hi", "hey")) { return "hello"; } - if (loweredTranscript.Contains("time", StringComparison.Ordinal)) + if (isYesNoTurn && MatchesAny(loweredTranscript, "yes", "yeah", "yup", "sure", "uh huh")) + { + return "yes"; + } + + if (isYesNoTurn && MatchesAny(loweredTranscript, "no", "nope", "nah")) + { + return "no"; + } + + if (MatchesAny(loweredTranscript, "what time is it", "current time", "the time", "time is it") || + loweredTranscript.Contains("time", StringComparison.Ordinal)) { return "time"; } - if (loweredTranscript.Contains("date", StringComparison.Ordinal) || loweredTranscript.Contains("day", StringComparison.Ordinal)) + if (MatchesAny(loweredTranscript, "what day is it", "what is the date", "today s date", "today's date") || + loweredTranscript.Contains("date", StringComparison.Ordinal) || + loweredTranscript.Contains("day", StringComparison.Ordinal)) { return "date"; } return "chat"; } + + private static bool IsYesNoTurn(TurnContext turn) + { + return ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")) + .Any(static rule => + string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase) || + string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase)); + } + + private static IEnumerable ReadRules(TurnContext turn, string key) + { + if (!turn.Attributes.TryGetValue(key, out var value) || value is null) + { + return []; + } + + return value switch + { + IReadOnlyList typed => typed, + IEnumerable strings => strings, + JsonElement { ValueKind: JsonValueKind.Array } json => json.EnumerateArray() + .Where(static item => item.ValueKind == JsonValueKind.String) + .Select(static item => item.GetString() ?? string.Empty), + _ => [] + }; + } + + private static bool MatchesAny(string loweredTranscript, params string[] candidates) + { + return candidates.Any(candidate => loweredTranscript.Contains(candidate, StringComparison.Ordinal)); + } } public sealed record JiboInteractionDecision( diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ResponsePlanToSocketMessagesMapper.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ResponsePlanToSocketMessagesMapper.cs index 20d2097..4b7949a 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ResponsePlanToSocketMessagesMapper.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/ResponsePlanToSocketMessagesMapper.cs @@ -17,13 +17,20 @@ public sealed class ResponsePlanToSocketMessagesMapper var transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty; var clientIntent = ReadAttribute(turn, "clientIntent"); var rules = ReadRules(turn, messageType); + var yesNoCreateRule = ReadYesNoCreateRule(turn); + var isYesNoTurn = !string.IsNullOrWhiteSpace(yesNoCreateRule); + var isYesNoIntent = string.Equals(plan.IntentName, "yes", StringComparison.OrdinalIgnoreCase) || + string.Equals(plan.IntentName, "no", StringComparison.OrdinalIgnoreCase); var outboundIntent = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent) ? clientIntent : plan.IntentName ?? "unknown"; - var outboundAsrText = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent) - ? clientIntent - : transcript; - var entities = ReadEntities(turn, messageType); + var outboundAsrText = isYesNoTurn && isYesNoIntent + ? transcript + : string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent) + ? clientIntent + : transcript; + var outboundRules = isYesNoTurn && isYesNoIntent ? [yesNoCreateRule!] : rules; + var entities = ReadEntities(turn, messageType, isYesNoTurn && isYesNoIntent); var messages = new List { new(JsonSerializer.Serialize(new @@ -42,13 +49,13 @@ public sealed class ResponsePlanToSocketMessagesMapper { confidence = 0.95, intent = outboundIntent, - rules, + rules = outboundRules, entities }, match = new { intent = outboundIntent, - rule = rules.FirstOrDefault() ?? string.Empty, + rule = outboundRules.FirstOrDefault() ?? string.Empty, score = 0.95 } } @@ -135,8 +142,16 @@ public sealed class ResponsePlanToSocketMessagesMapper }; } - private static object ReadEntities(TurnContext turn, string? messageType) + private static object ReadEntities(TurnContext turn, string? messageType, bool yesNoCreateTurn) { + if (yesNoCreateTurn) + { + return new Dictionary + { + ["domain"] = "create" + }; + } + if (!string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase)) { return new Dictionary(); @@ -155,6 +170,35 @@ public sealed class ResponsePlanToSocketMessagesMapper }; } + private static string? ReadYesNoCreateRule(TurnContext turn) + { + return ReadRuleValues(turn) + .FirstOrDefault(static rule => string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase)); + } + + private static IEnumerable ReadRuleValues(TurnContext turn) + { + return ReadRuleValues(turn, "listenRules").Concat(ReadRuleValues(turn, "clientRules")); + } + + private static IEnumerable ReadRuleValues(TurnContext turn, string key) + { + if (!turn.Attributes.TryGetValue(key, out var value) || value is null) + { + return []; + } + + return value switch + { + IReadOnlyList typedRules => typedRules, + IEnumerable rules => rules, + JsonElement { ValueKind: JsonValueKind.Array } jsonElement => jsonElement.EnumerateArray() + .Where(static item => item.ValueKind == JsonValueKind.String) + .Select(static item => item.GetString() ?? string.Empty), + _ => [] + }; + } + private static string? ReadAttribute(TurnContext turn, string key) { return turn.Attributes.TryGetValue(key, out var value) diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs index 5a9c177..ab6124d 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs @@ -2,6 +2,7 @@ using System.Text.Json; using Jibo.Cloud.Application.Abstractions; using Jibo.Cloud.Domain.Models; using Jibo.Runtime.Abstractions; +using System.Text.RegularExpressions; namespace Jibo.Cloud.Application.Services; @@ -302,6 +303,32 @@ public sealed class WebSocketTurnFinalizationService( { var turn = ProtocolToTurnContextMapper.MapListenMessage(envelope, session, messageType); var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken); + if (!IsTranscriptUsable(finalizedTurn)) + { + finalizedTurn = new TurnContext + { + TurnId = finalizedTurn.TurnId, + SessionId = finalizedTurn.SessionId, + TimestampUtc = finalizedTurn.TimestampUtc, + InputMode = finalizedTurn.InputMode, + SourceKind = finalizedTurn.SourceKind, + WakePhrase = finalizedTurn.WakePhrase, + RawTranscript = null, + NormalizedTranscript = null, + DeviceId = finalizedTurn.DeviceId, + HostName = finalizedTurn.HostName, + RequestId = finalizedTurn.RequestId, + ProtocolService = finalizedTurn.ProtocolService, + ProtocolOperation = finalizedTurn.ProtocolOperation, + FirmwareVersion = finalizedTurn.FirmwareVersion, + ApplicationVersion = finalizedTurn.ApplicationVersion, + Locale = finalizedTurn.Locale, + TimeZone = finalizedTurn.TimeZone, + IsFollowUpEligible = finalizedTurn.IsFollowUpEligible, + Attributes = finalizedTurn.Attributes + }; + } + var turnState = session.TurnState; if (string.IsNullOrWhiteSpace(finalizedTurn.NormalizedTranscript) && string.IsNullOrWhiteSpace(finalizedTurn.RawTranscript)) @@ -460,4 +487,63 @@ public sealed class WebSocketTurnFinalizationService( return false; } } + + private static bool IsTranscriptUsable(TurnContext turn) + { + var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); + if (string.IsNullOrWhiteSpace(transcript)) + { + return false; + } + + if (transcript.Length >= 6) + { + return true; + } + + if (IsYesNoTurn(turn) && transcript is "yes" or "no" or "sure" or "nope" or "yup" or "uh huh" or "yeah" or "nah") + { + return true; + } + + return transcript is "joke" or "dance" or "time" or "date" or "today" or "day" or "hello" or "hi" or "hey"; + } + + private static bool IsYesNoTurn(TurnContext turn) + { + return ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")) + .Any(static rule => + string.Equals(rule, "$YESNO", StringComparison.OrdinalIgnoreCase) || + string.Equals(rule, "create/is_it_a_keeper", StringComparison.OrdinalIgnoreCase)); + } + + private static IEnumerable ReadRules(TurnContext turn, string key) + { + if (!turn.Attributes.TryGetValue(key, out var value) || value is null) + { + return []; + } + + return value switch + { + IReadOnlyList typed => typed, + IEnumerable strings => strings, + JsonElement { ValueKind: JsonValueKind.Array } json => json.EnumerateArray() + .Where(static item => item.ValueKind == JsonValueKind.String) + .Select(static item => item.GetString() ?? string.Empty), + _ => [] + }; + } + + private static string NormalizeTranscript(string? transcript) + { + if (string.IsNullOrWhiteSpace(transcript)) + { + return string.Empty; + } + + return Regex.Replace(transcript.Trim().ToLowerInvariant(), @"[^\w\s]", " ") + .Replace(" ", " ", StringComparison.Ordinal) + .Trim(); + } } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs index dcfee7d..54e88c7 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs @@ -12,9 +12,9 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy( public bool CanHandle(TurnContext turn) { return options.EnableLocalWhisperCpp && - !string.IsNullOrWhiteSpace(options.FfmpegPath) && - !string.IsNullOrWhiteSpace(options.WhisperCliPath) && - !string.IsNullOrWhiteSpace(options.WhisperModelPath) && + IsConfiguredPathAvailable(options.FfmpegPath, checkFileExists: false) && + IsConfiguredPathAvailable(options.WhisperCliPath, checkFileExists: true) && + IsConfiguredPathAvailable(options.WhisperModelPath, checkFileExists: true) && ReadBufferedAudioFrames(turn).Count > 0; } @@ -148,4 +148,19 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy( // Best-effort cleanup only. } } + + private static bool IsConfiguredPathAvailable(string? path, bool checkFileExists) + { + if (string.IsNullOrWhiteSpace(path)) + { + return false; + } + + if (!Path.IsPathRooted(path)) + { + return true; + } + + return checkFileExists ? File.Exists(path) : true; + } } diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs index f87d4ea..eced6b4 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/DependencyInjection/ServiceCollectionExtensions.cs @@ -31,8 +31,8 @@ public static class ServiceCollectionExtensions services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); - services.AddSingleton(); services.AddSingleton(); + services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboInteractionServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboInteractionServiceTests.cs index eb9bb1c..d37b072 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboInteractionServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboInteractionServiceTests.cs @@ -56,6 +56,39 @@ public sealed class JiboInteractionServiceTests Assert.Contains("Today is", decision.ReplyText, StringComparison.Ordinal); } + [Fact] + public async Task BuildDecisionAsync_YesNoFollowUp_MapsShortAffirmationToYesIntent() + { + var service = CreateService(); + + var decision = await service.BuildDecisionAsync(new TurnContext + { + RawTranscript = "yeah", + NormalizedTranscript = "yeah", + Attributes = new Dictionary + { + ["listenRules"] = new[] { "create/is_it_a_keeper" } + } + }); + + Assert.Equal("yes", decision.IntentName); + Assert.Equal("Yes.", decision.ReplyText); + } + + [Fact] + public async Task BuildDecisionAsync_SkillPhraseVariant_MapsToKnownIntent() + { + var service = CreateService(); + + var decision = await service.BuildDecisionAsync(new TurnContext + { + RawTranscript = "make me laugh", + NormalizedTranscript = "make me laugh" + }); + + Assert.Equal("joke", decision.IntentName); + } + private static JiboInteractionService CreateService() { return new JiboInteractionService( diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs index fd263d3..9d7e154 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs @@ -342,6 +342,37 @@ public sealed class JiboWebSocketServiceTests Assert.Equal("clock/clock_menu", listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString()); } + [Fact] + public async Task ClientAsr_YesNoCreateFlow_PreservesCreateRuleAndDomain() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-yesno-token", + Text = """{"type":"LISTEN","transID":"trans-yesno","data":{"rules":["create/is_it_a_keeper","$YESNO"]}}""" + }); + + var replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-yesno-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-yesno","data":{"text":"yeah"}}""" + }); + + Assert.Equal(3, replies.Count); + + using var listenPayload = JsonDocument.Parse(replies[0].Text!); + Assert.Equal("yeah", listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString()); + Assert.Equal("yes", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + Assert.Equal("create/is_it_a_keeper", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("rules")[0].GetString()); + Assert.Equal("create", listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("entities").GetProperty("domain").GetString()); + Assert.Equal("create/is_it_a_keeper", listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString()); + } + [Fact] public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam() { diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs index c131bcf..6774e6e 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs @@ -29,6 +29,30 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests Assert.False(strategy.CanHandle(turn)); } + [Fact] + public void CanHandle_ReturnsFalse_WhenConfiguredAbsoluteWhisperPathIsMissing() + { + var strategy = new LocalWhisperCppBufferedAudioSttStrategy( + new BufferedAudioSttOptions + { + EnableLocalWhisperCpp = true, + FfmpegPath = "/usr/bin/ffmpeg", + WhisperCliPath = "/path/that/does/not/exist/whisper-cli", + WhisperModelPath = "/path/that/does/not/exist/model.bin" + }, + new FakeExternalProcessRunner()); + + var turn = new TurnContext + { + Attributes = new Dictionary + { + ["bufferedAudioFrames"] = new[] { BuildMinimalOggPage() } + } + }; + + Assert.False(strategy.CanHandle(turn)); + } + [Fact] public async Task TranscribeAsync_UsesFfmpegAndWhisperCpp_WhenConfigured() {