From d8949fcc9a719e2474323abc1892440eb0a72505 Mon Sep 17 00:00:00 2001 From: Jacob Dubin Date: Sun, 17 May 2026 11:18:57 -0500 Subject: [PATCH] Tighten STT noise filtering and preserve yes-no replies --- .../Services/JiboInteractionService.cs | 7 +- .../WebSocketTurnFinalizationService.cs | 102 ++++++++++++++++- .../WebSockets/JiboWebSocketServiceTests.cs | 104 +++++++++++++++++- 3 files changed, 206 insertions(+), 7 deletions(-) diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs index 9cd501d..6f89193 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/JiboInteractionService.cs @@ -3448,6 +3448,11 @@ public sealed class JiboInteractionService( { foreach (var acknowledgement in YesNoAcknowledgementPrefixes) { + if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) && + (string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) || + normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal))) + continue; + if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal)) { trimmedTranscript = string.Empty; @@ -5093,4 +5098,4 @@ public sealed record JiboInteractionDecision( string ReplyText, string? SkillName = null, IDictionary? SkillPayload = null, - IDictionary? ContextUpdates = null); \ No newline at end of file + IDictionary? ContextUpdates = null); diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs index 95ac547..6e4b147 100644 --- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs +++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/WebSocketTurnFinalizationService.cs @@ -88,6 +88,23 @@ public sealed partial class WebSocketTurnFinalizationService( "thank you" ]; + private static readonly string[] TranscriptNoisePrefixes = + [ + "uh", + "um", + "hmm", + "erm", + "er", + "ah", + "eh", + "mm", + "mmm", + "well", + "so", + "actually", + "honestly" + ]; + private static readonly HashSet YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal) { "yes", @@ -519,7 +536,7 @@ public sealed partial class WebSocketTurnFinalizationService( } var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken); - if (!IsTranscriptUsable(finalizedTurn)) + if (!TryGetUsableTranscript(finalizedTurn, out var usableTranscript)) finalizedTurn = new TurnContext { TurnId = finalizedTurn.TurnId, @@ -542,6 +559,10 @@ public sealed partial class WebSocketTurnFinalizationService( IsFollowUpEligible = finalizedTurn.IsFollowUpEligible, Attributes = finalizedTurn.Attributes }; + else if (string.Equals(messageType, "CLIENT_ASR", StringComparison.OrdinalIgnoreCase) && + !string.Equals(usableTranscript, finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript, + StringComparison.Ordinal)) + finalizedTurn = WithSanitizedTranscript(finalizedTurn, usableTranscript); if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript)) finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello"); @@ -1065,12 +1086,17 @@ public sealed partial class WebSocketTurnFinalizationService( } private static bool IsTranscriptUsable(TurnContext turn) + { + return TryGetUsableTranscript(turn, out _); + } + + private static bool TryGetUsableTranscript(TurnContext turn, out string transcript) { var messageType = ReadMessageType(turn); var clientIntent = ReadAttribute(turn, "clientIntent"); var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer"); var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey); - var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); + transcript = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray(); if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && @@ -1226,7 +1252,7 @@ public sealed partial class WebSocketTurnFinalizationService( { if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None; - var normalized = normalizedTranscript; + var normalized = NormalizeUsableTranscript(normalizedTranscript); while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed; if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None; @@ -1261,6 +1287,11 @@ public sealed partial class WebSocketTurnFinalizationService( { foreach (var acknowledgement in YesNoAcknowledgementPrefixes) { + if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) && + (string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) || + normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal))) + continue; + if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal)) { trimmedTranscript = string.Empty; @@ -1278,6 +1309,41 @@ public sealed partial class WebSocketTurnFinalizationService( return false; } + private static string NormalizeUsableTranscript(string? transcript) + { + var normalized = NormalizeTranscript(transcript); + if (string.IsNullOrWhiteSpace(normalized)) return string.Empty; + + while (TryTrimLeadingTranscriptNoise(normalized, out var trimmed)) normalized = trimmed; + return normalized; + } + + private static bool TryTrimLeadingTranscriptNoise(string normalizedTranscript, out string trimmedTranscript) + { + foreach (var noisePrefix in TranscriptNoisePrefixes) + { + if (string.Equals(noisePrefix, "uh", StringComparison.Ordinal) && + (string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) || + normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal))) + continue; + + if (string.Equals(normalizedTranscript, noisePrefix, StringComparison.Ordinal)) + { + trimmedTranscript = string.Empty; + return true; + } + + if (normalizedTranscript.StartsWith($"{noisePrefix} ", StringComparison.Ordinal)) + { + trimmedTranscript = normalizedTranscript[(noisePrefix.Length + 1)..].TrimStart(); + return true; + } + } + + trimmedTranscript = normalizedTranscript; + return false; + } + private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList tokens) { var selectedReply = YesNoReply.None; @@ -1677,7 +1743,7 @@ public sealed partial class WebSocketTurnFinalizationService( turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts) return false; - var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); + var normalized = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); if (string.IsNullOrWhiteSpace(normalized)) return false; if (normalized is "my birthday" or "my birthday is") @@ -1833,6 +1899,32 @@ public sealed partial class WebSocketTurnFinalizationService( }; } + private static TurnContext WithSanitizedTranscript(TurnContext turn, string transcript) + { + return new TurnContext + { + TurnId = turn.TurnId, + SessionId = turn.SessionId, + TimestampUtc = turn.TimestampUtc, + InputMode = turn.InputMode, + SourceKind = turn.SourceKind, + WakePhrase = turn.WakePhrase, + RawTranscript = transcript, + NormalizedTranscript = transcript, + DeviceId = turn.DeviceId, + HostName = turn.HostName, + RequestId = turn.RequestId, + ProtocolService = turn.ProtocolService, + ProtocolOperation = turn.ProtocolOperation, + FirmwareVersion = turn.FirmwareVersion, + ApplicationVersion = turn.ApplicationVersion, + Locale = turn.Locale, + TimeZone = turn.TimeZone, + IsFollowUpEligible = turn.IsFollowUpEligible, + Attributes = turn.Attributes + }; + } + private static bool ReadBoolAttribute(TurnContext turn, string key) { if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false; @@ -1856,4 +1948,4 @@ public sealed partial class WebSocketTurnFinalizationService( Affirmative = 1, Negative = 2 } -} \ No newline at end of file +} diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs index de993c6..143a30e 100644 --- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs +++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/JiboWebSocketServiceTests.cs @@ -1848,6 +1848,41 @@ public sealed class JiboWebSocketServiceTests listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString()); } + [Fact] + public async Task ClientAsr_YesNoPromptFromAsrHints_MapsUhHuhToYesIntent() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-yesno-huh-token", + Text = + """{"type":"LISTEN","transID":"trans-yesno-huh","data":{"rules":["surprises-ota/want_to_download_now"],"asr":{"hints":["$YESNO"]}}}""" + }); + + var replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-yesno-huh-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-yesno-huh","data":{"text":"uh huh"}}""" + }); + + Assert.Equal(3, replies.Count); + + using var listenPayload = JsonDocument.Parse(replies[0].Text!); + Assert.Equal("uh huh", + listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString()); + Assert.Equal("yes", + listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + Assert.Equal("surprises-ota/want_to_download_now", + listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("rules")[0].GetString()); + Assert.Equal("surprises-ota/want_to_download_now", + listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString()); + } + [Fact] public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal() { @@ -3535,6 +3570,73 @@ public sealed class JiboWebSocketServiceTests Assert.True(session.FollowUpOpen); } + [Fact] + public async Task ClientAsr_LeadingFillerBeforeGreeting_IsStrippedAndRoutesToGreeting() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-filler-greeting-token", + Text = """{"type":"LISTEN","transID":"trans-filler-greeting","data":{"rules":["wake-word"]}}""" + }); + + var replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-filler-greeting-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-filler-greeting","data":{"text":"um hello"}}""" + }); + + Assert.Equal(3, replies.Count); + Assert.Equal("LISTEN", ReadReplyType(replies[0])); + Assert.Equal("EOS", ReadReplyType(replies[1])); + Assert.Equal("SKILL_ACTION", ReadReplyType(replies[2])); + + using var listenPayload = JsonDocument.Parse(replies[0].Text!); + Assert.Equal("hello", + listenPayload.RootElement.GetProperty("data").GetProperty("asr").GetProperty("text").GetString()); + Assert.Equal("hello", + listenPayload.RootElement.GetProperty("data").GetProperty("nlu").GetProperty("intent").GetString()); + + var session = _store.FindSessionByToken("hub-filler-greeting-token"); + Assert.NotNull(session); + Assert.Equal("hello", session.LastTranscript); + Assert.Equal("hello", session.LastIntent); + } + + [Fact] + public async Task ClientAsr_FillerOnlyTranscript_IsIgnoredAsNoise() + { + await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-filler-only-token", + Text = """{"type":"LISTEN","transID":"trans-filler-only","data":{"rules":["wake-word"]}}""" + }); + + var replies = await _service.HandleMessageAsync(new WebSocketMessageEnvelope + { + HostName = "neo-hub.jibo.com", + Path = "/listen", + Kind = "neo-hub-listen", + Token = "hub-filler-only-token", + Text = """{"type":"CLIENT_ASR","transID":"trans-filler-only","data":{"text":"hmm"}}""" + }); + + Assert.Empty(replies); + + var session = _store.FindSessionByToken("hub-filler-only-token"); + Assert.NotNull(session); + Assert.Null(session.LastIntent); + Assert.Null(session.LastTranscript); + } + [Fact] public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam() { @@ -4753,4 +4855,4 @@ public sealed class JiboWebSocketServiceTests return Task.FromResult(snapshot); } } -} \ No newline at end of file +}