Tighten STT noise filtering and preserve yes-no replies

This commit is contained in:
Jacob Dubin
2026-05-17 11:18:57 -05:00
parent 3b279fdd6f
commit d8949fcc9a
3 changed files with 206 additions and 7 deletions

View File

@@ -3448,6 +3448,11 @@ public sealed class JiboInteractionService(
{ {
foreach (var acknowledgement in YesNoAcknowledgementPrefixes) foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
{ {
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
continue;
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal)) if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
{ {
trimmedTranscript = string.Empty; trimmedTranscript = string.Empty;
@@ -5093,4 +5098,4 @@ public sealed record JiboInteractionDecision(
string ReplyText, string ReplyText,
string? SkillName = null, string? SkillName = null,
IDictionary<string, object?>? SkillPayload = null, IDictionary<string, object?>? SkillPayload = null,
IDictionary<string, object?>? ContextUpdates = null); IDictionary<string, object?>? ContextUpdates = null);

View File

@@ -88,6 +88,23 @@ public sealed partial class WebSocketTurnFinalizationService(
"thank you" "thank you"
]; ];
// Leading tokens that TryTrimLeadingTranscriptNoise strips from the front of a
// normalized transcript. Entries are compared ordinally as whole leading words
// (exact match, or the token followed by a single space).
private static readonly string[] TranscriptNoisePrefixes =
[
    // Hesitation sounds.
    "uh", "um", "hmm", "erm", "er", "ah", "eh", "mm", "mmm",
    // Discourse-marker words.
    "well", "so", "actually", "honestly"
];
private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal) private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal)
{ {
"yes", "yes",
@@ -519,7 +536,7 @@ public sealed partial class WebSocketTurnFinalizationService(
} }
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken); var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
if (!IsTranscriptUsable(finalizedTurn)) if (!TryGetUsableTranscript(finalizedTurn, out var usableTranscript))
finalizedTurn = new TurnContext finalizedTurn = new TurnContext
{ {
TurnId = finalizedTurn.TurnId, TurnId = finalizedTurn.TurnId,
@@ -542,6 +559,10 @@ public sealed partial class WebSocketTurnFinalizationService(
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible, IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
Attributes = finalizedTurn.Attributes Attributes = finalizedTurn.Attributes
}; };
else if (string.Equals(messageType, "CLIENT_ASR", StringComparison.OrdinalIgnoreCase) &&
!string.Equals(usableTranscript, finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript,
StringComparison.Ordinal))
finalizedTurn = WithSanitizedTranscript(finalizedTurn, usableTranscript);
if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript)) if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript))
finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello"); finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello");
@@ -1065,12 +1086,17 @@ public sealed partial class WebSocketTurnFinalizationService(
} }
private static bool IsTranscriptUsable(TurnContext turn) private static bool IsTranscriptUsable(TurnContext turn)
{
return TryGetUsableTranscript(turn, out _);
}
private static bool TryGetUsableTranscript(TurnContext turn, out string transcript)
{ {
var messageType = ReadMessageType(turn); var messageType = ReadMessageType(turn);
var clientIntent = ReadAttribute(turn, "clientIntent"); var clientIntent = ReadAttribute(turn, "clientIntent");
var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer"); var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer");
var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey); var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey);
var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); transcript = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray(); var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray();
if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) &&
@@ -1226,7 +1252,7 @@ public sealed partial class WebSocketTurnFinalizationService(
{ {
if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None; if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None;
var normalized = normalizedTranscript; var normalized = NormalizeUsableTranscript(normalizedTranscript);
while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed; while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed;
if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None; if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None;
@@ -1261,6 +1287,11 @@ public sealed partial class WebSocketTurnFinalizationService(
{ {
foreach (var acknowledgement in YesNoAcknowledgementPrefixes) foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
{ {
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
continue;
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal)) if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
{ {
trimmedTranscript = string.Empty; trimmedTranscript = string.Empty;
@@ -1278,6 +1309,41 @@ public sealed partial class WebSocketTurnFinalizationService(
return false; return false;
} }
/// <summary>
/// Runs <c>NormalizeTranscript</c> on the input and then strips leading noise
/// prefixes, one at a time, until <c>TryTrimLeadingTranscriptNoise</c> reports
/// that nothing more can be removed.
/// </summary>
/// <param name="transcript">Transcript text to clean up; may be null.</param>
/// <returns>
/// <see cref="string.Empty"/> when the normalized text is null or whitespace;
/// otherwise whatever remains after trimming, which is empty when the
/// transcript consisted only of noise prefixes.
/// </returns>
private static string NormalizeUsableTranscript(string? transcript)
{
    var remaining = NormalizeTranscript(transcript);
    if (string.IsNullOrWhiteSpace(remaining))
    {
        return string.Empty;
    }

    // Several fillers can be stacked ("um so hello"), so keep peeling until stable.
    while (TryTrimLeadingTranscriptNoise(remaining, out var withoutLeadingNoise))
    {
        remaining = withoutLeadingNoise;
    }

    return remaining;
}
/// <summary>
/// Removes at most one leading noise prefix (see <c>TranscriptNoisePrefixes</c>)
/// from an already-normalized transcript.
/// </summary>
/// <param name="normalizedTranscript">Normalized transcript to inspect.</param>
/// <param name="trimmedTranscript">
/// On success, the transcript without the matched prefix (empty when the whole
/// transcript was the prefix); on failure, the input unchanged.
/// </param>
/// <returns><c>true</c> when a prefix was removed; otherwise <c>false</c>.</returns>
private static bool TryTrimLeadingTranscriptNoise(string normalizedTranscript, out string trimmedTranscript)
{
    // A transcript that is, or begins with, "uh huh" must not lose its leading
    // "uh"; the check does not depend on the loop variable, so evaluate it once.
    // NOTE(review): only "uh huh" is exempted; a phrase such as "mm hmm" is still
    // trimmed because both "mm" and "hmm" are listed prefixes - confirm intended.
    var keepLeadingUh =
        normalizedTranscript is "uh huh" ||
        normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal);

    foreach (var filler in TranscriptNoisePrefixes)
    {
        if (keepLeadingUh && filler is "uh")
        {
            continue;
        }

        // The whole transcript is just the filler word.
        if (string.Equals(normalizedTranscript, filler, StringComparison.Ordinal))
        {
            trimmedTranscript = string.Empty;
            return true;
        }

        // Match on "<filler> " so a longer word that merely begins with the same
        // letters is never treated as a filler.
        var fillerWithSpace = filler + " ";
        if (normalizedTranscript.StartsWith(fillerWithSpace, StringComparison.Ordinal))
        {
            trimmedTranscript = normalizedTranscript.Substring(fillerWithSpace.Length).TrimStart();
            return true;
        }
    }

    trimmedTranscript = normalizedTranscript;
    return false;
}
private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens) private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens)
{ {
var selectedReply = YesNoReply.None; var selectedReply = YesNoReply.None;
@@ -1677,7 +1743,7 @@ public sealed partial class WebSocketTurnFinalizationService(
turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts) turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts)
return false; return false;
var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript); var normalized = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
if (string.IsNullOrWhiteSpace(normalized)) return false; if (string.IsNullOrWhiteSpace(normalized)) return false;
if (normalized is "my birthday" or "my birthday is") if (normalized is "my birthday" or "my birthday is")
@@ -1833,6 +1899,32 @@ public sealed partial class WebSocketTurnFinalizationService(
}; };
} }
/// <summary>
/// Builds a new <c>TurnContext</c> that carries over the properties listed below
/// from <paramref name="turn"/>, except that both <c>RawTranscript</c> and
/// <c>NormalizedTranscript</c> are replaced with <paramref name="transcript"/>.
/// </summary>
/// <param name="turn">Turn whose other properties are copied.</param>
/// <param name="transcript">Cleaned transcript to store in both transcript fields.</param>
/// <returns>The newly created turn; <paramref name="turn"/> itself is not modified here.</returns>
private static TurnContext WithSanitizedTranscript(TurnContext turn, string transcript) =>
    new()
    {
        TurnId = turn.TurnId,
        SessionId = turn.SessionId,
        TimestampUtc = turn.TimestampUtc,
        InputMode = turn.InputMode,
        SourceKind = turn.SourceKind,
        WakePhrase = turn.WakePhrase,
        // The only two values that differ from the source turn.
        RawTranscript = transcript,
        NormalizedTranscript = transcript,
        DeviceId = turn.DeviceId,
        HostName = turn.HostName,
        RequestId = turn.RequestId,
        ProtocolService = turn.ProtocolService,
        ProtocolOperation = turn.ProtocolOperation,
        FirmwareVersion = turn.FirmwareVersion,
        ApplicationVersion = turn.ApplicationVersion,
        Locale = turn.Locale,
        TimeZone = turn.TimeZone,
        IsFollowUpEligible = turn.IsFollowUpEligible,
        // The same Attributes reference is reused, not cloned.
        Attributes = turn.Attributes
    };
private static bool ReadBoolAttribute(TurnContext turn, string key) private static bool ReadBoolAttribute(TurnContext turn, string key)
{ {
if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false; if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false;
@@ -1856,4 +1948,4 @@ public sealed partial class WebSocketTurnFinalizationService(
Affirmative = 1, Affirmative = 1,
Negative = 2 Negative = 2
} }
} }

View File

@@ -1848,6 +1848,41 @@ public sealed class JiboWebSocketServiceTests
listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString()); listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString());
} }
// Verifies that a CLIENT_ASR transcript of "uh huh", sent after a LISTEN that
// carries a $YESNO ASR hint, is reported unchanged as the ASR text and mapped
// to the "yes" intent for the prompt's rule.
[Fact]
public async Task ClientAsr_YesNoPromptFromAsrHints_MapsUhHuhToYesIntent()
{
    const string token = "hub-yesno-huh-token";
    const string expectedRule = "surprises-ota/want_to_download_now";

    // Arrange: open the listen turn with the yes/no hint.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text =
            """{"type":"LISTEN","transID":"trans-yesno-huh","data":{"rules":["surprises-ota/want_to_download_now"],"asr":{"hints":["$YESNO"]}}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver the spoken reply.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-yesno-huh","data":{"text":"uh huh"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert
    Assert.Equal(3, replies.Count);

    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    var nlu = data.GetProperty("nlu");

    Assert.Equal("uh huh", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("yes", nlu.GetProperty("intent").GetString());
    Assert.Equal(expectedRule, nlu.GetProperty("rules")[0].GetString());
    Assert.Equal(expectedRule, data.GetProperty("match").GetProperty("rule").GetString());
}
[Fact] [Fact]
public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal() public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal()
{ {
@@ -3535,6 +3570,73 @@ public sealed class JiboWebSocketServiceTests
Assert.True(session.FollowUpOpen); Assert.True(session.FollowUpOpen);
} }
// Verifies that a CLIENT_ASR transcript of "um hello" is reported as "hello"
// with the "hello" intent, and that the session records the stripped text.
[Fact]
public async Task ClientAsr_LeadingFillerBeforeGreeting_IsStrippedAndRoutesToGreeting()
{
    const string token = "hub-filler-greeting-token";

    // Arrange: open the listen turn.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"LISTEN","transID":"trans-filler-greeting","data":{"rules":["wake-word"]}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver a greeting preceded by a filler word.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-filler-greeting","data":{"text":"um hello"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert: reply sequence.
    Assert.Equal(3, replies.Count);
    Assert.Equal("LISTEN", ReadReplyType(replies[0]));
    Assert.Equal("EOS", ReadReplyType(replies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(replies[2]));

    // Assert: the listen payload carries the stripped transcript.
    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("hello", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("hello", data.GetProperty("nlu").GetProperty("intent").GetString());

    // Assert: session state.
    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Equal("hello", session.LastTranscript);
    Assert.Equal("hello", session.LastIntent);
}
// Verifies that a CLIENT_ASR transcript consisting only of "hmm" produces no
// replies and leaves the session's last intent and transcript unset.
[Fact]
public async Task ClientAsr_FillerOnlyTranscript_IsIgnoredAsNoise()
{
    const string token = "hub-filler-only-token";

    // Arrange: open the listen turn.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"LISTEN","transID":"trans-filler-only","data":{"rules":["wake-word"]}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver a transcript that is nothing but a filler word.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-filler-only","data":{"text":"hmm"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert
    Assert.Empty(replies);

    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Null(session.LastIntent);
    Assert.Null(session.LastTranscript);
}
[Fact] [Fact]
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam() public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
{ {
@@ -4753,4 +4855,4 @@ public sealed class JiboWebSocketServiceTests
return Task.FromResult<NewsBriefingSnapshot?>(snapshot); return Task.FromResult<NewsBriefingSnapshot?>(snapshot);
} }
} }
} }