Tighten STT noise filtering and preserve yes-no replies

This commit is contained in:
Jacob Dubin
2026-05-17 11:18:57 -05:00
parent 3b279fdd6f
commit d8949fcc9a
3 changed files with 206 additions and 7 deletions

View File

@@ -3448,6 +3448,11 @@ public sealed class JiboInteractionService(
{
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
{
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
continue;
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
{
trimmedTranscript = string.Empty;

View File

@@ -88,6 +88,23 @@ public sealed partial class WebSocketTurnFinalizationService(
"thank you"
];
// Leading filler words that TryTrimLeadingTranscriptNoise strips from the front of a transcript.
// An entry matches only as a whole leading token: either the transcript equals the entry, or it
// starts with the entry followed by a single space. Comparison is ordinal (case-sensitive), so the
// entries assume already-normalized input — presumably lower-cased by NormalizeTranscript; confirm.
// NOTE(review): the trimming code protects only the phrase "uh huh". Other two-word backchannels
// made of entries in this table (e.g. "mm hmm", "uh uh") are reduced to an empty transcript —
// confirm that dropping them is intended.
// NOTE(review): "well", "so", "actually" and "honestly" can also open meaningful phrases
// (e.g. "so what"); stripping them changes what downstream matching sees — confirm.
private static readonly string[] TranscriptNoisePrefixes =
[
"uh",
"um",
"hmm",
"erm",
"er",
"ah",
"eh",
"mm",
"mmm",
"well",
"so",
"actually",
"honestly"
];
private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal)
{
"yes",
@@ -519,7 +536,7 @@ public sealed partial class WebSocketTurnFinalizationService(
}
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
if (!IsTranscriptUsable(finalizedTurn))
if (!TryGetUsableTranscript(finalizedTurn, out var usableTranscript))
finalizedTurn = new TurnContext
{
TurnId = finalizedTurn.TurnId,
@@ -542,6 +559,10 @@ public sealed partial class WebSocketTurnFinalizationService(
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
Attributes = finalizedTurn.Attributes
};
else if (string.Equals(messageType, "CLIENT_ASR", StringComparison.OrdinalIgnoreCase) &&
!string.Equals(usableTranscript, finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript,
StringComparison.Ordinal))
finalizedTurn = WithSanitizedTranscript(finalizedTurn, usableTranscript);
if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript))
finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello");
@@ -1065,12 +1086,17 @@ public sealed partial class WebSocketTurnFinalizationService(
}
/// <summary>
/// Reports whether the turn carries a usable transcript, discarding the cleaned transcript text
/// that <c>TryGetUsableTranscript</c> produces.
/// </summary>
/// <param name="turn">The turn to inspect.</param>
/// <returns>The result of <c>TryGetUsableTranscript</c> for <paramref name="turn"/>.</returns>
private static bool IsTranscriptUsable(TurnContext turn) => TryGetUsableTranscript(turn, out _);
private static bool TryGetUsableTranscript(TurnContext turn, out string transcript)
{
var messageType = ReadMessageType(turn);
var clientIntent = ReadAttribute(turn, "clientIntent");
var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer");
var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey);
var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
transcript = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray();
if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) &&
@@ -1226,7 +1252,7 @@ public sealed partial class WebSocketTurnFinalizationService(
{
if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None;
var normalized = normalizedTranscript;
var normalized = NormalizeUsableTranscript(normalizedTranscript);
while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed;
if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None;
@@ -1261,6 +1287,11 @@ public sealed partial class WebSocketTurnFinalizationService(
{
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
{
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
continue;
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
{
trimmedTranscript = string.Empty;
@@ -1278,6 +1309,41 @@ public sealed partial class WebSocketTurnFinalizationService(
return false;
}
/// <summary>
/// Runs the transcript through <c>NormalizeTranscript</c> and then removes every leading filler
/// word recognized by <c>TryTrimLeadingTranscriptNoise</c>.
/// </summary>
/// <param name="transcript">Raw or previously normalized transcript; may be <c>null</c>.</param>
/// <returns>
/// The cleaned transcript, or <see cref="string.Empty"/> when the normalized text is blank or
/// consists solely of filler words.
/// </returns>
private static string NormalizeUsableTranscript(string? transcript)
{
    var candidate = NormalizeTranscript(transcript);
    if (string.IsNullOrWhiteSpace(candidate))
    {
        return string.Empty;
    }

    // Peel one filler word per pass until the head of the transcript is no longer a filler.
    string remainder;
    while (TryTrimLeadingTranscriptNoise(candidate, out remainder))
    {
        candidate = remainder;
    }

    return candidate;
}
/// <summary>
/// Removes at most one leading filler word (an entry of <see cref="TranscriptNoisePrefixes"/>)
/// from the transcript.
/// </summary>
/// <param name="normalizedTranscript">Transcript text; compared ordinally against the filler table.</param>
/// <param name="trimmedTranscript">
/// When a filler was removed: the text following it with leading whitespace trimmed, or
/// <see cref="string.Empty"/> if the transcript was exactly the filler word. Otherwise the input unchanged.
/// </param>
/// <returns><c>true</c> if a filler word was removed; otherwise <c>false</c>.</returns>
private static bool TryTrimLeadingTranscriptNoise(string normalizedTranscript, out string trimmedTranscript)
{
    // "uh huh" has to survive as a phrase, so the "uh" entry is skipped while the transcript
    // is, or begins with, that phrase. The answer does not depend on the loop variable.
    var leadsWithUhHuh = normalizedTranscript is "uh huh" ||
                         normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal);

    foreach (var filler in TranscriptNoisePrefixes)
    {
        if (leadsWithUhHuh && filler is "uh")
        {
            continue;
        }

        if (!normalizedTranscript.StartsWith(filler, StringComparison.Ordinal))
        {
            continue;
        }

        // The whole transcript is the filler word.
        if (normalizedTranscript.Length == filler.Length)
        {
            trimmedTranscript = string.Empty;
            return true;
        }

        // The filler must end at a word boundary (a single space) to count as a leading token.
        if (normalizedTranscript[filler.Length] == ' ')
        {
            trimmedTranscript = normalizedTranscript[(filler.Length + 1)..].TrimStart();
            return true;
        }
    }

    trimmedTranscript = normalizedTranscript;
    return false;
}
private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens)
{
var selectedReply = YesNoReply.None;
@@ -1677,7 +1743,7 @@ public sealed partial class WebSocketTurnFinalizationService(
turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts)
return false;
var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
var normalized = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
if (string.IsNullOrWhiteSpace(normalized)) return false;
if (normalized is "my birthday" or "my birthday is")
@@ -1833,6 +1899,32 @@ public sealed partial class WebSocketTurnFinalizationService(
};
}
/// <summary>
/// Builds a new <c>TurnContext</c> that copies the properties listed below from
/// <paramref name="turn"/> while replacing both transcript properties with
/// <paramref name="transcript"/>.
/// </summary>
/// <param name="turn">The turn whose properties are copied.</param>
/// <param name="transcript">Text stored in both <c>RawTranscript</c> and <c>NormalizedTranscript</c>.</param>
/// <returns>The new context; <paramref name="turn"/> itself is not modified here.</returns>
private static TurnContext WithSanitizedTranscript(TurnContext turn, string transcript) => new()
{
    TurnId = turn.TurnId,
    SessionId = turn.SessionId,
    TimestampUtc = turn.TimestampUtc,
    InputMode = turn.InputMode,
    SourceKind = turn.SourceKind,
    WakePhrase = turn.WakePhrase,
    // Both transcript properties receive the sanitized text; the original raw text is not carried over.
    RawTranscript = transcript,
    NormalizedTranscript = transcript,
    DeviceId = turn.DeviceId,
    HostName = turn.HostName,
    RequestId = turn.RequestId,
    ProtocolService = turn.ProtocolService,
    ProtocolOperation = turn.ProtocolOperation,
    FirmwareVersion = turn.FirmwareVersion,
    ApplicationVersion = turn.ApplicationVersion,
    Locale = turn.Locale,
    TimeZone = turn.TimeZone,
    IsFollowUpEligible = turn.IsFollowUpEligible,
    // Assigned as-is (no copy is made here).
    Attributes = turn.Attributes
};
private static bool ReadBoolAttribute(TurnContext turn, string key)
{
if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false;

View File

@@ -1848,6 +1848,41 @@ public sealed class JiboWebSocketServiceTests
listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString());
}
[Fact]
public async Task ClientAsr_YesNoPromptFromAsrHints_MapsUhHuhToYesIntent()
{
    const string promptRule = "surprises-ota/want_to_download_now";

    // Both messages travel on the same hub listen socket, so only the payload text varies.
    WebSocketMessageEnvelope ListenEnvelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = "hub-yesno-huh-token",
        Text = text
    };

    // Open a listen turn whose ASR hints request a yes/no answer.
    await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"LISTEN","transID":"trans-yesno-huh","data":{"rules":["surprises-ota/want_to_download_now"],"asr":{"hints":["$YESNO"]}}}"""));

    // Answer with "uh huh", which must not be stripped as filler.
    var replies = await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"CLIENT_ASR","transID":"trans-yesno-huh","data":{"text":"uh huh"}}"""));

    Assert.Equal(3, replies.Count);

    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");

    Assert.Equal("uh huh", data.GetProperty("asr").GetProperty("text").GetString());

    var nlu = data.GetProperty("nlu");
    Assert.Equal("yes", nlu.GetProperty("intent").GetString());
    Assert.Equal(promptRule, nlu.GetProperty("rules")[0].GetString());
    Assert.Equal(promptRule, data.GetProperty("match").GetProperty("rule").GetString());
}
[Fact]
public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal()
{
@@ -3535,6 +3570,73 @@ public sealed class JiboWebSocketServiceTests
Assert.True(session.FollowUpOpen);
}
[Fact]
public async Task ClientAsr_LeadingFillerBeforeGreeting_IsStrippedAndRoutesToGreeting()
{
    const string token = "hub-filler-greeting-token";

    // Both messages travel on the same hub listen socket, so only the payload text varies.
    WebSocketMessageEnvelope ListenEnvelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = text
    };

    // Open a wake-word listen turn.
    await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"LISTEN","transID":"trans-filler-greeting","data":{"rules":["wake-word"]}}"""));

    // Send a greeting preceded by the filler word "um".
    var replies = await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"CLIENT_ASR","transID":"trans-filler-greeting","data":{"text":"um hello"}}"""));

    Assert.Equal(3, replies.Count);
    Assert.Equal("LISTEN", ReadReplyType(replies[0]));
    Assert.Equal("EOS", ReadReplyType(replies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(replies[2]));

    // The filler must be gone from the reported ASR text, and the intent must be the greeting.
    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("hello", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("hello", data.GetProperty("nlu").GetProperty("intent").GetString());

    // The stored session must reflect the sanitized transcript as well.
    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Equal("hello", session.LastTranscript);
    Assert.Equal("hello", session.LastIntent);
}
[Fact]
public async Task ClientAsr_FillerOnlyTranscript_IsIgnoredAsNoise()
{
    const string token = "hub-filler-only-token";

    // Both messages travel on the same hub listen socket, so only the payload text varies.
    WebSocketMessageEnvelope ListenEnvelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = text
    };

    // Open a wake-word listen turn.
    await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"LISTEN","transID":"trans-filler-only","data":{"rules":["wake-word"]}}"""));

    // Send a transcript that consists of nothing but a filler word.
    var replies = await _service.HandleMessageAsync(ListenEnvelope(
        """{"type":"CLIENT_ASR","transID":"trans-filler-only","data":{"text":"hmm"}}"""));

    // No reply is produced and nothing is recorded on the session.
    Assert.Empty(replies);

    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Null(session.LastIntent);
    Assert.Null(session.LastTranscript);
}
[Fact]
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
{