Tighten STT noise filtering and preserve yes-no replies
This commit is contained in:
@@ -3448,6 +3448,11 @@ public sealed class JiboInteractionService(
|
||||
{
|
||||
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
||||
{
|
||||
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
|
||||
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
|
||||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
|
||||
continue;
|
||||
|
||||
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
||||
{
|
||||
trimmedTranscript = string.Empty;
|
||||
|
||||
@@ -88,6 +88,23 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
"thank you"
|
||||
];
|
||||
|
||||
// Leading filler words that TryTrimLeadingTranscriptNoise removes from the
// front of a normalized transcript. Entries are compared with ordinal,
// case-sensitive matching and are tried in the order listed here.
private static readonly string[] TranscriptNoisePrefixes =
[
    "uh", "um", "hmm", "erm", "er", "ah", "eh", "mm", "mmm",
    "well", "so", "actually", "honestly"
];
|
||||
|
||||
private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal)
|
||||
{
|
||||
"yes",
|
||||
@@ -519,7 +536,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
}
|
||||
|
||||
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
||||
if (!IsTranscriptUsable(finalizedTurn))
|
||||
if (!TryGetUsableTranscript(finalizedTurn, out var usableTranscript))
|
||||
finalizedTurn = new TurnContext
|
||||
{
|
||||
TurnId = finalizedTurn.TurnId,
|
||||
@@ -542,6 +559,10 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
|
||||
Attributes = finalizedTurn.Attributes
|
||||
};
|
||||
else if (string.Equals(messageType, "CLIENT_ASR", StringComparison.OrdinalIgnoreCase) &&
|
||||
!string.Equals(usableTranscript, finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript,
|
||||
StringComparison.Ordinal))
|
||||
finalizedTurn = WithSanitizedTranscript(finalizedTurn, usableTranscript);
|
||||
|
||||
if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript))
|
||||
finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello");
|
||||
@@ -1065,12 +1086,17 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="turn"/> has a usable transcript, as decided by
/// <c>TryGetUsableTranscript</c>; the transcript text itself is discarded.
/// </summary>
/// <param name="turn">The turn to inspect.</param>
/// <returns>The boolean result of <c>TryGetUsableTranscript</c>.</returns>
private static bool IsTranscriptUsable(TurnContext turn) =>
    TryGetUsableTranscript(turn, out _);
|
||||
|
||||
private static bool TryGetUsableTranscript(TurnContext turn, out string transcript)
|
||||
{
|
||||
var messageType = ReadMessageType(turn);
|
||||
var clientIntent = ReadAttribute(turn, "clientIntent");
|
||||
var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer");
|
||||
var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey);
|
||||
var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||
transcript = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||
var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray();
|
||||
|
||||
if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) &&
|
||||
@@ -1226,7 +1252,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None;
|
||||
|
||||
var normalized = normalizedTranscript;
|
||||
var normalized = NormalizeUsableTranscript(normalizedTranscript);
|
||||
while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None;
|
||||
@@ -1261,6 +1287,11 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
{
|
||||
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
||||
{
|
||||
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
|
||||
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
|
||||
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
|
||||
continue;
|
||||
|
||||
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
||||
{
|
||||
trimmedTranscript = string.Empty;
|
||||
@@ -1278,6 +1309,41 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
/// Normalizes <paramref name="transcript"/> via <c>NormalizeTranscript</c> and then
/// repeatedly strips leading filler words until none remain.
/// </summary>
/// <param name="transcript">The raw or already-normalized transcript; may be <c>null</c>.</param>
/// <returns>
/// The normalized transcript without leading filler, or <see cref="string.Empty"/> when the
/// normalized text is null/whitespace or consists solely of filler words.
/// </returns>
private static string NormalizeUsableTranscript(string? transcript)
{
    var current = NormalizeTranscript(transcript);
    if (string.IsNullOrWhiteSpace(current)) return string.Empty;

    // Each successful pass removes exactly one leading filler word; stop on the
    // first pass that finds nothing to strip.
    for (;;)
    {
        if (!TryTrimLeadingTranscriptNoise(current, out var remainder)) return current;

        current = remainder;
    }
}
|
||||
|
||||
/// <summary>
/// Attempts to remove a single leading filler word (one of <c>TranscriptNoisePrefixes</c>)
/// from <paramref name="normalizedTranscript"/>.
/// </summary>
/// <param name="normalizedTranscript">The transcript to inspect; matching is ordinal and case-sensitive.</param>
/// <param name="trimmedTranscript">
/// On success, the text after the filler word with leading whitespace removed
/// (<see cref="string.Empty"/> when the transcript was only the filler word).
/// On failure, the unchanged input.
/// </param>
/// <returns><c>true</c> when a filler word was removed; otherwise <c>false</c>.</returns>
private static bool TryTrimLeadingTranscriptNoise(string normalizedTranscript, out string trimmedTranscript)
{
    foreach (var filler in TranscriptNoisePrefixes)
    {
        // "uh huh" (alone or followed by more words) must survive: never peel its
        // leading "uh" off as filler.
        // NOTE(review): other two-word utterances built from filler tokens (e.g. "mm hmm",
        // "uh uh") are not protected here and reduce to empty — confirm that is intended.
        var isProtectedUhHuh =
            filler == "uh" &&
            (normalizedTranscript == "uh huh" ||
             normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal));

        if (isProtectedUhHuh || !normalizedTranscript.StartsWith(filler, StringComparison.Ordinal)) continue;

        // The whole transcript is the filler word.
        if (normalizedTranscript.Length == filler.Length)
        {
            trimmedTranscript = string.Empty;
            return true;
        }

        // Only treat the filler as a prefix at a word boundary (filler followed by a space),
        // so that e.g. "so" does not match inside "sofa".
        if (normalizedTranscript[filler.Length] == ' ')
        {
            trimmedTranscript = normalizedTranscript[(filler.Length + 1)..].TrimStart();
            return true;
        }
    }

    trimmedTranscript = normalizedTranscript;
    return false;
}
|
||||
|
||||
private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens)
|
||||
{
|
||||
var selectedReply = YesNoReply.None;
|
||||
@@ -1677,7 +1743,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts)
|
||||
return false;
|
||||
|
||||
var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||
var normalized = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||
if (string.IsNullOrWhiteSpace(normalized)) return false;
|
||||
|
||||
if (normalized is "my birthday" or "my birthday is")
|
||||
@@ -1833,6 +1899,32 @@ public sealed partial class WebSocketTurnFinalizationService(
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a new <c>TurnContext</c> that copies the listed properties of
/// <paramref name="turn"/> but uses <paramref name="transcript"/> for both
/// transcript fields.
/// </summary>
/// <param name="turn">The turn whose other properties are copied as-is.</param>
/// <param name="transcript">The sanitized transcript to store.</param>
/// <returns>A new <c>TurnContext</c>; <paramref name="turn"/> itself is not modified.</returns>
private static TurnContext WithSanitizedTranscript(TurnContext turn, string transcript) =>
    new TurnContext
    {
        TurnId = turn.TurnId,
        SessionId = turn.SessionId,
        TimestampUtc = turn.TimestampUtc,
        InputMode = turn.InputMode,
        SourceKind = turn.SourceKind,
        WakePhrase = turn.WakePhrase,
        // NOTE(review): RawTranscript is overwritten as well, so the original ASR text is
        // not retained on the returned turn — confirm that is intended.
        RawTranscript = transcript,
        NormalizedTranscript = transcript,
        DeviceId = turn.DeviceId,
        HostName = turn.HostName,
        RequestId = turn.RequestId,
        ProtocolService = turn.ProtocolService,
        ProtocolOperation = turn.ProtocolOperation,
        FirmwareVersion = turn.FirmwareVersion,
        ApplicationVersion = turn.ApplicationVersion,
        Locale = turn.Locale,
        TimeZone = turn.TimeZone,
        IsFollowUpEligible = turn.IsFollowUpEligible,
        // The attribute collection reference is reused, not cloned.
        Attributes = turn.Attributes
    };
|
||||
|
||||
private static bool ReadBoolAttribute(TurnContext turn, string key)
|
||||
{
|
||||
if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false;
|
||||
|
||||
@@ -1848,6 +1848,41 @@ public sealed class JiboWebSocketServiceTests
|
||||
listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString());
|
||||
}
|
||||
|
||||
// Opens a LISTEN carrying a "$YESNO" ASR hint, then sends CLIENT_ASR "uh huh" and
// checks that the first reply keeps the "uh huh" text, reports the "yes" intent
// and echoes the listen rule.
[Fact]
public async Task ClientAsr_YesNoPromptFromAsrHints_MapsUhHuhToYesIntent()
{
    const string token = "hub-yesno-huh-token";
    const string expectedRule = "surprises-ota/want_to_download_now";

    // Both messages share everything except the payload text.
    WebSocketMessageEnvelope Envelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = text
    };

    // Arrange
    await _service.HandleMessageAsync(Envelope(
        """{"type":"LISTEN","transID":"trans-yesno-huh","data":{"rules":["surprises-ota/want_to_download_now"],"asr":{"hints":["$YESNO"]}}}"""));

    // Act
    var replies = await _service.HandleMessageAsync(Envelope(
        """{"type":"CLIENT_ASR","transID":"trans-yesno-huh","data":{"text":"uh huh"}}"""));

    // Assert
    Assert.Equal(3, replies.Count);

    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    var nlu = data.GetProperty("nlu");

    Assert.Equal("uh huh", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("yes", nlu.GetProperty("intent").GetString());
    Assert.Equal(expectedRule, nlu.GetProperty("rules")[0].GetString());
    Assert.Equal(expectedRule, data.GetProperty("match").GetProperty("rule").GetString());
}
|
||||
|
||||
[Fact]
|
||||
public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal()
|
||||
{
|
||||
@@ -3535,6 +3570,73 @@ public sealed class JiboWebSocketServiceTests
|
||||
Assert.True(session.FollowUpOpen);
|
||||
}
|
||||
|
||||
// Sends CLIENT_ASR "um hello" after a wake-word LISTEN and checks that the reply
// and the stored session both carry "hello" as transcript and intent.
[Fact]
public async Task ClientAsr_LeadingFillerBeforeGreeting_IsStrippedAndRoutesToGreeting()
{
    const string token = "hub-filler-greeting-token";

    // Both messages share everything except the payload text.
    WebSocketMessageEnvelope Envelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = text
    };

    // Arrange
    await _service.HandleMessageAsync(Envelope(
        """{"type":"LISTEN","transID":"trans-filler-greeting","data":{"rules":["wake-word"]}}"""));

    // Act
    var replies = await _service.HandleMessageAsync(Envelope(
        """{"type":"CLIENT_ASR","transID":"trans-filler-greeting","data":{"text":"um hello"}}"""));

    // Assert: reply sequence
    Assert.Equal(3, replies.Count);
    Assert.Equal("LISTEN", ReadReplyType(replies[0]));
    Assert.Equal("EOS", ReadReplyType(replies[1]));
    Assert.Equal("SKILL_ACTION", ReadReplyType(replies[2]));

    // Assert: the LISTEN reply carries the filler-free transcript and greeting intent
    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");

    Assert.Equal("hello", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("hello", data.GetProperty("nlu").GetProperty("intent").GetString());

    // Assert: session state
    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Equal("hello", session.LastTranscript);
    Assert.Equal("hello", session.LastIntent);
}
|
||||
|
||||
// Sends CLIENT_ASR "hmm" after a wake-word LISTEN and checks that nothing is
// replied and the session records neither an intent nor a transcript.
[Fact]
public async Task ClientAsr_FillerOnlyTranscript_IsIgnoredAsNoise()
{
    const string token = "hub-filler-only-token";

    // Both messages share everything except the payload text.
    WebSocketMessageEnvelope Envelope(string text) => new()
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = text
    };

    // Arrange
    await _service.HandleMessageAsync(Envelope(
        """{"type":"LISTEN","transID":"trans-filler-only","data":{"rules":["wake-word"]}}"""));

    // Act
    var replies = await _service.HandleMessageAsync(Envelope(
        """{"type":"CLIENT_ASR","transID":"trans-filler-only","data":{"text":"hmm"}}"""));

    // Assert
    Assert.Empty(replies);

    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Null(session.LastIntent);
    Assert.Null(session.LastTranscript);
}
|
||||
|
||||
[Fact]
|
||||
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user