Tighten STT noise filtering and preserve yes-no replies
This commit is contained in:
@@ -3448,6 +3448,11 @@ public sealed class JiboInteractionService(
|
|||||||
{
|
{
|
||||||
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
||||||
{
|
{
|
||||||
|
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
|
||||||
|
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
|
||||||
|
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
|
||||||
|
continue;
|
||||||
|
|
||||||
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
||||||
{
|
{
|
||||||
trimmedTranscript = string.Empty;
|
trimmedTranscript = string.Empty;
|
||||||
@@ -5093,4 +5098,4 @@ public sealed record JiboInteractionDecision(
|
|||||||
string ReplyText,
|
string ReplyText,
|
||||||
string? SkillName = null,
|
string? SkillName = null,
|
||||||
IDictionary<string, object?>? SkillPayload = null,
|
IDictionary<string, object?>? SkillPayload = null,
|
||||||
IDictionary<string, object?>? ContextUpdates = null);
|
IDictionary<string, object?>? ContextUpdates = null);
|
||||||
|
|||||||
@@ -88,6 +88,23 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
"thank you"
|
"thank you"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// Leading filler words and hesitation sounds that TryTrimLeadingTranscriptNoise removes from the
// front of a transcript. An entry matches either the whole transcript or the entry followed by a
// single space, using ordinal (case-sensitive) comparison.
// NOTE(review): all entries are lower case - presumably NormalizeTranscript lower-cases its input; confirm.
private static readonly string[] TranscriptNoisePrefixes =
|
||||||
|
[
|
||||||
|
"uh",
|
||||||
|
"um",
|
||||||
|
"hmm",
|
||||||
|
"erm",
|
||||||
|
"er",
|
||||||
|
"ah",
|
||||||
|
"eh",
|
||||||
|
"mm",
|
||||||
|
"mmm",
|
||||||
|
"well",
|
||||||
|
"so",
|
||||||
|
"actually",
|
||||||
|
"honestly"
|
||||||
|
];
|
||||||
|
|
||||||
private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal)
|
private static readonly HashSet<string> YesNoAffirmativeLeadTokens = new(StringComparer.Ordinal)
|
||||||
{
|
{
|
||||||
"yes",
|
"yes",
|
||||||
@@ -519,7 +536,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
}
|
}
|
||||||
|
|
||||||
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
var finalizedTurn = await ResolveTranscriptAsync(turn, session, cancellationToken);
|
||||||
if (!IsTranscriptUsable(finalizedTurn))
|
if (!TryGetUsableTranscript(finalizedTurn, out var usableTranscript))
|
||||||
finalizedTurn = new TurnContext
|
finalizedTurn = new TurnContext
|
||||||
{
|
{
|
||||||
TurnId = finalizedTurn.TurnId,
|
TurnId = finalizedTurn.TurnId,
|
||||||
@@ -542,6 +559,10 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
|
IsFollowUpEligible = finalizedTurn.IsFollowUpEligible,
|
||||||
Attributes = finalizedTurn.Attributes
|
Attributes = finalizedTurn.Attributes
|
||||||
};
|
};
|
||||||
|
else if (string.Equals(messageType, "CLIENT_ASR", StringComparison.OrdinalIgnoreCase) &&
|
||||||
|
!string.Equals(usableTranscript, finalizedTurn.NormalizedTranscript ?? finalizedTurn.RawTranscript,
|
||||||
|
StringComparison.Ordinal))
|
||||||
|
finalizedTurn = WithSanitizedTranscript(finalizedTurn, usableTranscript);
|
||||||
|
|
||||||
if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript))
|
if (ShouldTreatBufferedHotphraseAsGreeting(finalizedTurn, turnState, allowFallbackOnMissingTranscript))
|
||||||
finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello");
|
finalizedTurn = WithSyntheticTranscript(finalizedTurn, "hello");
|
||||||
@@ -1065,12 +1086,17 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static bool IsTranscriptUsable(TurnContext turn)
|
/// <summary>
/// Reports whether <paramref name="turn"/> carries a usable transcript, without exposing the
/// transcript text itself.
/// </summary>
/// <param name="turn">The turn to evaluate.</param>
/// <returns>Whatever <c>TryGetUsableTranscript</c> returns for the same turn.</returns>
private static bool IsTranscriptUsable(TurnContext turn) =>
    TryGetUsableTranscript(turn, out _);
|
||||||
|
|
||||||
|
private static bool TryGetUsableTranscript(TurnContext turn, out string transcript)
|
||||||
{
|
{
|
||||||
var messageType = ReadMessageType(turn);
|
var messageType = ReadMessageType(turn);
|
||||||
var clientIntent = ReadAttribute(turn, "clientIntent");
|
var clientIntent = ReadAttribute(turn, "clientIntent");
|
||||||
var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer");
|
var pendingProactivityOffer = ReadAttribute(turn, "pendingProactivityOffer");
|
||||||
var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey);
|
var personalReportState = ReadAttribute(turn, PersonalReportOrchestrator.StateMetadataKey);
|
||||||
var transcript = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
transcript = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||||
var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray();
|
var listenRules = ReadRules(turn, "listenRules").Concat(ReadRules(turn, "clientRules")).ToArray();
|
||||||
|
|
||||||
if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) &&
|
if (string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) &&
|
||||||
@@ -1226,7 +1252,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None;
|
if (string.IsNullOrWhiteSpace(normalizedTranscript)) return YesNoReply.None;
|
||||||
|
|
||||||
var normalized = normalizedTranscript;
|
var normalized = NormalizeUsableTranscript(normalizedTranscript);
|
||||||
while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed;
|
while (TryTrimLeadingAcknowledgement(normalized, out var trimmed)) normalized = trimmed;
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None;
|
if (string.IsNullOrWhiteSpace(normalized)) return YesNoReply.None;
|
||||||
@@ -1261,6 +1287,11 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
{
|
{
|
||||||
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
foreach (var acknowledgement in YesNoAcknowledgementPrefixes)
|
||||||
{
|
{
|
||||||
|
if (string.Equals(acknowledgement, "uh", StringComparison.Ordinal) &&
|
||||||
|
(string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
|
||||||
|
normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal)))
|
||||||
|
continue;
|
||||||
|
|
||||||
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
if (string.Equals(normalizedTranscript, acknowledgement, StringComparison.Ordinal))
|
||||||
{
|
{
|
||||||
trimmedTranscript = string.Empty;
|
trimmedTranscript = string.Empty;
|
||||||
@@ -1278,6 +1309,41 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Normalizes <paramref name="transcript"/> and then removes every leading filler token from it.
/// </summary>
/// <param name="transcript">Raw or already-normalized transcript text; may be <c>null</c>.</param>
/// <returns>
/// The normalized transcript with all leading fillers removed, or <see cref="string.Empty"/> when
/// normalization yields nothing but white space (or nothing at all).
/// </returns>
private static string NormalizeUsableTranscript(string? transcript)
{
    var candidate = NormalizeTranscript(transcript);
    if (string.IsNullOrWhiteSpace(candidate))
    {
        return string.Empty;
    }

    // Peel one filler per pass until the front of the transcript is no longer a filler
    // (for example "um uh hello" -> "uh hello" -> "hello").
    while (true)
    {
        if (!TryTrimLeadingTranscriptNoise(candidate, out var remainder))
        {
            return candidate;
        }

        candidate = remainder;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Removes a single leading filler token (one entry of <c>TranscriptNoisePrefixes</c>) from
/// <paramref name="normalizedTranscript"/>.
/// </summary>
/// <param name="normalizedTranscript">Transcript text to inspect.</param>
/// <param name="trimmedTranscript">
/// On success, the text that follows the filler (empty when the transcript was only the filler);
/// otherwise the unchanged input.
/// </param>
/// <returns><c>true</c> when a filler was removed; <c>false</c> when nothing matched.</returns>
private static bool TryTrimLeadingTranscriptNoise(string normalizedTranscript, out string trimmedTranscript)
{
    // "uh huh" must keep its leading "uh": stripping it would leave just "huh".
    // NOTE(review): only "uh huh" is protected. "mm" and "hmm" are both listed as fillers, so
    // "mm hmm" is reduced to "hmm" here (and to empty by repeated trimming) - confirm that is intended.
    var leadsWithUhHuh =
        string.Equals(normalizedTranscript, "uh huh", StringComparison.Ordinal) ||
        normalizedTranscript.StartsWith("uh huh ", StringComparison.Ordinal);

    foreach (var filler in TranscriptNoisePrefixes)
    {
        if (leadsWithUhHuh && string.Equals(filler, "uh", StringComparison.Ordinal))
        {
            continue;
        }

        // The transcript is nothing but the filler.
        if (string.Equals(normalizedTranscript, filler, StringComparison.Ordinal))
        {
            trimmedTranscript = string.Empty;
            return true;
        }

        // Require a space after the filler so that e.g. "so" never matches "some".
        var fillerWithSeparator = filler + " ";
        if (!normalizedTranscript.StartsWith(fillerWithSeparator, StringComparison.Ordinal))
        {
            continue;
        }

        trimmedTranscript = normalizedTranscript[fillerWithSeparator.Length..].TrimStart();
        return true;
    }

    trimmedTranscript = normalizedTranscript;
    return false;
}
|
||||||
|
|
||||||
private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens)
|
private static YesNoReply TryClassifyTrailingYesNoReply(IReadOnlyList<string> tokens)
|
||||||
{
|
{
|
||||||
var selectedReply = YesNoReply.None;
|
var selectedReply = YesNoReply.None;
|
||||||
@@ -1677,7 +1743,7 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts)
|
turnState.FinalizeAttemptCount >= AutoFinalizeContinuationDeferralMaxAttempts)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
var normalized = NormalizeTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
var normalized = NormalizeUsableTranscript(turn.NormalizedTranscript ?? turn.RawTranscript);
|
||||||
if (string.IsNullOrWhiteSpace(normalized)) return false;
|
if (string.IsNullOrWhiteSpace(normalized)) return false;
|
||||||
|
|
||||||
if (normalized is "my birthday" or "my birthday is")
|
if (normalized is "my birthday" or "my birthday is")
|
||||||
@@ -1833,6 +1899,32 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds a new <c>TurnContext</c> that mirrors <paramref name="turn"/> except that both
/// <c>RawTranscript</c> and <c>NormalizedTranscript</c> are set to <paramref name="transcript"/>.
/// </summary>
/// <param name="turn">The turn whose properties are copied.</param>
/// <param name="transcript">The cleaned transcript text to store on the copy.</param>
/// <returns>The new turn; its <c>Attributes</c> is the same instance as the source turn's.</returns>
private static TurnContext WithSanitizedTranscript(TurnContext turn, string transcript) =>
    new()
    {
        // Identity and origin of the turn.
        TurnId = turn.TurnId,
        SessionId = turn.SessionId,
        TimestampUtc = turn.TimestampUtc,
        InputMode = turn.InputMode,
        SourceKind = turn.SourceKind,
        WakePhrase = turn.WakePhrase,

        // The only values that differ from the source turn.
        RawTranscript = transcript,
        NormalizedTranscript = transcript,

        // Device, protocol and environment details.
        DeviceId = turn.DeviceId,
        HostName = turn.HostName,
        RequestId = turn.RequestId,
        ProtocolService = turn.ProtocolService,
        ProtocolOperation = turn.ProtocolOperation,
        FirmwareVersion = turn.FirmwareVersion,
        ApplicationVersion = turn.ApplicationVersion,
        Locale = turn.Locale,
        TimeZone = turn.TimeZone,
        IsFollowUpEligible = turn.IsFollowUpEligible,
        Attributes = turn.Attributes
    };
|
||||||
|
|
||||||
private static bool ReadBoolAttribute(TurnContext turn, string key)
|
private static bool ReadBoolAttribute(TurnContext turn, string key)
|
||||||
{
|
{
|
||||||
if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false;
|
if (!turn.Attributes.TryGetValue(key, out var value) || value is null) return false;
|
||||||
@@ -1856,4 +1948,4 @@ public sealed partial class WebSocketTurnFinalizationService(
|
|||||||
Affirmative = 1,
|
Affirmative = 1,
|
||||||
Negative = 2
|
Negative = 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1848,6 +1848,41 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString());
|
listenPayload.RootElement.GetProperty("data").GetProperty("match").GetProperty("rule").GetString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A "$YESNO" ASR hint on the LISTEN message followed by the transcript "uh huh" must be reported
// as the "yes" intent against the prompt's rule, with the transcript text left as "uh huh".
[Fact]
public async Task ClientAsr_YesNoPromptFromAsrHints_MapsUhHuhToYesIntent()
{
    const string token = "hub-yesno-huh-token";
    const string promptRule = "surprises-ota/want_to_download_now";

    // Arrange: open the listen turn with the yes/no hint.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text =
            """{"type":"LISTEN","transID":"trans-yesno-huh","data":{"rules":["surprises-ota/want_to_download_now"],"asr":{"hints":["$YESNO"]}}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver the client transcript for the same transaction.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-yesno-huh","data":{"text":"uh huh"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert
    Assert.Equal(3, replies.Count);

    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("uh huh", data.GetProperty("asr").GetProperty("text").GetString());

    var nlu = data.GetProperty("nlu");
    Assert.Equal("yes", nlu.GetProperty("intent").GetString());
    Assert.Equal(promptRule, nlu.GetProperty("rules")[0].GetString());
    Assert.Equal(promptRule, data.GetProperty("match").GetProperty("rule").GetString());
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal()
|
public async Task ClientAsr_SharedYesNoPrompt_StripsGlobalRulesAndStaysLocal()
|
||||||
{
|
{
|
||||||
@@ -3535,6 +3570,73 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
Assert.True(session.FollowUpOpen);
|
Assert.True(session.FollowUpOpen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The transcript "um hello" must be handled as "hello": the filler is dropped from the reported
// ASR text, the intent is "hello", and the session records the cleaned transcript.
[Fact]
public async Task ClientAsr_LeadingFillerBeforeGreeting_IsStrippedAndRoutesToGreeting()
{
    const string token = "hub-filler-greeting-token";

    // Arrange: open the listen turn.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"LISTEN","transID":"trans-filler-greeting","data":{"rules":["wake-word"]}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver a greeting preceded by a filler word.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-filler-greeting","data":{"text":"um hello"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert: the three replies arrive in the expected order.
    string[] expectedReplyTypes = ["LISTEN", "EOS", "SKILL_ACTION"];
    Assert.Equal(expectedReplyTypes.Length, replies.Count);
    for (var index = 0; index < expectedReplyTypes.Length; index++)
    {
        Assert.Equal(expectedReplyTypes[index], ReadReplyType(replies[index]));
    }

    using var listenPayload = JsonDocument.Parse(replies[0].Text!);
    var data = listenPayload.RootElement.GetProperty("data");
    Assert.Equal("hello", data.GetProperty("asr").GetProperty("text").GetString());
    Assert.Equal("hello", data.GetProperty("nlu").GetProperty("intent").GetString());

    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Equal("hello", session.LastTranscript);
    Assert.Equal("hello", session.LastIntent);
}
|
||||||
|
|
||||||
|
// A transcript consisting of nothing but a filler ("hmm") must produce no replies and must leave
// the session's last intent and last transcript unset.
[Fact]
public async Task ClientAsr_FillerOnlyTranscript_IsIgnoredAsNoise()
{
    const string token = "hub-filler-only-token";

    // Arrange: open the listen turn.
    var listenRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"LISTEN","transID":"trans-filler-only","data":{"rules":["wake-word"]}}"""
    };
    await _service.HandleMessageAsync(listenRequest);

    // Act: deliver a transcript that is only a filler word.
    var asrRequest = new WebSocketMessageEnvelope
    {
        HostName = "neo-hub.jibo.com",
        Path = "/listen",
        Kind = "neo-hub-listen",
        Token = token,
        Text = """{"type":"CLIENT_ASR","transID":"trans-filler-only","data":{"text":"hmm"}}"""
    };
    var replies = await _service.HandleMessageAsync(asrRequest);

    // Assert
    Assert.Empty(replies);

    var session = _store.FindSessionByToken(token);
    Assert.NotNull(session);
    Assert.Null(session.LastIntent);
    Assert.Null(session.LastTranscript);
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
|
public async Task BufferedAudio_WithSyntheticTranscriptHint_FinalizesThroughSttSeam()
|
||||||
{
|
{
|
||||||
@@ -4753,4 +4855,4 @@ public sealed class JiboWebSocketServiceTests
|
|||||||
return Task.FromResult<NewsBriefingSnapshot?>(snapshot);
|
return Task.FromResult<NewsBriefingSnapshot?>(snapshot);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user