Normalize loose STT transcripts before routing

This commit is contained in:
Jacob Dubin
2026-05-17 14:02:47 -05:00
parent 193fa56847
commit c0485da46d
5 changed files with 111 additions and 7 deletions

View File

@@ -20,7 +20,7 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
return Task.FromResult(new SttResult
{
Text = transcriptHint.Trim(),
Text = NormalizeLooseTranscript(transcriptHint),
Provider = Name,
Confidence = 0.75f,
Locale = turn.Locale,
@@ -51,4 +51,16 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
? transcriptHint?.ToString()
: null;
}
}
/// <summary>
/// Reduces a loosely formatted transcript to lowercase words separated by single spaces.
/// </summary>
/// <param name="value">Raw transcript text; may be <c>null</c>, empty, or whitespace only.</param>
/// <returns>
/// The cleaned transcript, or <see cref="string.Empty"/> when <paramref name="value"/>
/// is <c>null</c>, empty, or consists solely of whitespace.
/// </returns>
private static string NormalizeLooseTranscript(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return string.Empty;
    }

    const System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
        System.Text.RegularExpressions.RegexOptions.Compiled;

    var text = value.Trim().ToLowerInvariant();

    // Every run of characters that is not a letter, number, whitespace, or apostrophe
    // becomes a single space, so dashes and sentence punctuation do not glue words together.
    var withoutPunctuation = System.Text.RegularExpressions.Regex.Replace(
        text,
        @"[^\p{L}\p{N}\s']+",
        " ",
        options);

    // The substitution above can leave several spaces in a row; squeeze each run down to one.
    var singleSpaced = System.Text.RegularExpressions.Regex.Replace(
        withoutPunctuation,
        @"\s+",
        " ",
        options);

    // Punctuation at either end was turned into a space, so trim once more.
    return singleSpaced.Trim();
}
}

View File

@@ -0,0 +1,24 @@
using System.Text.RegularExpressions;
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Text cleanup helpers for speech-to-text transcripts.
/// </summary>
internal static class AudioTranscriptNormalizer
{
    private const RegexOptions SharedOptions = RegexOptions.CultureInvariant | RegexOptions.Compiled;

    /// <summary>
    /// Matches one or more consecutive characters that are not letters, numbers,
    /// whitespace, or apostrophes.
    /// </summary>
    private static readonly Regex NonWordRunPattern = new(@"[^\p{L}\p{N}\s']+", SharedOptions);

    /// <summary>Matches one or more consecutive whitespace characters.</summary>
    private static readonly Regex WhitespaceRunPattern = new(@"\s+", SharedOptions);

    /// <summary>
    /// Lowercases a transcript, replaces punctuation runs with spaces, collapses
    /// whitespace runs to a single space, and trims the result.
    /// </summary>
    /// <param name="value">Raw transcript text; may be <c>null</c>, empty, or whitespace only.</param>
    /// <returns>
    /// The cleaned transcript, or <see cref="string.Empty"/> when <paramref name="value"/>
    /// is <c>null</c>, empty, or consists solely of whitespace.
    /// </returns>
    public static string NormalizeLooseTranscript(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        var lowered = value.Trim().ToLowerInvariant();

        // Punctuation becomes a space rather than being dropped, so "a-b" yields two words.
        var depunctuated = NonWordRunPattern.Replace(lowered, " ");

        // The previous step can produce adjacent spaces; reduce each run to exactly one.
        var singleSpaced = WhitespaceRunPattern.Replace(depunctuated, " ");

        return singleSpaced.Trim();
    }
}

View File

@@ -52,6 +52,7 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
cancellationToken);
var transcript = ExtractTranscript(whisperResult.StdOut);
transcript = AudioTranscriptNormalizer.NormalizeLooseTranscript(transcript);
if (string.IsNullOrWhiteSpace(transcript))
throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
@@ -154,4 +155,4 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
return !checkFileExists || File.Exists(path);
}
}
}

View File

@@ -123,6 +123,48 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
}
}
// Verifies that whisper output containing dashes and sentence punctuation is
// returned as lowercase, space-separated words.
[Fact]
public async Task TranscribeAsync_NormalizesLoosePunctuationFromWhisperOutput()
{
    // Arrange: a unique scratch directory so the run cannot collide with other tests.
    var scratchDirectory = Path.Combine(Path.GetTempPath(), $"openjibo-stt-test-{Guid.NewGuid():N}");
    Directory.CreateDirectory(scratchDirectory);

    try
    {
        var processRunner = new FakeExternalProcessRunner("[00:00:00.000 --> 00:00:01.000] - Thank you. - Yes.");
        var sttOptions = new BufferedAudioSttOptions
        {
            EnableLocalWhisperCpp = true,
            FfmpegPath = "ffmpeg",
            WhisperCliPath = "whisper-cli",
            WhisperModelPath = "model.bin",
            TempDirectory = scratchDirectory
        };
        var strategy = new LocalWhisperCppBufferedAudioSttStrategy(sttOptions, processRunner);

        var attributes = new Dictionary<string, object?>
        {
            ["bufferedAudioBytes"] = 47,
            ["bufferedAudioFrames"] = new[] { BuildMinimalOggPage() }
        };
        var turn = new TurnContext
        {
            TurnId = "turn-local-stt-punctuation",
            Locale = "en-US",
            Attributes = attributes
        };

        // Act
        var result = await strategy.TranscribeAsync(turn);

        // Assert
        Assert.Equal("thank you yes", result.Text);
        Assert.Equal("local-whispercpp-buffered-audio", result.Provider);
    }
    finally
    {
        // Remove the scratch directory and anything written into it during the test.
        if (Directory.Exists(scratchDirectory))
        {
            Directory.Delete(scratchDirectory, true);
        }
    }
}
private static byte[] BuildMinimalOggPage()
{
return
@@ -148,7 +190,8 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
return page;
}
private sealed class FakeExternalProcessRunner : IExternalProcessRunner
private sealed class FakeExternalProcessRunner(string whisperStdOut = "[00:00:00.000 --> 00:00:01.000] tell me a joke")
: IExternalProcessRunner
{
public List<(string FileName, IReadOnlyList<string> Arguments)> Calls { get; } = [];
@@ -158,12 +201,11 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
Calls.Add((fileName, arguments));
if (!string.Equals(fileName, "ffmpeg", StringComparison.OrdinalIgnoreCase))
return Task.FromResult(new ExternalProcessResult(0, "[00:00:00.000 --> 00:00:01.000] tell me a joke",
string.Empty));
return Task.FromResult(new ExternalProcessResult(0, whisperStdOut, string.Empty));
var outputPath = arguments[^1];
File.WriteAllBytes(outputPath, "RIFF"u8);
return Task.FromResult(new ExternalProcessResult(0, string.Empty, string.Empty));
}
}
}
}

View File

@@ -0,0 +1,25 @@
using Jibo.Cloud.Application.Services;
using Jibo.Runtime.Abstractions;
namespace Jibo.Cloud.Tests.WebSockets;
/// <summary>
/// Tests for <see cref="SyntheticBufferedAudioSttStrategy"/>.
/// </summary>
public sealed class SyntheticBufferedAudioSttStrategyTests
{
    // Verifies that a transcript hint containing dashes and periods comes back as
    // lowercase, space-separated words, with the provider and confidence unchanged.
    [Fact]
    public async Task TranscribeAsync_NormalizesLoosePunctuationInTranscriptHint()
    {
        // Arrange
        var strategy = new SyntheticBufferedAudioSttStrategy();
        var turn = new TurnContext
        {
            Attributes = new Dictionary<string, object?>
            {
                ["bufferedAudioBytes"] = 42,
                ["audioTranscriptHint"] = "- Thank you. - Yes."
            }
        };

        // Act
        var result = await strategy.TranscribeAsync(turn);

        // Assert
        Assert.Equal("thank you yes", result.Text);
        Assert.Equal("synthetic-buffered-audio", result.Provider);
        Assert.Equal(0.75f, result.Confidence);
    }
}