Normalize loose STT transcripts before routing
This commit is contained in:
@@ -20,7 +20,7 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
|
||||
|
||||
return Task.FromResult(new SttResult
|
||||
{
|
||||
Text = transcriptHint.Trim(),
|
||||
Text = NormalizeLooseTranscript(transcriptHint),
|
||||
Provider = Name,
|
||||
Confidence = 0.75f,
|
||||
Locale = turn.Locale,
|
||||
@@ -51,4 +51,16 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
|
||||
? transcriptHint?.ToString()
|
||||
: null;
|
||||
}
|
||||
}
|
||||
|
||||
// Matches each run of characters that is not a letter, a digit, whitespace, or an apostrophe.
// Held in a static field so the compiled pattern is built once instead of being resolved
// through the static Regex cache on every call.
private static readonly System.Text.RegularExpressions.Regex LooseTranscriptPunctuationRegex = new(
    @"[^\p{L}\p{N}\s']+",
    System.Text.RegularExpressions.RegexOptions.CultureInvariant | System.Text.RegularExpressions.RegexOptions.Compiled);

// Matches each run of whitespace so it can be collapsed into a single space.
private static readonly System.Text.RegularExpressions.Regex LooseTranscriptWhitespaceRegex = new(
    @"\s+",
    System.Text.RegularExpressions.RegexOptions.CultureInvariant | System.Text.RegularExpressions.RegexOptions.Compiled);

/// <summary>
/// Normalizes a loosely formatted transcript: trims it, lower-cases it with the invariant
/// culture, replaces every run of characters other than letters, digits, whitespace, and
/// apostrophes with a space, and collapses whitespace runs to a single space.
/// </summary>
/// <param name="value">The raw transcript text; may be <c>null</c>, empty, or whitespace.</param>
/// <returns>
/// The normalized transcript, or <see cref="string.Empty"/> when <paramref name="value"/>
/// is <c>null</c>, empty, or whitespace only.
/// </returns>
private static string NormalizeLooseTranscript(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return string.Empty;
    }

    var lowered = value.Trim().ToLowerInvariant();

    // Punctuation becomes a space (not an empty string) so adjacent words are never glued together.
    var withoutPunctuation = LooseTranscriptPunctuationRegex.Replace(lowered, " ");

    // Stripping punctuation can leave doubled or edge spaces; collapse and trim them.
    return LooseTranscriptWhitespaceRegex.Replace(withoutPunctuation, " ").Trim();
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Jibo.Cloud.Infrastructure.Audio;
|
||||
|
||||
/// <summary>
/// Shared helper that cleans up loosely formatted speech-to-text transcripts.
/// </summary>
internal static class AudioTranscriptNormalizer
{
    // One or more consecutive characters that are not letters, digits, whitespace, or apostrophes.
    private static readonly Regex NonWordRunPattern =
        new Regex(@"[^\p{L}\p{N}\s']+", RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // One or more consecutive whitespace characters.
    private static readonly Regex WhitespaceRunPattern =
        new Regex(@"\s+", RegexOptions.CultureInvariant | RegexOptions.Compiled);

    /// <summary>
    /// Trims and lower-cases (invariant culture) the transcript, replaces each run of
    /// non-letter, non-digit, non-whitespace, non-apostrophe characters with a space,
    /// then collapses whitespace runs to a single space and trims the result.
    /// </summary>
    /// <param name="value">The raw transcript; may be <c>null</c>, empty, or whitespace.</param>
    /// <returns>
    /// The normalized transcript, or <see cref="string.Empty"/> when <paramref name="value"/>
    /// is <c>null</c>, empty, or whitespace only.
    /// </returns>
    public static string NormalizeLooseTranscript(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        var lowered = value.Trim().ToLowerInvariant();
        var spacedOut = NonWordRunPattern.Replace(lowered, " ");
        var collapsed = WhitespaceRunPattern.Replace(spacedOut, " ");

        return collapsed.Trim();
    }
}
|
||||
@@ -52,6 +52,7 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
|
||||
cancellationToken);
|
||||
|
||||
var transcript = ExtractTranscript(whisperResult.StdOut);
|
||||
transcript = AudioTranscriptNormalizer.NormalizeLooseTranscript(transcript);
|
||||
if (string.IsNullOrWhiteSpace(transcript))
|
||||
throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
|
||||
|
||||
@@ -154,4 +155,4 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
|
||||
|
||||
return !checkFileExists || File.Exists(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,6 +123,48 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Feeds the strategy a fake whisper stdout line containing dialogue dashes and periods
/// and expects the returned transcript text to be "thank you yes".
/// </summary>
[Fact]
public async Task TranscribeAsync_NormalizesLoosePunctuationFromWhisperOutput()
{
    // Unique scratch directory per run; removed again in the finally block.
    var scratchName = $"openjibo-stt-test-{Guid.NewGuid():N}";
    var tempDirectory = Path.Combine(Path.GetTempPath(), scratchName);
    Directory.CreateDirectory(tempDirectory);

    try
    {
        // Arrange
        var runner = new FakeExternalProcessRunner("[00:00:00.000 --> 00:00:01.000] - Thank you. - Yes.");
        var options = new BufferedAudioSttOptions
        {
            EnableLocalWhisperCpp = true,
            FfmpegPath = "ffmpeg",
            WhisperCliPath = "whisper-cli",
            WhisperModelPath = "model.bin",
            TempDirectory = tempDirectory
        };
        var strategy = new LocalWhisperCppBufferedAudioSttStrategy(options, runner);

        var attributes = new Dictionary<string, object?>
        {
            ["bufferedAudioBytes"] = 47,
            ["bufferedAudioFrames"] = new[] { BuildMinimalOggPage() }
        };
        var turn = new TurnContext
        {
            TurnId = "turn-local-stt-punctuation",
            Locale = "en-US",
            Attributes = attributes
        };

        // Act
        var result = await strategy.TranscribeAsync(turn);

        // Assert
        Assert.Equal("thank you yes", result.Text);
        Assert.Equal("local-whispercpp-buffered-audio", result.Provider);
    }
    finally
    {
        if (Directory.Exists(tempDirectory))
        {
            Directory.Delete(tempDirectory, recursive: true);
        }
    }
}
|
||||
|
||||
private static byte[] BuildMinimalOggPage()
|
||||
{
|
||||
return
|
||||
@@ -148,7 +190,8 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
||||
return page;
|
||||
}
|
||||
|
||||
private sealed class FakeExternalProcessRunner : IExternalProcessRunner
|
||||
private sealed class FakeExternalProcessRunner(string whisperStdOut = "[00:00:00.000 --> 00:00:01.000] tell me a joke")
|
||||
: IExternalProcessRunner
|
||||
{
|
||||
public List<(string FileName, IReadOnlyList<string> Arguments)> Calls { get; } = [];
|
||||
|
||||
@@ -158,12 +201,11 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
||||
Calls.Add((fileName, arguments));
|
||||
|
||||
if (!string.Equals(fileName, "ffmpeg", StringComparison.OrdinalIgnoreCase))
|
||||
return Task.FromResult(new ExternalProcessResult(0, "[00:00:00.000 --> 00:00:01.000] tell me a joke",
|
||||
string.Empty));
|
||||
return Task.FromResult(new ExternalProcessResult(0, whisperStdOut, string.Empty));
|
||||
|
||||
var outputPath = arguments[^1];
|
||||
File.WriteAllBytes(outputPath, "RIFF"u8);
|
||||
return Task.FromResult(new ExternalProcessResult(0, string.Empty, string.Empty));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
using Jibo.Cloud.Application.Services;
|
||||
using Jibo.Runtime.Abstractions;
|
||||
|
||||
namespace Jibo.Cloud.Tests.WebSockets;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="SyntheticBufferedAudioSttStrategy"/>.
/// </summary>
public sealed class SyntheticBufferedAudioSttStrategyTests
{
    /// <summary>
    /// Supplies a transcript hint containing dialogue dashes and periods and expects the
    /// result text "thank you yes", with the provider name and confidence asserted below.
    /// </summary>
    [Fact]
    public async Task TranscribeAsync_NormalizesLoosePunctuationInTranscriptHint()
    {
        // Arrange
        var strategy = new SyntheticBufferedAudioSttStrategy();
        var attributes = new Dictionary<string, object?>
        {
            ["bufferedAudioBytes"] = 42,
            ["audioTranscriptHint"] = "- Thank you. - Yes."
        };
        var turn = new TurnContext
        {
            Attributes = attributes
        };

        // Act
        var result = await strategy.TranscribeAsync(turn);

        // Assert
        Assert.Equal("thank you yes", result.Text);
        Assert.Equal("synthetic-buffered-audio", result.Provider);
        Assert.Equal(0.75f, result.Confidence);
    }
}
|
||||
Reference in New Issue
Block a user