Normalize loose STT transcripts before routing
This commit is contained in:
@@ -20,7 +20,7 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
|
|||||||
|
|
||||||
return Task.FromResult(new SttResult
|
return Task.FromResult(new SttResult
|
||||||
{
|
{
|
||||||
Text = transcriptHint.Trim(),
|
Text = NormalizeLooseTranscript(transcriptHint),
|
||||||
Provider = Name,
|
Provider = Name,
|
||||||
Confidence = 0.75f,
|
Confidence = 0.75f,
|
||||||
Locale = turn.Locale,
|
Locale = turn.Locale,
|
||||||
@@ -51,4 +51,16 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
|
|||||||
? transcriptHint?.ToString()
|
? transcriptHint?.ToString()
|
||||||
: null;
|
: null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reduces a loosely formatted transcript to lowercase words separated by single spaces.
/// </summary>
/// <param name="value">Raw transcript text; may be <c>null</c>, empty, or whitespace.</param>
/// <returns>
/// The trimmed, invariant-lowercased text in which every run of characters other than
/// Unicode letters, Unicode numbers, whitespace, or apostrophes has been replaced by a
/// space and whitespace runs have been collapsed; <see cref="string.Empty"/> when
/// <paramref name="value"/> is null or whitespace.
/// </returns>
private static string NormalizeLooseTranscript(string? value)
{
    const System.Text.RegularExpressions.RegexOptions regexOptions =
        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
        System.Text.RegularExpressions.RegexOptions.Compiled;

    if (string.IsNullOrWhiteSpace(value))
    {
        return string.Empty;
    }

    // Step 1: everything except letters, numbers, whitespace and apostrophes becomes a space.
    var withoutPunctuation = System.Text.RegularExpressions.Regex.Replace(
        value.Trim().ToLowerInvariant(),
        @"[^\p{L}\p{N}\s']+",
        " ",
        regexOptions);

    // Step 2: collapse the whitespace runs that step 1 may have produced.
    var collapsed = System.Text.RegularExpressions.Regex.Replace(
        withoutPunctuation,
        @"\s+",
        " ",
        regexOptions);

    // Replaced leading/trailing punctuation leaves a space at the edges; strip it.
    return collapsed.Trim();
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
|
namespace Jibo.Cloud.Infrastructure.Audio;
|
||||||
|
|
||||||
|
/// <summary>
/// Normalizes loose speech-to-text transcripts into lowercase, punctuation-free,
/// single-spaced text.
/// </summary>
internal static class AudioTranscriptNormalizer
{
    private const RegexOptions SharedOptions = RegexOptions.CultureInvariant | RegexOptions.Compiled;

    // Matches runs of characters that are not letters, numbers, whitespace or apostrophes.
    private static readonly Regex NonWordRuns = new(@"[^\p{L}\p{N}\s']+", SharedOptions);

    // Matches runs of one or more whitespace characters.
    private static readonly Regex WhitespaceRuns = new(@"\s+", SharedOptions);

    /// <summary>
    /// Lowercases the transcript, replaces punctuation runs with spaces, and collapses whitespace.
    /// </summary>
    /// <param name="value">Raw transcript text; may be <c>null</c>, empty, or whitespace.</param>
    /// <returns>
    /// The normalized transcript, or <see cref="string.Empty"/> when
    /// <paramref name="value"/> is null or whitespace.
    /// </returns>
    public static string NormalizeLooseTranscript(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        var lowered = value.Trim().ToLowerInvariant();
        var withoutPunctuation = NonWordRuns.Replace(lowered, " ");
        var collapsed = WhitespaceRuns.Replace(withoutPunctuation, " ");

        // Punctuation at either end turns into a space above; strip it before returning.
        return collapsed.Trim();
    }
}
|
||||||
@@ -52,6 +52,7 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
|
|||||||
cancellationToken);
|
cancellationToken);
|
||||||
|
|
||||||
var transcript = ExtractTranscript(whisperResult.StdOut);
|
var transcript = ExtractTranscript(whisperResult.StdOut);
|
||||||
|
transcript = AudioTranscriptNormalizer.NormalizeLooseTranscript(transcript);
|
||||||
if (string.IsNullOrWhiteSpace(transcript))
|
if (string.IsNullOrWhiteSpace(transcript))
|
||||||
throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
|
throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
|
||||||
|
|
||||||
|
|||||||
@@ -123,6 +123,48 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that whisper output containing dashes and periods is returned as
/// lowercase, punctuation-free text by the local whisper.cpp strategy.
/// </summary>
[Fact]
public async Task TranscribeAsync_NormalizesLoosePunctuationFromWhisperOutput()
{
    // Unique scratch directory so parallel test runs do not collide.
    var scratchDirectory = Path.Combine(Path.GetTempPath(), $"openjibo-stt-test-{Guid.NewGuid():N}");
    Directory.CreateDirectory(scratchDirectory);

    try
    {
        // Arrange: the fake runner reports a punctuated transcript as whisper stdout.
        var processRunner = new FakeExternalProcessRunner("[00:00:00.000 --> 00:00:01.000] - Thank you. - Yes.");
        var sttOptions = new BufferedAudioSttOptions
        {
            EnableLocalWhisperCpp = true,
            FfmpegPath = "ffmpeg",
            WhisperCliPath = "whisper-cli",
            WhisperModelPath = "model.bin",
            TempDirectory = scratchDirectory
        };
        var sut = new LocalWhisperCppBufferedAudioSttStrategy(sttOptions, processRunner);

        var turnAttributes = new Dictionary<string, object?>
        {
            ["bufferedAudioBytes"] = 47,
            ["bufferedAudioFrames"] = new[] { BuildMinimalOggPage() }
        };
        var turnContext = new TurnContext
        {
            TurnId = "turn-local-stt-punctuation",
            Locale = "en-US",
            Attributes = turnAttributes
        };

        // Act
        var sttResult = await sut.TranscribeAsync(turnContext);

        // Assert
        Assert.Equal("thank you yes", sttResult.Text);
        Assert.Equal("local-whispercpp-buffered-audio", sttResult.Provider);
    }
    finally
    {
        // Always remove the scratch directory, even when an assertion fails.
        if (Directory.Exists(scratchDirectory))
        {
            Directory.Delete(scratchDirectory, true);
        }
    }
}
|
||||||
|
|
||||||
private static byte[] BuildMinimalOggPage()
|
private static byte[] BuildMinimalOggPage()
|
||||||
{
|
{
|
||||||
return
|
return
|
||||||
@@ -148,7 +190,8 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
|||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
private sealed class FakeExternalProcessRunner : IExternalProcessRunner
|
private sealed class FakeExternalProcessRunner(string whisperStdOut = "[00:00:00.000 --> 00:00:01.000] tell me a joke")
|
||||||
|
: IExternalProcessRunner
|
||||||
{
|
{
|
||||||
public List<(string FileName, IReadOnlyList<string> Arguments)> Calls { get; } = [];
|
public List<(string FileName, IReadOnlyList<string> Arguments)> Calls { get; } = [];
|
||||||
|
|
||||||
@@ -158,8 +201,7 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
|
|||||||
Calls.Add((fileName, arguments));
|
Calls.Add((fileName, arguments));
|
||||||
|
|
||||||
if (!string.Equals(fileName, "ffmpeg", StringComparison.OrdinalIgnoreCase))
|
if (!string.Equals(fileName, "ffmpeg", StringComparison.OrdinalIgnoreCase))
|
||||||
return Task.FromResult(new ExternalProcessResult(0, "[00:00:00.000 --> 00:00:01.000] tell me a joke",
|
return Task.FromResult(new ExternalProcessResult(0, whisperStdOut, string.Empty));
|
||||||
string.Empty));
|
|
||||||
|
|
||||||
var outputPath = arguments[^1];
|
var outputPath = arguments[^1];
|
||||||
File.WriteAllBytes(outputPath, "RIFF"u8);
|
File.WriteAllBytes(outputPath, "RIFF"u8);
|
||||||
|
|||||||
@@ -0,0 +1,25 @@
|
|||||||
|
using Jibo.Cloud.Application.Services;
|
||||||
|
using Jibo.Runtime.Abstractions;
|
||||||
|
|
||||||
|
namespace Jibo.Cloud.Tests.WebSockets;
|
||||||
|
|
||||||
|
/// <summary>
/// Tests for <see cref="SyntheticBufferedAudioSttStrategy"/>.
/// </summary>
public sealed class SyntheticBufferedAudioSttStrategyTests
{
    /// <summary>
    /// Verifies that a transcript hint containing dashes and periods is returned as
    /// lowercase, punctuation-free text, with the expected provider and confidence.
    /// </summary>
    [Fact]
    public async Task TranscribeAsync_NormalizesLoosePunctuationInTranscriptHint()
    {
        // Arrange
        var sut = new SyntheticBufferedAudioSttStrategy();
        var turnContext = new TurnContext
        {
            Attributes = new Dictionary<string, object?>
            {
                ["bufferedAudioBytes"] = 42,
                ["audioTranscriptHint"] = "- Thank you. - Yes."
            }
        };

        // Act
        var sttResult = await sut.TranscribeAsync(turnContext);

        // Assert
        Assert.Equal("thank you yes", sttResult.Text);
        Assert.Equal("synthetic-buffered-audio", sttResult.Provider);
        Assert.Equal(0.75f, sttResult.Confidence);
    }
}
|
||||||
Reference in New Issue
Block a user