Normalize loose speech-to-text transcripts (trim, lower-case, punctuation runs to spaces,
collapse whitespace) in the synthetic and local whisper.cpp buffered-audio STT strategies,
and cover both paths with tests.

NOTE(review): the blob ids on the `index` lines were not regenerated after the regex-caching
and doc-comment revisions below; regenerate with `git diff` before relying on them.

diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs
index 76a4e61..ed53ebc 100644
--- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs
+++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Application/Services/SyntheticBufferedAudioSttStrategy.cs
@@ -20,7 +20,7 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
 
         return Task.FromResult(new SttResult
         {
-            Text = transcriptHint.Trim(),
+            Text = NormalizeLooseTranscript(transcriptHint),
             Provider = Name,
             Confidence = 0.75f,
             Locale = turn.Locale,
@@ -51,4 +51,32 @@ public sealed class SyntheticBufferedAudioSttStrategy : ISttStrategy
             ? transcriptHint?.ToString()
             : null;
     }
-}
\ No newline at end of file
+
+    // Patterns are built once and reused, matching the cached regexes in
+    // Jibo.Cloud.Infrastructure.Audio.AudioTranscriptNormalizer (same patterns and options).
+    private static readonly System.Text.RegularExpressions.Regex PunctuationToSpaceRegex = new(
+        @"[^\p{L}\p{N}\s']+",
+        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
+        System.Text.RegularExpressions.RegexOptions.Compiled);
+
+    private static readonly System.Text.RegularExpressions.Regex WhitespaceRegex = new(
+        @"\s+",
+        System.Text.RegularExpressions.RegexOptions.CultureInvariant |
+        System.Text.RegularExpressions.RegexOptions.Compiled);
+
+    /// <summary>
+    /// Trims and lower-cases <paramref name="value"/>, replaces each run of characters other than
+    /// letters, numbers, whitespace, or apostrophes with a space, and collapses whitespace runs.
+    /// </summary>
+    /// <param name="value">Transcript hint to normalize; may be null.</param>
+    /// <returns>The normalized text, or <see cref="string.Empty"/> for null or whitespace-only input.</returns>
+    private static string NormalizeLooseTranscript(string? value)
+    {
+        if (string.IsNullOrWhiteSpace(value)) return string.Empty;
+
+        var lowered = value.Trim().ToLowerInvariant();
+        lowered = PunctuationToSpaceRegex.Replace(lowered, " ");
+        lowered = WhitespaceRegex.Replace(lowered, " ");
+        return lowered.Trim();
+    }
+}
diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/AudioTranscriptNormalizer.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/AudioTranscriptNormalizer.cs
new file mode 100644
index 0000000..036b4c4
--- /dev/null
+++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/AudioTranscriptNormalizer.cs
@@ -0,0 +1,35 @@
+using System.Text.RegularExpressions;
+
+namespace Jibo.Cloud.Infrastructure.Audio;
+
+/// <summary>
+/// Normalizes speech-to-text transcripts into a lower-case, punctuation-free, single-spaced form.
+/// </summary>
+internal static class AudioTranscriptNormalizer
+{
+    // Runs of anything other than letters, numbers, whitespace, or apostrophes.
+    private static readonly Regex PunctuationToSpaceRegex = new(
+        @"[^\p{L}\p{N}\s']+",
+        RegexOptions.CultureInvariant | RegexOptions.Compiled);
+
+    // Runs of whitespace.
+    private static readonly Regex WhitespaceRegex = new(
+        @"\s+",
+        RegexOptions.CultureInvariant | RegexOptions.Compiled);
+
+    /// <summary>
+    /// Trims and lower-cases <paramref name="value"/>, replaces punctuation runs with a space,
+    /// and collapses whitespace runs to single spaces.
+    /// </summary>
+    /// <param name="value">Raw transcript to normalize; may be null.</param>
+    /// <returns>The normalized text, or <see cref="string.Empty"/> for null or whitespace-only input.</returns>
+    public static string NormalizeLooseTranscript(string? value)
+    {
+        if (string.IsNullOrWhiteSpace(value)) return string.Empty;
+
+        return WhitespaceRegex.Replace(
+                PunctuationToSpaceRegex.Replace(value.Trim().ToLowerInvariant(), " "),
+                " ")
+            .Trim();
+    }
+}
diff --git a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs
index dd4078c..f34723d 100644
--- a/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs
+++ b/OpenJibo/src/Jibo.Cloud/dotnet/src/Jibo.Cloud.Infrastructure/Audio/LocalWhisperCppBufferedAudioSttStrategy.cs
@@ -52,6 +52,7 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
             cancellationToken);
 
         var transcript = ExtractTranscript(whisperResult.StdOut);
+        transcript = AudioTranscriptNormalizer.NormalizeLooseTranscript(transcript);
         if (string.IsNullOrWhiteSpace(transcript))
             throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
 
@@ -154,4 +155,4 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategy(
 
         return !checkFileExists || File.Exists(path);
     }
-}
\ No newline at end of file
+}
diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs
index 34a984e..8d6f440 100644
--- a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs
+++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/LocalWhisperCppBufferedAudioSttStrategyTests.cs
@@ -123,6 +123,48 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
         }
     }
 
+    [Fact]
+    public async Task TranscribeAsync_NormalizesLoosePunctuationFromWhisperOutput()
+    {
+        var tempDirectory = Path.Combine(Path.GetTempPath(), $"openjibo-stt-test-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(tempDirectory);
+
+        try
+        {
+            var runner = new FakeExternalProcessRunner("[00:00:00.000 --> 00:00:01.000] - Thank you. - Yes.");
+            var strategy = new LocalWhisperCppBufferedAudioSttStrategy(
+                new BufferedAudioSttOptions
+                {
+                    EnableLocalWhisperCpp = true,
+                    FfmpegPath = "ffmpeg",
+                    WhisperCliPath = "whisper-cli",
+                    WhisperModelPath = "model.bin",
+                    TempDirectory = tempDirectory
+                },
+                runner);
+
+            var turn = new TurnContext
+            {
+                TurnId = "turn-local-stt-punctuation",
+                Locale = "en-US",
+                Attributes = new Dictionary<string, object?>
+                {
+                    ["bufferedAudioBytes"] = 47,
+                    ["bufferedAudioFrames"] = new[] { BuildMinimalOggPage() }
+                }
+            };
+
+            var result = await strategy.TranscribeAsync(turn);
+
+            Assert.Equal("thank you yes", result.Text);
+            Assert.Equal("local-whispercpp-buffered-audio", result.Provider);
+        }
+        finally
+        {
+            if (Directory.Exists(tempDirectory)) Directory.Delete(tempDirectory, true);
+        }
+    }
+
     private static byte[] BuildMinimalOggPage()
     {
         return
@@ -148,7 +190,8 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
         return page;
     }
 
-    private sealed class FakeExternalProcessRunner : IExternalProcessRunner
+    private sealed class FakeExternalProcessRunner(string whisperStdOut = "[00:00:00.000 --> 00:00:01.000] tell me a joke")
+        : IExternalProcessRunner
     {
         public List<(string FileName, IReadOnlyList<string> Arguments)> Calls { get; } = [];
 
@@ -158,12 +201,11 @@ public sealed class LocalWhisperCppBufferedAudioSttStrategyTests
             Calls.Add((fileName, arguments));
 
             if (!string.Equals(fileName, "ffmpeg", StringComparison.OrdinalIgnoreCase))
-                return Task.FromResult(new ExternalProcessResult(0, "[00:00:00.000 --> 00:00:01.000] tell me a joke",
-                    string.Empty));
+                return Task.FromResult(new ExternalProcessResult(0, whisperStdOut, string.Empty));
 
             var outputPath = arguments[^1];
             File.WriteAllBytes(outputPath, "RIFF"u8);
             return Task.FromResult(new ExternalProcessResult(0, string.Empty, string.Empty));
         }
     }
-}
\ No newline at end of file
+}
diff --git a/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/SyntheticBufferedAudioSttStrategyTests.cs b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/SyntheticBufferedAudioSttStrategyTests.cs
new file mode 100644
index 0000000..da8f3a7
--- /dev/null
+++ b/OpenJibo/tests/Jibo.Cloud.Tests/WebSockets/SyntheticBufferedAudioSttStrategyTests.cs
@@ -0,0 +1,25 @@
+using Jibo.Cloud.Application.Services;
+using Jibo.Runtime.Abstractions;
+
+namespace Jibo.Cloud.Tests.WebSockets;
+
+public sealed class SyntheticBufferedAudioSttStrategyTests
+{
+    [Fact]
+    public async Task TranscribeAsync_NormalizesLoosePunctuationInTranscriptHint()
+    {
+        var strategy = new SyntheticBufferedAudioSttStrategy();
+        var result = await strategy.TranscribeAsync(new TurnContext
+        {
+            Attributes = new Dictionary<string, object?>
+            {
+                ["bufferedAudioBytes"] = 42,
+                ["audioTranscriptHint"] = "- Thank you. - Yes."
+            }
+        });
+
+        Assert.Equal("thank you yes", result.Text);
+        Assert.Equal("synthetic-buffered-audio", result.Provider);
+        Assert.Equal(0.75f, result.Confidence);
+    }
+}