first stab at solving for menus and real STT

This commit is contained in:
Jacob Dubin
2026-04-16 15:40:28 -05:00
parent efe4dfd04e
commit fe1e11653f
19 changed files with 799 additions and 19 deletions

View File

@@ -108,3 +108,25 @@ Current raw-audio behavior is still a compatibility bridge:
- if buffered audio has a synthetic transcript hint, the server now auto-finalizes the turn and emits `LISTEN` + `EOS` + `SKILL_ACTION`
- if buffered audio crosses the finalize threshold without a usable transcript, the server now emits a Node-style fallback completion with `EOS` instead of hanging the turn forever
- this is intentionally not a claim of real ASR parity
## Buffered Audio STT
The `.NET` websocket stack now keeps the Ogg/Opus frames it receives buffered in memory for each in-flight turn.
That enables two distinct STT paths:
- fixture-oriented synthetic transcript hints for replay and parity tests
- an opt-in local tool-based path that can normalize the buffered Ogg pages, call `ffmpeg`, and then call `whisper.cpp`
The local tool path is intentionally off by default. It exists to help map real robot audio behavior while the stable hosted cloud remains the primary goal.
Configuration lives under `OpenJibo:Stt`:
- `EnableLocalWhisperCpp`
- `FfmpegPath`
- `WhisperCliPath`
- `WhisperModelPath`
- `WhisperLanguage`
- `TempDirectory`
This is not yet a claim of production-ready onboard ASR. It is a `.NET` discovery seam that keeps us compatible with the Node oracle while we evaluate longer-term options such as Azure-hosted STT or a managed decode/transcribe stack.

View File

@@ -8,22 +8,30 @@ public sealed class DemoConversationBroker : IConversationBroker
{
var transcript = (turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty).Trim();
var lowered = transcript.ToLowerInvariant();
var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent)
? rawClientIntent?.ToString()
: null;
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent);
var reply = transcript.Length == 0
var reply = semanticIntent switch
{
"time" => $"It is {DateTime.Now:hh:mm tt}.",
"date" => $"Today is {DateTime.Now:dddd, MMMM d}.",
"dance" => "Okay. Watch this.",
_ => transcript.Length == 0
? "I am listening."
: lowered.Contains("time")
? $"It is {DateTime.Now:hh:mm tt}."
: lowered.Contains("hello") || lowered.Contains("hi")
? "Hello from the OpenJibo cloud."
: lowered.Contains("joke")
? "Why did the robot bring a ladder? Because it wanted to reach the cloud."
: $"I heard: {transcript}";
: lowered.Contains("hello") || lowered.Contains("hi")
? "Hello from the OpenJibo cloud."
: lowered.Contains("joke")
? "Why did the robot bring a ladder? Because it wanted to reach the cloud."
: $"I heard: {transcript}"
};
var plan = new ResponsePlan
{
SessionId = turn.SessionId,
Status = ResponseStatus.Succeeded,
IntentName = lowered.Contains("joke") ? "joke" : lowered.Contains("time") ? "time" : "chat",
IntentName = semanticIntent,
Topic = "conversation",
DeviceId = turn.DeviceId,
TargetHost = turn.HostName,
@@ -72,4 +80,39 @@ public sealed class DemoConversationBroker : IConversationBroker
return Task.FromResult(plan);
}
/// <summary>
/// Maps a turn onto one of the demo intents: "time", "date", "joke", "dance" or "chat".
/// </summary>
/// <param name="loweredTranscript">Transcript text that the caller has already lower-cased; keywords are matched ordinally.</param>
/// <param name="clientIntent">Intent name supplied by the client, if any; compared case-insensitively.</param>
/// <returns>The resolved intent name, or "chat" when nothing matches.</returns>
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent)
{
    bool ClientSent(string intentName) =>
        string.Equals(clientIntent, intentName, StringComparison.OrdinalIgnoreCase);

    bool Mentions(string keyword) =>
        loweredTranscript.Contains(keyword, StringComparison.Ordinal);

    // Arms are evaluated top to bottom: an explicit client intent beats any keyword,
    // and "joke"/"dance" are checked before "time"/"date".
    return true switch
    {
        _ when ClientSent("askForTime") => "time",
        _ when ClientSent("askForDate") => "date",
        _ when Mentions("joke") => "joke",
        _ when Mentions("dance") => "dance",
        _ when Mentions("time") => "time",
        _ when Mentions("date") || Mentions("day") => "date",
        _ => "chat"
    };
}
}

View File

@@ -9,12 +9,12 @@ public sealed class ProtocolToTurnContextMapper
public TurnContext MapListenMessage(WebSocketMessageEnvelope envelope, CloudSession session, string messageType)
{
var turnState = session.TurnState;
var text = ExtractTranscript(envelope.Text);
var protocolOperation = messageType.ToLowerInvariant();
var attributes = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
{
["messageType"] = messageType
};
var text = ExtractTranscript(envelope.Text, attributes);
if (!string.IsNullOrWhiteSpace(turnState.TransId))
{
@@ -35,6 +35,7 @@ public sealed class ProtocolToTurnContextMapper
{
attributes["bufferedAudioBytes"] = turnState.BufferedAudioBytes;
attributes["bufferedAudioChunks"] = turnState.BufferedAudioChunkCount;
attributes["bufferedAudioFrames"] = turnState.BufferedAudioFrames.Select(frame => frame.ToArray()).ToArray();
}
if (!string.IsNullOrWhiteSpace(turnState.AudioTranscriptHint))
@@ -66,7 +67,7 @@ public sealed class ProtocolToTurnContextMapper
};
}
private static string? ExtractTranscript(string? text)
private static string? ExtractTranscript(string? text, IDictionary<string, object?> attributes)
{
if (string.IsNullOrWhiteSpace(text))
{
@@ -99,6 +100,25 @@ public sealed class ProtocolToTurnContextMapper
}
if (data.TryGetProperty("intent", out var intent) && intent.ValueKind == JsonValueKind.String)
{
attributes["clientIntent"] = intent.GetString();
}
if (data.TryGetProperty("rules", out var rules) && rules.ValueKind == JsonValueKind.Array)
{
attributes["clientRules"] = rules.EnumerateArray()
.Where(item => item.ValueKind == JsonValueKind.String)
.Select(item => item.GetString() ?? string.Empty)
.Where(rule => !string.IsNullOrWhiteSpace(rule))
.ToArray();
}
if (data.TryGetProperty("entities", out var entities) && entities.ValueKind == JsonValueKind.Object)
{
attributes["clientEntities"] = entities.Clone();
}
if (intent.ValueKind == JsonValueKind.String)
{
return intent.GetString();
}

View File

@@ -10,11 +10,20 @@ public sealed class ResponsePlanToSocketMessagesMapper
{
var speak = plan.Actions.OfType<SpeakAction>().FirstOrDefault();
var skill = plan.Actions.OfType<InvokeNativeSkillAction>().FirstOrDefault();
var messageType = ReadAttribute(turn, "messageType");
var transId = turn.Attributes.TryGetValue("transID", out var transIdValue)
? transIdValue?.ToString() ?? string.Empty
: session.LastTransId ?? string.Empty;
var transcript = turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty;
var rules = ReadRules(turn);
var clientIntent = ReadAttribute(turn, "clientIntent");
var rules = ReadRules(turn, messageType);
var outboundIntent = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
? clientIntent!
: plan.IntentName ?? "unknown";
var outboundAsrText = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(clientIntent)
? clientIntent!
: transcript;
var entities = ReadEntities(turn, messageType);
var messages = new List<SocketReplyPlan>();
messages.Add(new SocketReplyPlan(JsonSerializer.Serialize(new
@@ -27,18 +36,18 @@ public sealed class ResponsePlanToSocketMessagesMapper
{
confidence = 0.95,
final = true,
text = transcript
text = outboundAsrText
},
nlu = new
{
confidence = 0.95,
intent = plan.IntentName ?? "unknown",
intent = outboundIntent,
rules,
entities = new Dictionary<string, object?>()
entities
},
match = new
{
intent = plan.IntentName ?? "unknown",
intent = outboundIntent,
rule = rules.FirstOrDefault() ?? string.Empty,
score = 0.95
}
@@ -107,9 +116,13 @@ public sealed class ResponsePlanToSocketMessagesMapper
];
}
private static IReadOnlyList<string> ReadRules(TurnContext turn)
private static IReadOnlyList<string> ReadRules(TurnContext turn, string? messageType)
{
if (!turn.Attributes.TryGetValue("listenRules", out var value))
var attributeName = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase)
? "clientRules"
: "listenRules";
if (!turn.Attributes.TryGetValue(attributeName, out var value))
{
return [];
}
@@ -122,12 +135,42 @@ public sealed class ResponsePlanToSocketMessagesMapper
};
}
/// <summary>
/// Picks the entities object to echo back in the outbound NLU payload.
/// </summary>
/// <param name="turn">Turn whose attributes may carry a "clientEntities" value.</param>
/// <param name="messageType">Inbound message type; only "CLIENT_NLU" (case-insensitive) forwards client entities.</param>
/// <returns>
/// The stored JSON object or dictionary for CLIENT_NLU turns; otherwise a new empty dictionary.
/// </returns>
private static object ReadEntities(TurnContext turn, string? messageType)
{
    var isClientNlu = string.Equals(messageType, "CLIENT_NLU", StringComparison.OrdinalIgnoreCase);

    if (isClientNlu && turn.Attributes.TryGetValue("clientEntities", out var captured))
    {
        // A null or unexpectedly typed value matches neither pattern and falls through to the empty result.
        if (captured is JsonElement { ValueKind: JsonValueKind.Object } element)
        {
            return element;
        }

        if (captured is IDictionary<string, object?> map)
        {
            return map;
        }
    }

    return new Dictionary<string, object?>();
}
/// <summary>
/// Reads a turn attribute and renders it as text.
/// </summary>
/// <param name="turn">Turn whose attribute bag is consulted.</param>
/// <param name="key">Attribute name to look up.</param>
/// <returns>The attribute's <c>ToString()</c> value, or <c>null</c> when the key is absent or its value is null.</returns>
private static string? ReadAttribute(TurnContext turn, string key)
{
    if (!turn.Attributes.TryGetValue(key, out var found))
    {
        return null;
    }

    return found?.ToString();
}
private static object BuildSkillPayload(ResponsePlan plan, TurnContext turn, string transId, SpeakAction speak, InvokeNativeSkillAction? skill)
{
var isJoke = string.Equals(plan.IntentName, "joke", StringComparison.OrdinalIgnoreCase) ||
string.Equals(skill?.SkillName, "@be/joke", StringComparison.OrdinalIgnoreCase);
var isDance = string.Equals(plan.IntentName, "dance", StringComparison.OrdinalIgnoreCase);
var skillId = isJoke ? "@be/joke" : skill?.SkillName ?? "chitchat-skill";
var esml = isJoke
var esml = isDance
? "<speak>Okay.<break size='0.2'/> Watch this.<anim cat='dance' filter='music, rom-upbeat' /></speak>"
: isJoke
? $"<speak><es cat='happy' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>"
: $"<speak><es cat='neutral' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>";
var mimId = isJoke ? "runtime-joke" : "runtime-chat";

View File

@@ -23,6 +23,10 @@ public sealed class WebSocketTurnFinalizationService(
turnState.FirstAudioReceivedUtc ??= DateTimeOffset.UtcNow;
turnState.BufferedAudioChunkCount += 1;
turnState.BufferedAudioBytes += envelope.Binary?.Length ?? 0;
if (envelope.Binary is { Length: > 0 })
{
turnState.BufferedAudioFrames.Add(envelope.Binary.ToArray());
}
turnState.LastAudioReceivedUtc = DateTimeOffset.UtcNow;
turnState.AwaitingTurnCompletion = true;
session.Metadata["lastAudioBytes"] = envelope.Binary?.Length ?? 0;
@@ -223,6 +227,7 @@ public sealed class WebSocketTurnFinalizationService(
session.TurnState.BufferedAudioChunkCount = 0;
session.TurnState.FirstAudioReceivedUtc = null;
session.TurnState.LastAudioReceivedUtc = null;
session.TurnState.BufferedAudioFrames.Clear();
session.TurnState.FinalizeAttemptCount = 0;
session.Metadata.Remove("audioTranscriptHint");
}
@@ -236,6 +241,7 @@ public sealed class WebSocketTurnFinalizationService(
turnState.LastAudioReceivedUtc = null;
turnState.BufferedAudioChunkCount = 0;
turnState.BufferedAudioBytes = 0;
turnState.BufferedAudioFrames.Clear();
turnState.FinalizeAttemptCount = 0;
turnState.AwaitingTurnCompletion = false;
turnState.SawListen = false;

View File

@@ -9,6 +9,7 @@ public sealed class WebSocketTurnState
public DateTimeOffset? LastAudioReceivedUtc { get; set; }
public int BufferedAudioChunkCount { get; set; }
public int BufferedAudioBytes { get; set; }
public List<byte[]> BufferedAudioFrames { get; } = [];
public int FinalizeAttemptCount { get; set; }
public bool AwaitingTurnCompletion { get; set; }
public bool SawListen { get; set; }

View File

@@ -0,0 +1,11 @@
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Settings for the opt-in local speech-to-text path that runs ffmpeg and whisper.cpp over buffered audio.
/// </summary>
public sealed class BufferedAudioSttOptions
{
/// <summary>Turns the local whisper.cpp path on; defaults to <c>false</c>.</summary>
public bool EnableLocalWhisperCpp { get; set; }
/// <summary>Executable used to launch ffmpeg; the local path is not used while this is blank.</summary>
public string? FfmpegPath { get; set; }
/// <summary>Executable used to launch the whisper.cpp CLI; the local path is not used while this is blank.</summary>
public string? WhisperCliPath { get; set; }
/// <summary>Model file handed to the whisper.cpp CLI; the local path is not used while this is blank.</summary>
public string? WhisperModelPath { get; set; }
/// <summary>Language value handed to the whisper.cpp CLI; defaults to "en".</summary>
public string WhisperLanguage { get; set; } = "en";
/// <summary>Directory for intermediate audio files; when blank, a folder under the system temp path is used.</summary>
public string? TempDirectory { get; set; }
}

View File

@@ -0,0 +1,42 @@
using System.Diagnostics;
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Default <see cref="IExternalProcessRunner"/>: launches a child process without a shell,
/// captures its standard output and standard error, and waits for it to exit.
/// </summary>
public sealed class ExternalProcessRunner : IExternalProcessRunner
{
    /// <summary>
    /// Runs <paramref name="fileName"/> with the given arguments and returns its captured output.
    /// </summary>
    /// <param name="fileName">Executable to start.</param>
    /// <param name="arguments">Arguments passed individually (no shell quoting is involved).</param>
    /// <param name="cancellationToken">Cancels the wait; the child process tree is terminated when it fires.</param>
    /// <returns>Exit code (always 0 on return), standard output and standard error of the process.</returns>
    /// <exception cref="InvalidOperationException">The process exited with a non-zero exit code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task<ExternalProcessResult> RunAsync(string fileName, IReadOnlyList<string> arguments, CancellationToken cancellationToken = default)
    {
        using var process = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = fileName,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            }
        };

        foreach (var argument in arguments)
        {
            process.StartInfo.ArgumentList.Add(argument);
        }

        process.Start();

        try
        {
            // Start draining both pipes before waiting so a chatty child cannot block on a full pipe buffer.
            var stdOutTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
            var stdErrTask = process.StandardError.ReadToEndAsync(cancellationToken);

            await process.WaitForExitAsync(cancellationToken);

            var stdOut = await stdOutTask;
            var stdErr = await stdErrTask;

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"External process '{fileName}' failed with exit code {process.ExitCode}: {stdErr}");
            }

            return new ExternalProcessResult(process.ExitCode, stdOut, stdErr);
        }
        catch (OperationCanceledException)
        {
            // Cancelling WaitForExitAsync only abandons the wait; without an explicit kill the
            // child keeps running (and holding its input/output files) after the turn is gone.
            TryKill(process);
            throw;
        }
    }

    /// <summary>Terminates the process tree if it is still running; failures to do so are ignored.</summary>
    private static void TryKill(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (Exception ex) when (ex is InvalidOperationException or NotSupportedException or System.ComponentModel.Win32Exception)
        {
            // The process exited in the meantime or cannot be terminated; the cancellation is rethrown by the caller either way.
        }
    }
}

View File

@@ -0,0 +1,8 @@
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Abstraction over launching an external executable and collecting its output.
/// </summary>
public interface IExternalProcessRunner
{
/// <summary>
/// Runs <paramref name="fileName"/> with <paramref name="arguments"/> and returns the process result.
/// </summary>
/// <param name="fileName">Executable to start.</param>
/// <param name="arguments">Arguments for the executable, one entry per argument.</param>
/// <param name="cancellationToken">Token that cancels the operation.</param>
Task<ExternalProcessResult> RunAsync(string fileName, IReadOnlyList<string> arguments, CancellationToken cancellationToken = default);
}
/// <summary>Outcome of an external process run.</summary>
/// <param name="ExitCode">Exit code reported by the process.</param>
/// <param name="StdOut">Everything the process wrote to standard output.</param>
/// <param name="StdErr">Everything the process wrote to standard error.</param>
public sealed record ExternalProcessResult(int ExitCode, string StdOut, string StdErr);

View File

@@ -0,0 +1,153 @@
using System.Text.Json;
using Jibo.Runtime.Abstractions;
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Speech-to-text strategy that writes a turn's buffered Ogg frames to disk, converts them to
/// 16 kHz mono WAV with ffmpeg, and transcribes the WAV with the whisper.cpp CLI.
/// </summary>
public sealed class LocalWhisperCppBufferedAudioSttStrategy(
    BufferedAudioSttOptions options,
    IExternalProcessRunner processRunner) : ISttStrategy
{
    /// <summary>Provider name reported on every result produced by this strategy.</summary>
    public string Name => "local-whispercpp-buffered-audio";

    /// <summary>
    /// Returns <c>true</c> only when the local path is enabled, all three tool/model paths are
    /// configured, and the turn carries at least one buffered audio frame.
    /// </summary>
    public bool CanHandle(TurnContext turn)
    {
        return options.EnableLocalWhisperCpp &&
               !string.IsNullOrWhiteSpace(options.FfmpegPath) &&
               !string.IsNullOrWhiteSpace(options.WhisperCliPath) &&
               !string.IsNullOrWhiteSpace(options.WhisperModelPath) &&
               ReadBufferedAudioFrames(turn).Count > 0;
    }

    /// <summary>
    /// Transcribes the buffered audio attached to <paramref name="turn"/>.
    /// </summary>
    /// <param name="turn">Turn whose "bufferedAudioFrames" attribute holds the audio frames.</param>
    /// <param name="cancellationToken">Cancels file writes and the external tool runs.</param>
    /// <returns>The transcript plus metadata describing the audio and tools used.</returns>
    /// <exception cref="InvalidOperationException">
    /// The turn has no buffered frames, or whisper.cpp produced no transcript.
    /// Exceptions from the process runner are not caught here.
    /// </exception>
    public async Task<SttResult> TranscribeAsync(TurnContext turn, CancellationToken cancellationToken = default)
    {
        var frames = ReadBufferedAudioFrames(turn);
        if (frames.Count == 0)
        {
            throw new InvalidOperationException("Local whisper.cpp STT requires buffered websocket audio frames.");
        }

        var tempDirectory = options.TempDirectory;
        if (string.IsNullOrWhiteSpace(tempDirectory))
        {
            tempDirectory = Path.Combine(Path.GetTempPath(), "openjibo-stt");
        }

        Directory.CreateDirectory(tempDirectory);

        // The turn id alone is not a safe file name: it may be empty, repeat across retries or
        // concurrent sessions, or contain path separators. Sanitize it and add a unique suffix so
        // two transcriptions never share (or delete) each other's intermediate files.
        var turnLabel = SanitizeFileNamePart($"{turn.TurnId}");
        var baseName = $"turn-{turnLabel}-{Guid.NewGuid():N}";
        var oggPath = Path.Combine(tempDirectory, $"{baseName}.ogg");
        var wavPath = Path.Combine(tempDirectory, $"{baseName}.wav");

        try
        {
            await File.WriteAllBytesAsync(oggPath, OggOpusAudioNormalizer.Normalize(frames), cancellationToken);

            await processRunner.RunAsync(
                options.FfmpegPath!,
                ["-y", "-i", oggPath, "-ar", "16000", "-ac", "1", "-f", "wav", wavPath],
                cancellationToken);

            var whisperResult = await processRunner.RunAsync(
                options.WhisperCliPath!,
                ["-m", options.WhisperModelPath!, "-f", wavPath, "-l", options.WhisperLanguage],
                cancellationToken);

            var transcript = ExtractTranscript(whisperResult.StdOut);
            if (string.IsNullOrWhiteSpace(transcript))
            {
                throw new InvalidOperationException("whisper.cpp returned no transcript for the buffered audio turn.");
            }

            return new SttResult
            {
                Text = transcript,
                Provider = Name,
                Locale = turn.Locale,
                Metadata = new Dictionary<string, object?>(StringComparer.OrdinalIgnoreCase)
                {
                    ["bufferedAudioBytes"] = ReadBufferedAudioBytes(turn),
                    ["bufferedAudioChunks"] = frames.Count,
                    ["ffmpegPath"] = options.FfmpegPath,
                    ["whisperCliPath"] = options.WhisperCliPath,
                    // Diagnostic only: the file at this path is deleted before the result is returned.
                    ["wavPath"] = wavPath
                }
            };
        }
        finally
        {
            TryDelete(oggPath);
            TryDelete(wavPath);
        }
    }

    /// <summary>
    /// Reads the "bufferedAudioFrames" attribute in any of the shapes it may arrive in.
    /// Returns an empty list when the attribute is missing, null or of an unsupported type.
    /// </summary>
    private static IReadOnlyList<byte[]> ReadBufferedAudioFrames(TurnContext turn)
    {
        if (!turn.Attributes.TryGetValue("bufferedAudioFrames", out var value) || value is null)
        {
            return [];
        }

        return value switch
        {
            byte[][] jagged => jagged,
            IReadOnlyList<byte[]> typed => typed,
            IEnumerable<byte[]> enumerable => enumerable.ToArray(),
            JsonElement { ValueKind: JsonValueKind.Array } json => json.EnumerateArray()
                .Select(DecodeJsonFrame)
                .Where(static frame => frame is not null)
                .Select(static frame => frame!)
                .ToArray(),
            _ => []
        };
    }

    /// <summary>
    /// Decodes one JSON frame. Accepts an array of numbers as well as a Base64 string, which is
    /// how System.Text.Json writes a <c>byte[]</c>; any other element yields <c>null</c>.
    /// </summary>
    private static byte[]? DecodeJsonFrame(JsonElement item)
    {
        return item.ValueKind switch
        {
            JsonValueKind.Array => item.EnumerateArray().Select(static b => (byte)b.GetInt32()).ToArray(),
            JsonValueKind.String when item.TryGetBytesFromBase64(out var decoded) => decoded,
            _ => null
        };
    }

    /// <summary>
    /// Reads the "bufferedAudioBytes" attribute as an int; returns 0 when it is absent or not numeric.
    /// </summary>
    private static int ReadBufferedAudioBytes(TurnContext turn)
    {
        if (!turn.Attributes.TryGetValue("bufferedAudioBytes", out var bufferedAudioBytes) || bufferedAudioBytes is null)
        {
            return 0;
        }

        return bufferedAudioBytes switch
        {
            int value => value,
            // Clamp instead of letting an out-of-range long wrap around silently.
            long value => (int)Math.Clamp(value, int.MinValue, int.MaxValue),
            string value when int.TryParse(
                value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var parsed) => parsed,
            _ => 0
        };
    }

    /// <summary>
    /// Pulls the spoken text out of the CLI's standard output. Lines shaped like
    /// "[start --> end] text" contribute only their text; when no such line exists,
    /// all non-empty lines are joined instead.
    /// </summary>
    private static string ExtractTranscript(string standardOutput)
    {
        var lines = standardOutput
            .Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

        var timecoded = lines
            .Where(static line => line.StartsWith("[", StringComparison.Ordinal) && line.Contains("-->", StringComparison.Ordinal))
            .Select(static line =>
            {
                var closingBracket = line.IndexOf(']');
                return closingBracket >= 0 ? line[(closingBracket + 1)..].Trim() : line.Trim();
            })
            .Where(static line => !string.IsNullOrWhiteSpace(line))
            .ToArray();

        if (timecoded.Length > 0)
        {
            return string.Join(" ", timecoded).Trim();
        }

        return string.Join(" ", lines).Trim();
    }

    /// <summary>Replaces characters that are invalid in a file name; an empty input becomes "unknown".</summary>
    private static string SanitizeFileNamePart(string value)
    {
        var invalid = Path.GetInvalidFileNameChars();
        var cleaned = string.Concat(value.Select(ch => Array.IndexOf(invalid, ch) >= 0 ? '_' : ch));
        return cleaned.Length == 0 ? "unknown" : cleaned;
    }

    /// <summary>Deletes a temp file if it exists; I/O and access failures are ignored.</summary>
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            // Best-effort cleanup only: a leftover temp file must not mask the transcription outcome.
        }
    }
}

View File

@@ -0,0 +1,114 @@
using System.Buffers.Binary;
using System.Text;
namespace Jibo.Cloud.Infrastructure.Audio;
/// <summary>
/// Rewrites a sequence of buffered Ogg pages so that they form one self-consistent stream:
/// granule positions are rebased, pages are renumbered from zero, the end-of-stream flag is set
/// only on the last page, and every page checksum is recomputed.
/// </summary>
internal static class OggOpusAudioNormalizer
{
    // Byte offsets inside an Ogg page header (see RFC 3533).
    private const int PageHeaderLength = 27;
    private const int HeaderTypeOffset = 5;
    private const int GranuleOffset = 6;
    private const int SequenceOffset = 18;
    private const int ChecksumOffset = 22;
    private const int SegmentCountOffset = 26;
    private const byte EndOfStreamFlag = 0x04;

    private static readonly uint[] CrcTable = BuildCrcTable();

    /// <summary>
    /// Normalizes and concatenates the given pages. The input arrays are not modified.
    /// </summary>
    /// <param name="pages">One buffer per page, each starting with the "OggS" capture pattern.</param>
    /// <returns>The concatenated, rewritten pages; an empty array when <paramref name="pages"/> is empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pages"/> is null.</exception>
    /// <exception cref="InvalidOperationException">A buffer is null, too short, truncated, or does not start with "OggS".</exception>
    public static byte[] Normalize(IReadOnlyList<byte[]> pages)
    {
        ArgumentNullException.ThrowIfNull(pages);

        if (pages.Count == 0)
        {
            return [];
        }

        // Validate everything up front so a bad page fails before any output is produced.
        var parsed = new ParsedOggPage[pages.Count];
        var totalLength = 0;
        for (var index = 0; index < parsed.Length; index += 1)
        {
            parsed[index] = ParsePage(pages[index]);
            totalLength += parsed[index].Buffer.Length;
        }

        // Granules are rebased on the second page when there is one, otherwise on the only page.
        var baseGranule = parsed.Length > 1 ? parsed[1].GranulePosition : parsed[0].GranulePosition;

        var normalized = new byte[totalLength];
        var offset = 0;
        for (var index = 0; index < parsed.Length; index += 1)
        {
            var page = parsed[index];
            var output = normalized.AsSpan(offset, page.Buffer.Length);
            page.Buffer.AsSpan().CopyTo(output);

            // The first page, and any page whose granule lies before the base, is pinned to zero.
            var newGranule = index >= 1 && page.GranulePosition >= baseGranule
                ? page.GranulePosition - baseGranule
                : 0UL;
            BinaryPrimitives.WriteUInt64LittleEndian(output.Slice(GranuleOffset, 8), newGranule);
            BinaryPrimitives.WriteUInt32LittleEndian(output.Slice(SequenceOffset, 4), (uint)index);

            var headerType = output[HeaderTypeOffset];
            output[HeaderTypeOffset] = index == parsed.Length - 1
                ? (byte)(headerType | EndOfStreamFlag)
                : (byte)(headerType & ~EndOfStreamFlag);

            // The checksum covers exactly one page with its checksum field zeroed. Any bytes the
            // buffer carries beyond the declared page length are copied through but must not be
            // folded into this page's checksum, or the page would fail validation.
            var checksumField = output.Slice(ChecksumOffset, 4);
            checksumField.Clear();
            BinaryPrimitives.WriteUInt32LittleEndian(checksumField, ComputeCrc(output[..page.PageLength]));

            offset += page.Buffer.Length;
        }

        return normalized;
    }

    /// <summary>
    /// Validates a single page buffer and extracts its granule position and declared length.
    /// </summary>
    private static ParsedOggPage ParsePage(byte[] buffer)
    {
        if (buffer is null)
        {
            throw new InvalidOperationException("Buffered audio frame was null.");
        }

        if (buffer.Length < PageHeaderLength)
        {
            throw new InvalidOperationException($"Buffered Ogg page is too short ({buffer.Length} bytes).");
        }

        if (!buffer.AsSpan(0, 4).SequenceEqual("OggS"u8))
        {
            throw new InvalidOperationException("Buffered audio frame did not begin with an OggS capture pattern.");
        }

        var pageSegments = buffer[SegmentCountOffset];
        if (buffer.Length < PageHeaderLength + pageSegments)
        {
            throw new InvalidOperationException("Buffered Ogg page segment table was truncated.");
        }

        // Each segment-table entry is the byte length of one payload segment.
        var payloadLength = 0;
        for (var index = 0; index < pageSegments; index += 1)
        {
            payloadLength += buffer[PageHeaderLength + index];
        }

        var expectedLength = PageHeaderLength + pageSegments + payloadLength;
        if (buffer.Length < expectedLength)
        {
            throw new InvalidOperationException("Buffered Ogg page payload was truncated.");
        }

        var granule = BinaryPrimitives.ReadUInt64LittleEndian(buffer.AsSpan(GranuleOffset, 8));
        return new ParsedOggPage(buffer, granule, expectedLength);
    }

    /// <summary>Table-driven CRC-32 (polynomial 0x04c11db7, MSB first, zero initial value, no final XOR).</summary>
    private static uint ComputeCrc(ReadOnlySpan<byte> buffer)
    {
        uint crc = 0;
        foreach (var value in buffer)
        {
            crc = (crc << 8) ^ CrcTable[((crc >> 24) ^ value) & 0xff];
        }

        return crc;
    }

    /// <summary>Builds the 256-entry lookup table used by <see cref="ComputeCrc"/>.</summary>
    private static uint[] BuildCrcTable()
    {
        var table = new uint[256];
        for (uint index = 0; index < table.Length; index += 1)
        {
            var remainder = index << 24;
            for (var bit = 0; bit < 8; bit += 1)
            {
                remainder = (remainder & 0x80000000) != 0
                    ? (remainder << 1) ^ 0x04c11db7
                    : remainder << 1;
            }

            table[index] = remainder;
        }

        return table;
    }

    /// <summary>A validated input page: its buffer, original granule position and declared page length.</summary>
    private sealed record ParsedOggPage(byte[] Buffer, ulong GranulePosition, int PageLength);
}

View File

@@ -1,5 +1,6 @@
using Jibo.Cloud.Application.Abstractions;
using Jibo.Cloud.Application.Services;
using Jibo.Cloud.Infrastructure.Audio;
using Jibo.Cloud.Infrastructure.Persistence;
using Jibo.Cloud.Infrastructure.Telemetry;
using Jibo.Runtime.Abstractions;
@@ -12,14 +13,19 @@ public static class ServiceCollectionExtensions
{
public static IServiceCollection AddOpenJiboCloud(this IServiceCollection services, IConfiguration? configuration = null)
{
var sttOptions = new BufferedAudioSttOptions();
if (configuration is not null)
{
services.Configure<WebSocketTelemetryOptions>(configuration.GetSection("OpenJibo:Telemetry"));
services.Configure<ProtocolTelemetryOptions>(configuration.GetSection("OpenJibo:ProtocolTelemetry"));
configuration.GetSection("OpenJibo:Stt").Bind(sttOptions);
}
services.AddSingleton(sttOptions);
services.AddSingleton<ICloudStateStore, InMemoryCloudStateStore>();
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
services.AddSingleton<IExternalProcessRunner, ExternalProcessRunner>();
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();
services.AddSingleton<ISttStrategy, SyntheticBufferedAudioSttStrategy>();
services.AddSingleton<ISttStrategySelector, DefaultSttStrategySelector>();
services.AddSingleton<IWebSocketTelemetrySink, FileWebSocketTelemetrySink>();