open jibo architecture narrowing and streamlining

This commit is contained in:
Jacob Dubin
2026-04-17 17:49:43 -05:00
parent fe1e11653f
commit b030d6faeb
13 changed files with 511 additions and 74 deletions

View File

@@ -120,6 +120,13 @@ That enables two distinct STT paths:
The local tool path is intentionally off by default. It exists to help map real robot audio behavior while the stable hosted cloud remains the primary goal.
For local Ubuntu testing, however, the checked-in API host config overrides that default and enables the local path, using the current Node-aligned tool locations:
- `/usr/bin/ffmpeg`
- `/usr/bin/whisper.cpp/build/bin/whisper-cli`
- `/usr/bin/whisper.cpp/models/ggml-base.en.bin`
- temp audio under `/tmp/openjibo-stt`
Configuration lives under `OpenJibo:Stt`:
- `EnableLocalWhisperCpp`
@@ -130,3 +137,13 @@ Configuration lives under `OpenJibo:Stt`:
- `TempDirectory`
This is not yet a claim of production-ready onboard ASR. It is a `.NET` discovery seam that keeps us compatible with the Node oracle while we evaluate longer-term options such as Azure-hosted STT or a managed decode/transcribe stack.
## Current Interaction Paths
The working cloud model currently looks like three main paths:
1. Jibo reports what already happened locally and the cloud tracks or lightly completes the turn.
2. Jibo reports what happened locally and the cloud responds with a different synthetic completion path.
3. Jibo streams raw audio and the cloud interprets the turn before sending ESML back.
That framing matches the repo evidence so far and is a good operating model for current discovery. There may still be smaller side paths around proactive traffic, direct skill-to-service communication, or future on-robot extensions, but those are not the main cloud revive loop yet.

View File

@@ -8,6 +8,14 @@
"ProtocolTelemetry": {
"Enabled": true,
"DirectoryPath": "captures/http"
},
"Stt": {
"EnableLocalWhisperCpp": true,
"FfmpegPath": "/usr/bin/ffmpeg",
"WhisperCliPath": "/usr/bin/whisper.cpp/build/bin/whisper-cli",
"WhisperModelPath": "/usr/bin/whisper.cpp/models/ggml-base.en.bin",
"WhisperLanguage": "en",
"TempDirectory": "/tmp/openjibo-stt"
}
}
}

View File

@@ -0,0 +1,21 @@
namespace Jibo.Cloud.Application.Abstractions;
/// <summary>
/// Source of the <see cref="JiboExperienceCatalog"/> content (jokes, dance animations, canned replies).
/// </summary>
public interface IJiboExperienceContentRepository
{
/// <summary>
/// Asynchronously returns the experience catalog.
/// </summary>
/// <param name="cancellationToken">Token the implementation may observe to abandon the load.</param>
/// <returns>A task that completes with the catalog.</returns>
Task<JiboExperienceCatalog> GetCatalogAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Init-only bag of text content, grouped by the intent it serves.
/// Every list defaults to empty, so a producer only has to set the lists it actually supplies.
/// </summary>
public sealed class JiboExperienceCatalog
{
/// <summary>Joke texts; one is chosen as the spoken reply for the "joke" intent.</summary>
public IReadOnlyList<string> Jokes { get; init; } = [];
/// <summary>Animation names; one is inserted into the <c>anim</c> filter of the dance ESML.</summary>
public IReadOnlyList<string> DanceAnimations { get; init; } = [];
/// <summary>Replies for the "hello" intent.</summary>
public IReadOnlyList<string> GreetingReplies { get; init; } = [];
/// <summary>Replies for the "how_are_you" intent.</summary>
public IReadOnlyList<string> HowAreYouReplies { get; init; } = [];
/// <summary>Replies for the "surprise" intent.</summary>
public IReadOnlyList<string> SurpriseReplies { get; init; } = [];
/// <summary>Replies for the "personal_report" intent.</summary>
public IReadOnlyList<string> PersonalReportReplies { get; init; } = [];
/// <summary>Replies for the "weather" intent.</summary>
public IReadOnlyList<string> WeatherReplies { get; init; } = [];
/// <summary>Replies for the "calendar" intent.</summary>
public IReadOnlyList<string> CalendarReplies { get; init; } = [];
/// <summary>Replies for the "commute" intent.</summary>
public IReadOnlyList<string> CommuteReplies { get; init; } = [];
/// <summary>Replies for the "news" intent.</summary>
public IReadOnlyList<string> NewsReplies { get; init; } = [];
/// <summary>
/// Reply templates for the generic "chat" fallback. The literal token <c>{transcript}</c> in a template
/// is replaced with the user's transcript before the reply is spoken.
/// </summary>
public IReadOnlyList<string> GenericFallbackReplies { get; init; } = [];
}

View File

@@ -2,36 +2,17 @@ using Jibo.Runtime.Abstractions;
namespace Jibo.Cloud.Application.Services;
public sealed class DemoConversationBroker : IConversationBroker
public sealed class DemoConversationBroker(JiboInteractionService interactionService) : IConversationBroker
{
public Task<ResponsePlan> HandleTurnAsync(TurnContext turn, CancellationToken cancellationToken = default)
public async Task<ResponsePlan> HandleTurnAsync(TurnContext turn, CancellationToken cancellationToken = default)
{
var transcript = (turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty).Trim();
var lowered = transcript.ToLowerInvariant();
var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent)
? rawClientIntent?.ToString()
: null;
var semanticIntent = ResolveSemanticIntent(lowered, clientIntent);
var reply = semanticIntent switch
{
"time" => $"It is {DateTime.Now:hh:mm tt}.",
"date" => $"Today is {DateTime.Now:dddd, MMMM d}.",
"dance" => "Okay. Watch this.",
_ => transcript.Length == 0
? "I am listening."
: lowered.Contains("hello") || lowered.Contains("hi")
? "Hello from the OpenJibo cloud."
: lowered.Contains("joke")
? "Why did the robot bring a ladder? Because it wanted to reach the cloud."
: $"I heard: {transcript}"
};
var decision = await interactionService.BuildDecisionAsync(turn, cancellationToken);
var plan = new ResponsePlan
{
SessionId = turn.SessionId,
Status = ResponseStatus.Succeeded,
IntentName = semanticIntent,
IntentName = decision.IntentName,
Topic = "conversation",
DeviceId = turn.DeviceId,
TargetHost = turn.HostName,
@@ -41,7 +22,7 @@ public sealed class DemoConversationBroker : IConversationBroker
new SpeakAction
{
Sequence = 0,
Text = reply,
Text = decision.ReplyText,
Voice = "griffin"
},
new ListenAction
@@ -65,54 +46,16 @@ public sealed class DemoConversationBroker : IConversationBroker
}
};
if (string.Equals(plan.IntentName, "joke", StringComparison.OrdinalIgnoreCase))
if (!string.IsNullOrWhiteSpace(decision.SkillName))
{
plan.Actions.Add(new InvokeNativeSkillAction
{
Sequence = 2,
SkillName = "@be/joke",
Payload = new Dictionary<string, object?>
{
["replyType"] = "joke"
}
SkillName = decision.SkillName,
Payload = decision.SkillPayload ?? new Dictionary<string, object?>()
});
}
return Task.FromResult(plan);
}
private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent)
{
if (string.Equals(clientIntent, "askForTime", StringComparison.OrdinalIgnoreCase))
{
return "time";
}
if (string.Equals(clientIntent, "askForDate", StringComparison.OrdinalIgnoreCase))
{
return "date";
}
if (loweredTranscript.Contains("joke", StringComparison.Ordinal))
{
return "joke";
}
if (loweredTranscript.Contains("dance", StringComparison.Ordinal))
{
return "dance";
}
if (loweredTranscript.Contains("time", StringComparison.Ordinal))
{
return "time";
}
if (loweredTranscript.Contains("date", StringComparison.Ordinal) || loweredTranscript.Contains("day", StringComparison.Ordinal))
{
return "date";
}
return "chat";
return plan;
}
}

View File

@@ -0,0 +1,19 @@
namespace Jibo.Cloud.Application.Services;
/// <summary>
/// Abstraction over "pick one item from a list", so the selection strategy can be swapped out.
/// </summary>
public interface IJiboRandomizer
{
/// <summary>
/// Returns one element of <paramref name="items"/>.
/// </summary>
/// <typeparam name="T">Element type of the list.</typeparam>
/// <param name="items">Candidates to choose from.</param>
/// <returns>The chosen element.</returns>
T Choose<T>(IReadOnlyList<T> items);
}
/// <summary>
/// <see cref="IJiboRandomizer"/> backed by <see cref="Random.Shared"/>.
/// </summary>
public sealed class DefaultJiboRandomizer : IJiboRandomizer
{
    /// <summary>
    /// Picks one element of <paramref name="items"/> at a random index.
    /// </summary>
    /// <typeparam name="T">Element type of the list.</typeparam>
    /// <param name="items">Candidates to choose from; must contain at least one element.</param>
    /// <returns>The element at the randomly selected index.</returns>
    /// <exception cref="InvalidOperationException"><paramref name="items"/> is empty.</exception>
    public T Choose<T>(IReadOnlyList<T> items) =>
        items.Count != 0
            ? items[Random.Shared.Next(items.Count)]
            : throw new InvalidOperationException("Cannot choose from an empty list.");
}

View File

@@ -0,0 +1,28 @@
using Jibo.Cloud.Application.Abstractions;
namespace Jibo.Cloud.Application.Services;
/// <summary>
/// Loads the experience catalog from the repository on first use and hands back the same instance afterwards.
/// </summary>
public sealed class JiboExperienceContentCache(IJiboExperienceContentRepository repository)
{
    // Async-compatible mutex (one permit) guarding the first load.
    private readonly SemaphoreSlim _loadLock = new(1, 1);
    private JiboExperienceCatalog? _loaded;

    /// <summary>
    /// Returns the cached catalog, fetching it from the repository if it has not been loaded yet.
    /// </summary>
    /// <param name="cancellationToken">Observed while waiting for the lock and passed on to the repository.</param>
    /// <returns>The cached catalog instance.</returns>
    public async Task<JiboExperienceCatalog> GetCatalogAsync(CancellationToken cancellationToken = default)
    {
        // Fast path: already loaded, no need to take the lock.
        if (_loaded is { } ready)
        {
            return ready;
        }

        await _loadLock.WaitAsync(cancellationToken);
        try
        {
            // Re-check under the lock: another caller may have finished the load while we were waiting.
            if (_loaded is null)
            {
                _loaded = await repository.GetCatalogAsync(cancellationToken);
            }

            return _loaded;
        }
        finally
        {
            _loadLock.Release();
        }
    }
}

View File

@@ -0,0 +1,175 @@
using Jibo.Cloud.Application.Abstractions;
using Jibo.Runtime.Abstractions;
namespace Jibo.Cloud.Application.Services;
/// <summary>
/// Turns an incoming turn into a <see cref="JiboInteractionDecision"/>: resolves a coarse semantic intent from
/// the client-reported intent and the transcript, then selects reply content from the cached experience catalog.
/// </summary>
public sealed class JiboInteractionService(
    JiboExperienceContentCache contentCache,
    IJiboRandomizer randomizer)
{
    private const string TranscriptPlaceholder = "{transcript}";

    // Built-in content used only when the catalog supplies an empty list, so that a partially
    // populated catalog degrades to a sensible reply instead of failing the whole turn.
    private const string FallbackDanceAnimation = "rom-upbeat";
    private const string FallbackJoke = "Why did the robot bring a ladder? Because it wanted to reach the cloud.";
    private const string FallbackReplyTemplate = "I heard: {transcript}";

    // Topic keywords matched as plain substrings of the lower-cased transcript.
    // Order matters: the first keyword found wins.
    private static readonly (string Keyword, string Intent)[] TopicKeywords =
    [
        ("joke", "joke"),
        ("dance", "dance"),
        ("surprise", "surprise"),
        ("personal report", "personal_report"),
        ("weather", "weather"),
        ("calendar", "calendar"),
        ("commute", "commute"),
        ("news", "news")
    ];

    /// <summary>
    /// Builds the reply decision for a single turn.
    /// </summary>
    /// <param name="turn">
    /// The turn to answer. The normalized transcript is preferred over the raw one, and the
    /// <c>clientIntent</c> attribute (if present) takes priority over transcript keywords for time/date.
    /// </param>
    /// <param name="cancellationToken">Passed on to the catalog load.</param>
    /// <returns>The intent name, reply text and, for joke/dance, the skill to invoke with its payload.</returns>
    public async Task<JiboInteractionDecision> BuildDecisionAsync(TurnContext turn, CancellationToken cancellationToken = default)
    {
        var catalog = await contentCache.GetCatalogAsync(cancellationToken);
        var transcript = (turn.NormalizedTranscript ?? turn.RawTranscript ?? string.Empty).Trim();
        var lowered = transcript.ToLowerInvariant();
        var clientIntent = turn.Attributes.TryGetValue("clientIntent", out var rawClientIntent)
            ? rawClientIntent?.ToString()
            : null;
        var semanticIntent = ResolveSemanticIntent(lowered, clientIntent);

        return semanticIntent switch
        {
            "joke" => BuildJokeDecision(catalog),
            "dance" => BuildDanceDecision(catalog),
            // NOTE(review): DateTime.Now is the server's local clock, formatted with the current culture.
            "time" => new JiboInteractionDecision("time", $"It is {DateTime.Now:hh:mm tt}."),
            "date" => new JiboInteractionDecision("date", $"Today is {DateTime.Now:dddd, MMMM d}."),
            "hello" => CatalogReply("hello", catalog.GreetingReplies),
            "how_are_you" => CatalogReply("how_are_you", catalog.HowAreYouReplies),
            "surprise" => CatalogReply("surprise", catalog.SurpriseReplies),
            "personal_report" => CatalogReply("personal_report", catalog.PersonalReportReplies),
            "weather" => CatalogReply("weather", catalog.WeatherReplies),
            "calendar" => CatalogReply("calendar", catalog.CalendarReplies),
            "commute" => CatalogReply("commute", catalog.CommuteReplies),
            "news" => CatalogReply("news", catalog.NewsReplies),
            _ => new JiboInteractionDecision("chat", BuildGenericReply(catalog, transcript, lowered))
        };

        // Keeps the resolved intent name but falls back to the generic reply text when the
        // catalog has no replies for that intent.
        JiboInteractionDecision CatalogReply(string intentName, IReadOnlyList<string> replies) =>
            new(intentName, ChooseOrNull(replies) ?? BuildGenericReply(catalog, transcript, lowered));
    }

    /// <summary>Joke reply that also invokes the <c>@be/joke</c> skill.</summary>
    private JiboInteractionDecision BuildJokeDecision(JiboExperienceCatalog catalog)
    {
        var joke = ChooseOrNull(catalog.Jokes) ?? FallbackJoke;
        return new JiboInteractionDecision(
            "joke",
            joke,
            "@be/joke",
            new Dictionary<string, object?>
            {
                ["replyType"] = "joke"
            });
    }

    /// <summary>Dance reply whose payload carries ready-made ESML with the chosen dance animation.</summary>
    private JiboInteractionDecision BuildDanceDecision(JiboExperienceCatalog catalog)
    {
        var dance = ChooseOrNull(catalog.DanceAnimations) ?? FallbackDanceAnimation;
        return new JiboInteractionDecision(
            "dance",
            "Okay. Watch this.",
            "chitchat-skill",
            new Dictionary<string, object?>
            {
                ["esml"] = $"<speak>Okay.<break size='0.2'/> Watch this.<anim cat='dance' filter='music, {dance}' /></speak>",
                ["mim_id"] = "runtime-chat",
                ["mim_type"] = "announcement"
            });
    }

    /// <summary>
    /// Reply text for turns with no specific intent: a fixed prompt for an empty transcript, fixed answers
    /// for "good morning/afternoon/night", otherwise a catalog template with the transcript substituted in.
    /// </summary>
    private string BuildGenericReply(JiboExperienceCatalog catalog, string transcript, string lowered)
    {
        if (string.IsNullOrWhiteSpace(transcript))
        {
            return "I am listening.";
        }

        if (lowered.Contains("good morning", StringComparison.Ordinal))
        {
            return "Good morning! It is nice to hear your voice.";
        }

        if (lowered.Contains("good afternoon", StringComparison.Ordinal))
        {
            return "Good afternoon. I am happy to be here.";
        }

        if (lowered.Contains("good night", StringComparison.Ordinal))
        {
            return "Good night. Sleep tight.";
        }

        var template = ChooseOrNull(catalog.GenericFallbackReplies) ?? FallbackReplyTemplate;
        return template.Replace(TranscriptPlaceholder, transcript, StringComparison.Ordinal);
    }

    /// <summary>
    /// Asks the randomizer for one item, or returns <c>null</c> when there is nothing to choose from.
    /// The randomizer is never called with an empty list.
    /// </summary>
    private string? ChooseOrNull(IReadOnlyList<string> items) =>
        items.Count > 0 ? randomizer.Choose(items) : null;

    /// <summary>
    /// Resolves the intent name. Priority: client-reported time/date intent, topic keywords,
    /// "how are you" phrases, greetings, then time/date keywords; anything else is "chat".
    /// </summary>
    /// <param name="loweredTranscript">Transcript already lower-cased by the caller.</param>
    /// <param name="clientIntent">Intent reported by the client, if any.</param>
    private static string ResolveSemanticIntent(string loweredTranscript, string? clientIntent)
    {
        if (string.Equals(clientIntent, "askForTime", StringComparison.OrdinalIgnoreCase))
        {
            return "time";
        }

        if (string.Equals(clientIntent, "askForDate", StringComparison.OrdinalIgnoreCase))
        {
            return "date";
        }

        foreach (var (keyword, intent) in TopicKeywords)
        {
            if (loweredTranscript.Contains(keyword, StringComparison.Ordinal))
            {
                return intent;
            }
        }

        if (loweredTranscript.Contains("how are you", StringComparison.Ordinal) ||
            loweredTranscript.Contains("what's up", StringComparison.Ordinal) ||
            loweredTranscript.Contains("what s up", StringComparison.Ordinal))
        {
            return "how_are_you";
        }

        // "hi" and "hey" must match as whole words: as raw substrings they also hit "this", "which",
        // "anything", "they", ... and, because this check runs before the time/date keywords, would turn
        // e.g. "what time is it this evening" into a greeting.
        if (loweredTranscript.Contains("hello", StringComparison.Ordinal) ||
            ContainsWord(loweredTranscript, "hi") ||
            ContainsWord(loweredTranscript, "hey"))
        {
            return "hello";
        }

        if (loweredTranscript.Contains("time", StringComparison.Ordinal))
        {
            return "time";
        }

        if (loweredTranscript.Contains("date", StringComparison.Ordinal) ||
            loweredTranscript.Contains("day", StringComparison.Ordinal))
        {
            return "date";
        }

        return "chat";
    }

    /// <summary>
    /// True when <paramref name="word"/> occurs in <paramref name="text"/> with no letter or digit
    /// directly before or after it. <paramref name="word"/> must be non-empty.
    /// </summary>
    private static bool ContainsWord(string text, string word)
    {
        var index = text.IndexOf(word, StringComparison.Ordinal);
        while (index >= 0)
        {
            var end = index + word.Length;
            var boundaryBefore = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
            var boundaryAfter = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (boundaryBefore && boundaryAfter)
            {
                return true;
            }

            // Keep scanning: a later occurrence may still be a standalone word.
            index = text.IndexOf(word, index + 1, StringComparison.Ordinal);
        }

        return false;
    }
}
/// <summary>
/// Outcome of interpreting one turn, as produced by <c>JiboInteractionService.BuildDecisionAsync</c>.
/// </summary>
/// <param name="IntentName">Resolved intent name (for example "joke", "time", "chat").</param>
/// <param name="ReplyText">Text to speak in response.</param>
/// <param name="SkillName">Optional skill to invoke alongside the spoken reply; <c>null</c> when no skill is involved.</param>
/// <param name="SkillPayload">Optional payload handed to the skill; <c>null</c> when there is none.</param>
public sealed record JiboInteractionDecision(
string IntentName,
string ReplyText,
string? SkillName = null,
IDictionary<string, object?>? SkillPayload = null);

View File

@@ -164,16 +164,18 @@ public sealed class ResponsePlanToSocketMessagesMapper
private static object BuildSkillPayload(ResponsePlan plan, TurnContext turn, string transId, SpeakAction speak, InvokeNativeSkillAction? skill)
{
var skillPayload = skill?.Payload;
var isJoke = string.Equals(plan.IntentName, "joke", StringComparison.OrdinalIgnoreCase) ||
string.Equals(skill?.SkillName, "@be/joke", StringComparison.OrdinalIgnoreCase);
var isDance = string.Equals(plan.IntentName, "dance", StringComparison.OrdinalIgnoreCase);
var skillId = isJoke ? "@be/joke" : skill?.SkillName ?? "chitchat-skill";
var esml = isDance
var skillId = ReadPayloadString(skillPayload, "skillId") ?? (isJoke ? "@be/joke" : skill?.SkillName ?? "chitchat-skill");
var esml = ReadPayloadString(skillPayload, "esml") ?? (isDance
? "<speak>Okay.<break size='0.2'/> Watch this.<anim cat='dance' filter='music, rom-upbeat' /></speak>"
: isJoke
? $"<speak><es cat='happy' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>"
: $"<speak><es cat='neutral' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>";
var mimId = isJoke ? "runtime-joke" : "runtime-chat";
: $"<speak><es cat='neutral' filter='!ssa-only, !sfx-only' endNeutral='true'>{EscapeXml(speak.Text)}</es></speak>");
var mimId = ReadPayloadString(skillPayload, "mim_id") ?? (isJoke ? "runtime-joke" : "runtime-chat");
var mimType = ReadPayloadString(skillPayload, "mim_type") ?? "announcement";
return new
{
@@ -204,7 +206,7 @@ public sealed class ResponsePlanToSocketMessagesMapper
prompt_id = "RUNTIME_PROMPT",
prompt_sub_category = "AN",
mim_id = mimId,
mim_type = "announcement"
mim_type = mimType
}
}
}
@@ -271,6 +273,16 @@ public sealed class ResponsePlanToSocketMessagesMapper
.Replace("'", "&apos;", StringComparison.Ordinal);
}
/// <summary>
/// Looks up <paramref name="key"/> in an optional payload and returns its value as a string.
/// </summary>
/// <param name="payload">Payload dictionary; may be <c>null</c>.</param>
/// <param name="key">Key to look up.</param>
/// <returns>
/// The value's <c>ToString()</c> result, or <c>null</c> when the payload is <c>null</c>,
/// the key is absent, or the stored value is <c>null</c>.
/// </returns>
private static string? ReadPayloadString(IDictionary<string, object?>? payload, string key) =>
    payload is not null && payload.TryGetValue(key, out var stored)
        ? stored?.ToString()
        : null;
private static string CreateHubMessageId()
{
return $"mid-{Guid.NewGuid()}";

View File

@@ -0,0 +1,81 @@
using Jibo.Cloud.Application.Abstractions;
namespace Jibo.Cloud.Infrastructure.Content;
/// <summary>
/// <see cref="IJiboExperienceContentRepository"/> that serves a fixed, compiled-in catalog.
/// </summary>
public sealed class InMemoryJiboExperienceContentRepository : IJiboExperienceContentRepository
{
    // Built once per process; every call hands back this same instance.
    private static readonly JiboExperienceCatalog SeedCatalog = new JiboExperienceCatalog
    {
        Jokes =
        [
            "Why did the robot cross the road? Because it was programmed by the chicken.",
            "Why was the robot tired when it got home? It had a hard drive.",
            "What do you call a pirate robot? Arrrr two dee two.",
            "Why did the robot go on vacation? It needed to recharge.",
            "What kind of shoes do frogs wear? Open-toed."
        ],

        DanceAnimations =
        [
            "rom-upbeat",
            "rom-ballroom",
            "rom-silly",
            "rom-slowdance",
            "rom-electronic",
            "rom-twerk"
        ],

        GreetingReplies =
        [
            "Hi there. It is really good to talk with you.",
            "Hello there. I am glad you said hi.",
            "Hey. I am happy to see you."
        ],

        HowAreYouReplies =
        [
            "I am feeling cheerful and robotic.",
            "I am doing great. Thanks for asking.",
            "I am feeling bright-eyed and ready to help."
        ],

        // The remaining topic replies acknowledge the request without fulfilling it.
        SurpriseReplies =
        [
            "I can definitely surprise you. We are still mapping that path, but I am ready for the next experiment.",
            "Surprise mode is still taking shape, but I heard you loud and clear.",
            "That sounds fun. I am not all the way there yet, but we can keep teaching me."
        ],

        PersonalReportReplies =
        [
            "I heard your personal report request. That cloud path is still being mapped.",
            "Personal report is recognized, but I am not ready to deliver the real report yet."
        ],

        WeatherReplies =
        [
            "I heard your weather request. We still need to wire the real provider behind it.",
            "Weather is on the map now, even though the real forecast path is not finished yet."
        ],

        CalendarReplies =
        [
            "I heard your calendar request. The cloud knows the phrase, but the real calendar integration is still ahead.",
            "Calendar is recognized. We still need to connect the actual service path."
        ],

        CommuteReplies =
        [
            "I heard your commute request. That one is recognized, but not fully implemented yet.",
            "Commute is on the discovery list now. The real travel answer still needs a provider."
        ],

        NewsReplies =
        [
            "I heard your news request. That path is still a future cloud integration.",
            "News is recognized, but I do not have the full news service behind it yet."
        ],

        // "{transcript}" is a placeholder that the consumer substitutes with the user's words.
        GenericFallbackReplies =
        [
            "Okay. You said, {transcript}.",
            "I heard you say, {transcript}.",
            "Thanks. I heard, {transcript}."
        ]
    };

    /// <summary>
    /// Returns the compiled-in catalog as an already-completed task.
    /// </summary>
    /// <param name="cancellationToken">Not used; the result is available synchronously.</param>
    /// <returns>A completed task wrapping the shared catalog instance.</returns>
    public Task<JiboExperienceCatalog> GetCatalogAsync(CancellationToken cancellationToken = default) =>
        Task.FromResult(SeedCatalog);
}

View File

@@ -1,6 +1,7 @@
using Jibo.Cloud.Application.Abstractions;
using Jibo.Cloud.Application.Services;
using Jibo.Cloud.Infrastructure.Audio;
using Jibo.Cloud.Infrastructure.Content;
using Jibo.Cloud.Infrastructure.Persistence;
using Jibo.Cloud.Infrastructure.Telemetry;
using Jibo.Runtime.Abstractions;
@@ -23,6 +24,10 @@ public static class ServiceCollectionExtensions
services.AddSingleton(sttOptions);
services.AddSingleton<ICloudStateStore, InMemoryCloudStateStore>();
services.AddSingleton<IJiboExperienceContentRepository, InMemoryJiboExperienceContentRepository>();
services.AddSingleton<JiboExperienceContentCache>();
services.AddSingleton<IJiboRandomizer, DefaultJiboRandomizer>();
services.AddSingleton<JiboInteractionService>();
services.AddSingleton<IConversationBroker, DemoConversationBroker>();
services.AddSingleton<IExternalProcessRunner, ExternalProcessRunner>();
services.AddSingleton<ISttStrategy, LocalWhisperCppBufferedAudioSttStrategy>();