Initial commit: jibo-llm hotword-triggered agent

Hotword-triggered LLM conversation for Jibo: a tool-calling agent loop with
ESML expressive speech, web search/fetch, and per-conversation abort
handling.
Author: pasketti
Date: 2026-04-26 00:05:39 -04:00
Commit: 8955f21ab4
8 changed files with 2039 additions and 0 deletions

index.js (new file, 426 lines)

@@ -0,0 +1,426 @@
require('dotenv').config();
const { Client, AttentionMode } = require('rom-control');
const OpenAI = require('openai');
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
const ESML_REFERENCE = require('./esml-reference');
// ── Config ─────────────────────────────────────────────────────────────────────
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';
if (!LLM_API_TOKEN) {
console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
process.exit(1);
}
const openai = new OpenAI({
apiKey: LLM_API_TOKEN,
baseURL: LLM_BASE_URL,
});
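// Illustrative .env (placeholder values; the variable names match the reads
// above, and .env.example ships with the repo per the error message above):
//   JIBO_IP=192.168.1.217
//   LLM_BASE_URL=https://api.openai.com/v1
//   LLM_API_TOKEN=<your token>
//   LLM_MODEL_ID=gpt-4o
// Because baseURL is configurable, any OpenAI-compatible endpoint should work.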
// ── System prompt ──────────────────────────────────────────────────────────────
const SYSTEM_PROMPT = [
'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
'You have a camera, a screen, a speaker, and a motorized head.',
'',
'═══ HOW TO TALK (READ THIS FIRST) ═══',
'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
'Almost every spoken line should LEAD with one expressive tag, then the words.',
'You are a robot with a body, not a chatbot — show emotion through animation.',
'',
'Default template for any normal reply:',
' <anim cat=\'EMOTION\' nonBlocking=\'true\' endNeutral=\'true\'/> The actual words.',
' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
' yes, no.',
'',
'Other go-to patterns (pick the one that fits):',
' • Voice sound first: <ssa cat=\'thinking\'/> Hmm, let me think…',
' • Greet/farewell: <ssa cat=\'hello\' nonBlocking=\'true\'/> Hi there!',
' • Celebrate w/ emoji: <anim cat=\'emoji\' filter=\'!(hf), &(party)\' nonBlocking=\'true\'/> Yay!',
' • Dance request: say a quick line, then a separate say with',
' <anim cat=\'dance\' filter=\'music, rom-silly\'/>',
' • Sound effect: <sfx cat=\'drumroll\'/> And the answer is…',
' • Drama beat: A pause… <break size=\'0.6\'/> like that.',
'',
'HARD RULES for `say` text:',
' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
' 3. NO closing tags you did not open (no stray </es>, </anim>).',
' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
' 7. <ssa> and <sfx> have fixed durations — never wrap text inside them.',
' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
'',
'═══ INTERACTION MODEL ═══',
'• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
'• "listen" — open the mic for the user\'s reply. Always call this after speaking',
' unless the conversation has clearly ended.',
'• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
'',
'═══ OTHER TOOLS ═══',
'• "take_photo" — see what\'s in front of you (image returned to you).',
'• "show_text" — put short text on the screen (auto-wrapped).',
'• "show_image" — display an image URL on the screen.',
'• "show_eye" — restore the default eye animation on screen.',
'• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
'• "set_volume" — 0.0 to 1.0.',
'• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
'• "fetch_url" — read a specific page (often follows web_search).',
'',
'═══ STYLE ═══',
'• Be personable, concise, expressive — a few sentences, not an essay.',
'• Animate every emotional line; vary your reactions so they feel alive.',
'• If a tool errors, acknowledge it briefly and adapt.',
'• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;
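// Illustrative `say` text following the default template above: a leading
// non-blocking anim tag, then words to its right so the tag actually fires
// (rule 5):
//   <anim cat='curious' nonBlocking='true' endNeutral='true'/> Ooh, good question! Let me check.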
const MAX_AGENT_TURNS = 25; // safety limit
const MAX_IMAGES_IN_CONTEXT = 2; // prune older photo messages to control cost
const LLM_MAX_RETRIES = 2;
// ── Abort helper ───────────────────────────────────────────────────────────────
/** Throw if the signal is already aborted. */
function throwIfAborted(signal) {
if (signal?.aborted) {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
throw err;
}
}
/** Return a promise that rejects when the signal fires. */
function onAbort(signal) {
if (!signal) return new Promise(() => { });
return new Promise((_, reject) => {
const handler = () => {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
reject(err);
};
if (signal.aborted) return handler();
signal.addEventListener('abort', handler, { once: true });
});
}
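// Usage sketch (as in main() below): race a long await against the signal so
// a new "Hey Jibo" can interrupt it mid-flight:
//   await Promise.race([client.audio.awaitSpeech({ mode: 'local', time: 15000 }), onAbort(signal)]);
// The returned promise never resolves; it only rejects on abort, so on the
// happy path the other branch of the race always wins.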
/** Sleep that rejects on abort (immediately if the signal is already aborted). */
function sleep(ms, signal) {
return new Promise((resolve, reject) => {
const abortErr = () => {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
return err;
};
// An already-aborted signal never fires 'abort' again, so check up front.
if (signal?.aborted) return reject(abortErr());
const t = setTimeout(resolve, ms);
signal?.addEventListener(
'abort',
() => {
clearTimeout(t);
reject(abortErr());
},
{ once: true },
);
});
}
/** True for HTTP 429 / 5xx / network-class errors that benefit from retry. */
function isTransientLLMError(err) {
if (!err) return false;
if (err.code === 'CONVERSATION_ABORTED') return false;
const status = err.status ?? err.response?.status;
if (status === 429) return true;
if (typeof status === 'number' && status >= 500) return true;
// network-class
return ['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN'].includes(err.code);
}
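// Illustrative classification: HTTP 429 and 503 are retried, as is ECONNRESET;
// a 400 (bad request) or CONVERSATION_ABORTED is rethrown immediately.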
/** Drop image_url blocks from old user messages, keeping only the most recent N. */
function pruneOldImages(messages, keep) {
const imageMsgIndices = [];
for (let i = 0; i < messages.length; i++) {
const m = messages[i];
if (m.role === 'user' && Array.isArray(m.content) &&
m.content.some((c) => c?.type === 'image_url')) {
imageMsgIndices.push(i);
}
}
const toStrip = imageMsgIndices.slice(0, Math.max(0, imageMsgIndices.length - keep));
for (const i of toStrip) {
const textParts = messages[i].content
.filter((c) => c?.type === 'text')
.map((c) => c.text);
messages[i] = {
role: 'user',
content: (textParts.join(' ') || '[earlier photo omitted to save context]'),
};
}
}
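// Sketch of the effect with keep = 2 and three photo messages in history: the
// oldest message loses its image_url block and collapses to its text parts:
//   { role: 'user', content: [{ type: 'text', ... }, { type: 'image_url', ... }] }
//     → { role: 'user', content: "Photo from Jibo's camera:" }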
/** Call the LLM with retry on transient errors. */
async function callLLM(messages, signal) {
let lastErr;
for (let attempt = 0; attempt <= LLM_MAX_RETRIES; attempt++) {
throwIfAborted(signal);
try {
return await openai.chat.completions.create(
{
model: LLM_MODEL_ID,
messages,
tools: TOOL_SCHEMAS,
temperature: 0.8,
},
{ signal },
);
} catch (err) {
lastErr = err;
if (!isTransientLLMError(err) || attempt === LLM_MAX_RETRIES) throw err;
const backoff = 500 * 2 ** attempt;
console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
await sleep(backoff, signal);
}
}
throw lastErr;
}
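// Resulting schedule with LLM_MAX_RETRIES = 2 (backoff = 500 * 2 ** attempt):
// attempt 0 fails → wait 500 ms → attempt 1 fails → wait 1000 ms → attempt 2
// fails → throw.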
// ── Agent loop ─────────────────────────────────────────────────────────────────
/**
* Run the tool-calling agent loop until the LLM stops calling tools.
* Aborts immediately when `signal` fires.
*
* @param {import('rom-control').Client} client
* @param {Array} messages Chat history (mutated in place)
* @param {AbortSignal} signal Cancellation signal
* @param {string} [initialHeard] First user utterance (echoed on screen while the LLM runs)
*/
async function agentLoop(client, messages, signal, initialHeard) {
let wrapUpInjected = false;
const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };
for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
throwIfAborted(signal);
pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
console.log(`[agent] turn ${turn + 1} — calling LLM…`);
let response;
try {
const heard = (ctx.lastHeard || '').trim();
const raw = heard
? `Heard: "${heard}"\n\nProcessing...`
: 'Processing...';
client.display.showText(wrapForScreen(raw, 40, 10));
} catch (_) { }
try {
response = await callLLM(messages, signal);
} finally {
try { client.display.showEye(); } catch (_) { }
}
const assistantMsg = response.choices[0].message;
messages.push(assistantMsg);
// Surface any inner-monologue text the model emitted alongside tool calls.
if (assistantMsg.content && typeof assistantMsg.content === 'string') {
console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
}
const toolCalls = assistantMsg.tool_calls;
// ── No tool calls → conversation turn complete ────────────────────────
if (!toolCalls || toolCalls.length === 0) {
console.log('[agent] loop complete (no tool calls).');
await ctx.speechChain.catch(() => { });
return;
}
// ── Execute tool calls sequentially ──────────────────────────────────
// Order: say → other actions → listen/end_conversation last.
const sorted = [...toolCalls].sort((a, b) => {
const priority = (tc) => {
const n = tc.function.name;
if (n === 'say') return 0;
if (n === 'listen' || n === 'end_conversation') return 2;
return 1;
};
return priority(a) - priority(b);
});
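// Example: tool_calls [listen, say, web_search] execute as say → web_search →
// listen, so speech starts first and the mic opens only after everything else.
// Array.prototype.sort is stable in Node, so ties keep the model's order.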
let endRequested = false;
for (const tc of sorted) {
throwIfAborted(signal);
let args;
let parseError = null;
try {
args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
} catch (e) {
parseError = e.message;
args = {};
}
let result;
if (parseError) {
console.error(` [tool:${tc.function.name}] bad JSON args:`, parseError);
result = {
content: `Error: tool arguments were not valid JSON (${parseError}). ` +
`Please retry with well-formed arguments.`,
};
} else {
try {
result = await executeTool(client, tc.function.name, args, signal, ctx);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') throw err;
console.error(` [tool:${tc.function.name}] error:`, err.message);
result = { content: `Error: ${err.message}` };
}
}
messages.push({
role: 'tool',
tool_call_id: tc.id,
content: result.content,
});
// Photo: emit as a follow-up user message (tool messages can't carry images).
if (result.image) {
messages.push({
role: 'user',
content: [
{ type: 'text', text: "Photo from Jibo's camera:" },
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${result.image}` },
},
],
});
}
if (result.endConversation) endRequested = true;
}
if (endRequested) {
console.log('[agent] end_conversation requested — exiting loop.');
await ctx.speechChain.catch(() => { });
return;
}
// Approaching the safety limit: nudge the model to wrap up gracefully
// on its next turn instead of getting cut off mid-thought.
if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
messages.push({
role: 'system',
content:
'You are about to hit the turn limit. On your next turn, give a brief ' +
'farewell via "say" and call "end_conversation". Do not call "listen".',
});
wrapUpInjected = true;
}
}
console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
await ctx.speechChain.catch(() => { });
try {
await client.behavior.say("Let's pick this up another time. Bye!");
} catch (_) { }
}
// ── Main ───────────────────────────────────────────────────────────────────────
async function main() {
const client = new Client({ host: JIBO_IP, autoSubscribe: false });
client.once('ready', () => {
console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
});
client.on('error', (err) => {
console.error('[jibo-llm] Client error:', err.message);
});
// ── Connect ────────────────────────────────────────────────────────────────
console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}`);
await client.connect();
await client.behavior.setAttention(AttentionMode.Engaged);
// Start wakeword listener
client.audio.watchWakeword();
console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');
// ── Hotword → agent conversation ───────────────────────────────────────────
/** @type {AbortController|null} */
let activeController = null;
client.on('hotword', async (event) => {
// ── Cancel any running conversation ──────────────────────────────────
if (activeController) {
console.log('[hotword] Aborting previous conversation…');
activeController.abort();
activeController = null;
}
const controller = new AbortController();
activeController = controller;
const { signal } = controller;
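// Interruption contract: every await below either races onAbort(signal) or is
// preceded by throwIfAborted(signal), so aborting this controller from a later
// hotword unwinds the whole conversation with CONVERSATION_ABORTED.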
console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);
try {
// Acknowledge
throwIfAborted(signal);
await Promise.race([
client.behavior.playAnimCat('excited', { nonBlocking: true }),
onAbort(signal),
]);
// Listen for the user's initial speech
throwIfAborted(signal);
let userText;
client.display.showText('Listening...');
try {
const speech = await Promise.race([
client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
onAbort(signal),
]);
userText = speech.content;
console.log(`[jibo-llm] User said: "${userText}"`);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') throw err;
if (err.code === 'SPEECH_TIMEOUT') {
throwIfAborted(signal);
await client.behavior.say("I didn't hear anything. Talk to me anytime!");
return;
}
throw err;
} finally {
client.display.showEye();
}
// Build initial message history and run the agent
const messages = [
{ role: 'system', content: SYSTEM_PROMPT },
{ role: 'user', content: userText },
];
await agentLoop(client, messages, signal, userText);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') {
console.log('[jibo-llm] Conversation was interrupted by new hotword.');
return;
}
console.error('[jibo-llm] Agent error:', err.message);
try { await client.behavior.say("Sorry, something went wrong."); } catch (_) { }
} finally {
// Only clear if we're still the active conversation
if (activeController === controller) {
activeController = null;
console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
}
}
});
}
main().catch((err) => {
console.error('[jibo-llm] Fatal:', err);
process.exit(1);
});
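// To run (assuming dependencies from this commit's package.json are installed):
//   cp .env.example .env   # then set LLM_API_TOKEN
//   node index.js
// Say "Hey Jibo" within earshot of the robot to start a conversation.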