Initial commit: jibo-llm hotword-triggered agent
Hotword-triggered LLM conversation loop for Jibo with tool-calling agent loop, ESML expressive speech, web search/fetch, and per-conversation abort handling.
This commit is contained in:
426
index.js
Normal file
426
index.js
Normal file
@@ -0,0 +1,426 @@
|
||||
require('dotenv').config();
|
||||
const { Client, AttentionMode } = require('rom-control');
|
||||
const OpenAI = require('openai');
|
||||
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
|
||||
const ESML_REFERENCE = require('./esml-reference');
|
||||
|
||||
// ── Config ─────────────────────────────────────────────────────────────────────
// All values come from the environment (see .env.example); the fallbacks let
// local development work against the default robot IP and the OpenAI endpoint.
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';

// The API token has no sensible default — fail fast at startup rather than on
// the first LLM call mid-conversation.
if (!LLM_API_TOKEN) {
  console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
  process.exit(1);
}

// Single shared client for all conversations. `baseURL` lets this point at any
// OpenAI-compatible server, not just api.openai.com.
const openai = new OpenAI({
  apiKey: LLM_API_TOKEN,
  baseURL: LLM_BASE_URL,
});
|
||||
|
||||
// ── System prompt ──────────────────────────────────────────────────────────────
// The prompt is built as an array of lines joined with '\n' so individual rules
// are easy to reorder/edit in diffs. ESML_REFERENCE (imported above) is appended
// verbatim at the end so the model has the full tag reference in context.
// NOTE: every line below is runtime data sent to the LLM — edit with care.
const SYSTEM_PROMPT = [
  'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
  'You have a camera, a screen, a speaker, and a motorized head.',
  '',
  '═══ HOW TO TALK (READ THIS FIRST) ═══',
  'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
  'Almost every spoken line should LEAD with one expressive tag, then the words.',
  'You are a robot with a body, not a chatbot — show emotion through animation.',
  '',
  'Default template for any normal reply:',
  ' <anim cat=\'EMOTION\' nonBlocking=\'true\' endNeutral=\'true\'/> The actual words.',
  ' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
  ' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
  ' yes, no.',
  '',
  'Other go-to patterns (pick the one that fits):',
  ' • Voice sound first: <ssa cat=\'thinking\'/> Hmm, let me think…',
  ' • Greet/farewell: <ssa cat=\'hello\' nonBlocking=\'true\'/> Hi there!',
  ' • Celebrate w/ emoji: <anim cat=\'emoji\' filter=\'!(hf), &(party)\' nonBlocking=\'true\'/> Yay!',
  ' • Dance request: say a quick line, then a separate say with',
  ' <anim cat=\'dance\' filter=\'music, rom-silly\'/>',
  ' • Sound effect: <sfx cat=\'drumroll\'/> And the answer is…',
  ' • Drama beat: A pause… <break size=\'0.6\'/> like that.',
  '',
  'HARD RULES for `say` text:',
  ' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
  ' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
  ' 3. NO closing tags you did not open (no stray </es>, </anim>).',
  ' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
  ' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
  ' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
  ' 7. <ssa> and <sfx> have fixed durations — never wrap text inside them.',
  ' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
  '',
  '═══ INTERACTION MODEL ═══',
  '• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
  ' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
  '• "listen" — open the mic for the user\'s reply. Always call this after speaking',
  ' unless the conversation has clearly ended.',
  '• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
  '',
  '═══ OTHER TOOLS ═══',
  '• "take_photo" — see what\'s in front of you (image returned to you).',
  '• "show_text" — put short text on the screen (auto-wrapped).',
  '• "show_image" — display an image URL on the screen.',
  '• "show_eye" — restore the default eye animation on screen.',
  '• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
  '• "set_volume" — 0.0 to 1.0.',
  '• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
  '• "fetch_url" — read a specific page (often follows web_search).',
  '',
  '═══ STYLE ═══',
  '• Be personable, concise, expressive — a few sentences, not an essay.',
  '• Animate every emotional line; vary your reactions so they feel alive.',
  '• If a tool errors, acknowledge it briefly and adapt.',
  '• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;

// Hard ceiling on LLM round-trips per conversation (see agentLoop).
const MAX_AGENT_TURNS = 25; // safety limit
// Only the newest N photo messages keep their image payload in context.
const MAX_IMAGES_IN_CONTEXT = 2; // prune older photo messages to control cost
// Extra attempts (beyond the first) for transient LLM failures (see callLLM).
const LLM_MAX_RETRIES = 2;
|
||||
|
||||
// ── Abort helper ───────────────────────────────────────────────────────────────

/**
 * Synchronous cancellation check: throws a tagged error when the signal has
 * already fired, otherwise does nothing.
 *
 * @param {AbortSignal|undefined} signal Cancellation signal (may be absent).
 * @throws {Error} With `code === 'CONVERSATION_ABORTED'` when aborted.
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
|
||||
|
||||
/**
 * Promise view of a cancellation signal: rejects (code 'CONVERSATION_ABORTED')
 * as soon as the signal fires, or immediately if it already has. Designed for
 * Promise.race against robot I/O. With no signal, returns a promise that never
 * settles — harmless as a race participant.
 *
 * @param {AbortSignal|undefined} signal
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) {
    // Nothing to watch: never settles.
    return new Promise(() => { });
  }
  return new Promise((_resolve, reject) => {
    const fail = () => {
      const abortErr = new Error('Conversation aborted');
      abortErr.code = 'CONVERSATION_ABORTED';
      reject(abortErr);
    };
    if (signal.aborted) {
      fail();
    } else {
      signal.addEventListener('abort', fail, { once: true });
    }
  });
}
|
||||
|
||||
/**
 * Sleep for `ms` milliseconds, rejecting early (code 'CONVERSATION_ABORTED')
 * when `signal` aborts — including when it is ALREADY aborted at call time.
 *
 * Fixes over the naive version:
 *  1. `addEventListener('abort', …)` never fires for a signal that is already
 *     aborted, so the old sleep ran to completion in that case; we now check
 *     `signal.aborted` up front and reject immediately.
 *  2. The abort listener is removed when the timer completes, so repeated
 *     sleeps against a long-lived signal no longer accumulate listeners.
 *
 * @param {number} ms Delay in milliseconds.
 * @param {AbortSignal} [signal] Optional cancellation signal.
 * @returns {Promise<void>} Resolves after `ms`, or rejects on abort.
 */
function sleep(ms, signal) {
  return new Promise((resolve, reject) => {
    const abortError = () => {
      const err = new Error('Conversation aborted');
      err.code = 'CONVERSATION_ABORTED';
      return err;
    };

    // Already cancelled: the 'abort' event will never fire again, so the
    // listener below would be useless — reject right away.
    if (signal?.aborted) {
      reject(abortError());
      return;
    }

    const onAbortEvent = () => {
      clearTimeout(timer);
      reject(abortError());
    };

    const timer = setTimeout(() => {
      // Normal completion: drop our listener so it doesn't pile up on the
      // signal across many sleeps.
      signal?.removeEventListener('abort', onAbortEvent);
      resolve();
    }, ms);

    signal?.addEventListener('abort', onAbortEvent, { once: true });
  });
}
|
||||
|
||||
/**
 * Classify an LLM-call failure as retryable or not.
 * Retryable: HTTP 429, any HTTP 5xx, or a network-level error code.
 * Never retryable: missing errors and conversation aborts.
 *
 * @param {any} err Error thrown by the OpenAI client (or anything else).
 * @returns {boolean} True when a retry with backoff is worthwhile.
 */
function isTransientLLMError(err) {
  if (!err || err.code === 'CONVERSATION_ABORTED') return false;

  // Status may live directly on the error or on an attached response.
  const httpStatus = err.status ?? err.response?.status;
  if (httpStatus === 429) return true;
  if (typeof httpStatus === 'number' && httpStatus >= 500) return true;

  // Network-class failures (reset, timeout, DNS) also deserve a retry.
  const networkCodes = new Set(['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN']);
  return networkCodes.has(err.code);
}
|
||||
|
||||
/**
 * Shrink context cost by rewriting older photo-bearing user messages as
 * text-only stand-ins, keeping only the most recent `keep` image messages
 * intact. Mutates `messages` in place.
 *
 * @param {Array} messages Chat history (mutated in place).
 * @param {number} keep How many recent image messages to leave untouched.
 */
function pruneOldImages(messages, keep) {
  // A prunable message: user role, multipart content, at least one image part.
  const carriesImage = (msg) =>
    msg.role === 'user' &&
    Array.isArray(msg.content) &&
    msg.content.some((part) => part?.type === 'image_url');

  const imageIndices = messages.reduce((acc, msg, idx) => {
    if (carriesImage(msg)) acc.push(idx);
    return acc;
  }, []);

  const excess = imageIndices.length - keep;
  if (excess <= 0) return;

  // Strip the oldest `excess` image messages down to their text parts.
  for (const idx of imageIndices.slice(0, excess)) {
    const salvagedText = messages[idx].content
      .filter((part) => part?.type === 'text')
      .map((part) => part.text)
      .join(' ');
    messages[idx] = {
      role: 'user',
      content: salvagedText || '[earlier photo omitted to save context]',
    };
  }
}
|
||||
|
||||
/**
 * Send the chat history to the LLM, retrying transient failures
 * (429 / 5xx / network errors) with exponential backoff. Aborts propagate
 * immediately, both before each attempt and during the backoff sleep.
 *
 * @param {Array} messages Chat history to send.
 * @param {AbortSignal} signal Cancellation signal.
 * @returns {Promise<Object>} The chat-completions response.
 * @throws The last error when all attempts fail, or a non-transient error
 *         as soon as it occurs.
 */
async function callLLM(messages, signal) {
  let attempt = 0;
  let lastFailure;
  while (attempt <= LLM_MAX_RETRIES) {
    throwIfAborted(signal);
    try {
      const request = {
        model: LLM_MODEL_ID,
        messages,
        tools: TOOL_SCHEMAS,
        temperature: 0.8,
      };
      return await openai.chat.completions.create(request, { signal });
    } catch (err) {
      lastFailure = err;
      // Only transient errors with attempts remaining get another try.
      const retriable = isTransientLLMError(err) && attempt < LLM_MAX_RETRIES;
      if (!retriable) throw err;
      const backoff = 500 * 2 ** attempt;
      console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
      await sleep(backoff, signal);
    }
    attempt += 1;
  }
  throw lastFailure;
}
|
||||
|
||||
// ── Agent loop ─────────────────────────────────────────────────────────────────

/**
 * Run the tool-calling agent loop until the LLM stops calling tools.
 * Aborts immediately when `signal` fires.
 *
 * Each iteration: prune old photos from context → call the LLM → execute the
 * returned tool calls (say first, listen/end last) → append results and loop.
 * The loop exits when the model stops calling tools, requests
 * end_conversation, or MAX_AGENT_TURNS is reached.
 *
 * @param {import('rom-control').Client} client
 * @param {Array} messages Chat history (mutated in place)
 * @param {AbortSignal} signal Cancellation signal
 * @param {string} [initialHeard] The user's first utterance, shown on screen
 *        while the first LLM call is in flight.
 */
async function agentLoop(client, messages, signal, initialHeard) {
  let wrapUpInjected = false;
  // ctx is shared with executeTool (tools.js). speechChain appears to be an
  // accumulator that serializes "say" playback; lastHeard the latest user
  // utterance — NOTE(review): confirm contract against tools.js.
  const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };

  for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
    throwIfAborted(signal);
    pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
    console.log(`[agent] turn ${turn + 1} — calling LLM…`);

    let response;
    // Best-effort "Processing..." screen while the LLM call is in flight;
    // display failures must never kill the conversation.
    try {
      const heard = (ctx.lastHeard || '').trim();
      const raw = heard
        ? `Heard: "${heard}"\n\nProcessing...`
        : 'Processing...';
      client.display.showText(wrapForScreen(raw, 40, 10));
    } catch (_) { }
    try {
      response = await callLLM(messages, signal);
    } finally {
      // Always restore the eye, even when the LLM call throws/aborts.
      try { client.display.showEye(); } catch (_) { }
    }
    const assistantMsg = response.choices[0].message;
    messages.push(assistantMsg);

    // Surface any inner-monologue text the model emitted alongside tool calls.
    if (assistantMsg.content && typeof assistantMsg.content === 'string') {
      console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
    }

    const toolCalls = assistantMsg.tool_calls;

    // ── No tool calls → conversation turn complete ────────────────────────
    if (!toolCalls || toolCalls.length === 0) {
      console.log('[agent] loop complete (no tool calls).');
      // Let any queued speech finish before returning; swallow its errors.
      await ctx.speechChain.catch(() => { });
      return;
    }

    // ── Execute tool calls sequentially ──────────────────────────────────
    // Order: say → other actions → listen/end_conversation last.
    const sorted = [...toolCalls].sort((a, b) => {
      const priority = (tc) => {
        const n = tc.function.name;
        if (n === 'say') return 0;
        if (n === 'listen' || n === 'end_conversation') return 2;
        return 1;
      };
      return priority(a) - priority(b);
    });

    let endRequested = false;

    for (const tc of sorted) {
      throwIfAborted(signal);

      // The model sometimes emits malformed JSON arguments; recover by
      // reporting the parse error back to it instead of crashing.
      let args;
      let parseError = null;
      try {
        args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
      } catch (e) {
        parseError = e.message;
        args = {};
      }

      let result;
      if (parseError) {
        console.error(` [tool:${tc.function.name}] bad JSON args:`, parseError);
        result = {
          content: `Error: tool arguments were not valid JSON (${parseError}). ` +
            `Please retry with well-formed arguments.`,
        };
      } else {
        try {
          result = await executeTool(client, tc.function.name, args, signal, ctx);
        } catch (err) {
          // Aborts propagate; all other tool failures are fed back to the
          // model as an error result so it can adapt.
          if (err.code === 'CONVERSATION_ABORTED') throw err;
          console.error(` [tool:${tc.function.name}] error:`, err.message);
          result = { content: `Error: ${err.message}` };
        }
      }

      // Every tool_call_id must get a matching tool message.
      messages.push({
        role: 'tool',
        tool_call_id: tc.id,
        content: result.content,
      });

      // Photo: emit as a follow-up user message (tool messages can't carry images).
      if (result.image) {
        messages.push({
          role: 'user',
          content: [
            { type: 'text', text: "Photo from Jibo's camera:" },
            {
              type: 'image_url',
              image_url: { url: `data:image/jpeg;base64,${result.image}` },
            },
          ],
        });
      }

      if (result.endConversation) endRequested = true;
    }

    if (endRequested) {
      console.log('[agent] end_conversation requested — exiting loop.');
      await ctx.speechChain.catch(() => { });
      return;
    }

    // Approaching the safety limit: nudge the model to wrap up gracefully
    // on its next turn instead of getting cut off mid-thought.
    if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
      messages.push({
        role: 'system',
        content:
          'You are about to hit the turn limit. On your next turn, give a brief ' +
          'farewell via "say" and call "end_conversation". Do not call "listen".',
      });
      wrapUpInjected = true;
    }
  }

  // Fell out of the for-loop: the model never wrapped up. Say goodbye
  // best-effort and bail.
  console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
  await ctx.speechChain.catch(() => { });
  try {
    await client.behavior.say("Let's pick this up another time. Bye!");
  } catch (_) { }
}
|
||||
|
||||
// ── Main ───────────────────────────────────────────────────────────────────────

/**
 * Entry point: connect to the robot, start the wakeword listener, and run one
 * agent conversation per hotword event. A new hotword aborts any conversation
 * already in progress (barge-in) before starting a fresh one.
 */
async function main() {
  const client = new Client({ host: JIBO_IP, autoSubscribe: false });

  client.once('ready', () => {
    console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
  });

  client.on('error', (err) => {
    console.error('[jibo-llm] Client error:', err.message);
  });

  // ── Connect ────────────────────────────────────────────────────────────────
  console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}…`);
  await client.connect();
  await client.behavior.setAttention(AttentionMode.Engaged);

  // Start wakeword listener
  client.audio.watchWakeword();
  console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');

  // ── Hotword → agent conversation ───────────────────────────────────────────
  // At most one conversation runs at a time; this holds its abort controller.
  /** @type {AbortController|null} */
  let activeController = null;

  client.on('hotword', async (event) => {
    // ── Cancel any running conversation ──────────────────────────────────
    if (activeController) {
      console.log('[hotword] Aborting previous conversation…');
      activeController.abort();
      activeController = null;
    }

    // Keep a local reference so the finally-block can tell whether a newer
    // hotword has since taken over (activeController !== controller).
    const controller = new AbortController();
    activeController = controller;
    const { signal } = controller;

    console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);

    try {
      // Acknowledge
      throwIfAborted(signal);
      await Promise.race([
        client.behavior.playAnimCat('excited', { nonBlocking: true }),
        onAbort(signal),
      ]);

      // Listen for the user's initial speech
      throwIfAborted(signal);
      let userText;
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
          onAbort(signal),
        ]);
        userText = speech.content;
        console.log(`[jibo-llm] User said: "${userText}"`);
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // User stayed silent: apologize politely and drop back to idle.
        if (err.code === 'SPEECH_TIMEOUT') {
          throwIfAborted(signal);
          await client.behavior.say("I didn't hear anything. Talk to me anytime!");
          return;
        }
        throw err;
      } finally {
        // Restore the idle eye whether listening succeeded or not.
        client.display.showEye();
      }

      // Build initial message history and run the agent
      const messages = [
        { role: 'system', content: SYSTEM_PROMPT },
        { role: 'user', content: userText },
      ];

      await agentLoop(client, messages, signal, userText);
    } catch (err) {
      // Barge-in: a newer hotword aborted us — exit quietly.
      if (err.code === 'CONVERSATION_ABORTED') {
        console.log('[jibo-llm] Conversation was interrupted by new hotword.');
        return;
      }
      console.error('[jibo-llm] Agent error:', err.message);
      try { await client.behavior.say("Sorry, something went wrong."); } catch (_) { }
    } finally {
      // Only clear if we're still the active conversation
      if (activeController === controller) {
        activeController = null;
        console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
      }
    }
  });
}
|
||||
|
||||
// Kick off the program; any unhandled setup/connection error is fatal and
// exits with a non-zero status so supervisors can restart the process.
main().catch((err) => {
  console.error('[jibo-llm] Fatal:', err);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user