// Hotword-triggered LLM conversation loop for Jibo with a tool-calling agent
// loop, ESML expressive speech, web search/fetch, and per-conversation abort
// handling.
require('dotenv').config();
|
|
const { Client, AttentionMode } = require('rom-control');
|
|
const OpenAI = require('openai');
|
|
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
|
|
const ESML_REFERENCE = require('./esml-reference');
|
|
|
|
// ── Config ─────────────────────────────────────────────────────────────────────
|
|
// Robot address and LLM endpoint/credentials, all overridable via .env.
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
// Any OpenAI-compatible server works; defaults to the official API.
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
// Required — no default; validated immediately below.
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';

// Fail fast at startup rather than on the first LLM call.
if (!LLM_API_TOKEN) {
  console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
  process.exit(1);
}

// Shared OpenAI-compatible client used by callLLM().
const openai = new OpenAI({
  apiKey: LLM_API_TOKEN,
  baseURL: LLM_BASE_URL,
});
|
|
|
|
// ── System prompt ──────────────────────────────────────────────────────────────
|
|
// Persona + ESML speech rules + tool catalog, joined into one system message.
// The full ESML tag reference is appended at the end so the model can emit
// valid expressive markup in every `say` call.
const SYSTEM_PROMPT = [
  'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
  'You have a camera, a screen, a speaker, and a motorized head.',
  '',
  // How to phrase spoken output (ESML markup rules).
  '═══ HOW TO TALK (READ THIS FIRST) ═══',
  'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
  'Almost every spoken line should LEAD with one expressive tag, then the words.',
  'You are a robot with a body, not a chatbot — show emotion through animation.',
  '',
  'Default template for any normal reply:',
  ' <anim cat=\'EMOTION\' nonBlocking=\'true\' endNeutral=\'true\'/> The actual words.',
  ' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
  ' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
  ' yes, no.',
  '',
  'Other go-to patterns (pick the one that fits):',
  ' • Voice sound first: <ssa cat=\'thinking\'/> Hmm, let me think…',
  ' • Greet/farewell: <ssa cat=\'hello\' nonBlocking=\'true\'/> Hi there!',
  ' • Celebrate w/ emoji: <anim cat=\'emoji\' filter=\'!(hf), &(party)\' nonBlocking=\'true\'/> Yay!',
  ' • Dance request: say a quick line, then a separate say with',
  ' <anim cat=\'dance\' filter=\'music, rom-silly\'/>',
  ' • Sound effect: <sfx cat=\'drumroll\'/> And the answer is…',
  ' • Drama beat: A pause… <break size=\'0.6\'/> like that.',
  '',
  'HARD RULES for `say` text:',
  ' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
  ' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
  ' 3. NO closing tags you did not open (no stray </es>, </anim>).',
  ' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
  ' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
  ' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
  ' 7. <ssa> and <sfx> have fixed durations — never wrap text inside them.',
  ' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
  '',
  // Core turn-taking tools — the agent loop relies on these three.
  '═══ INTERACTION MODEL ═══',
  '• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
  ' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
  '• "listen" — open the mic for the user\'s reply. Always call this after speaking',
  ' unless the conversation has clearly ended.',
  '• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
  '',
  // Remaining tools (defined in ./tools TOOL_SCHEMAS).
  '═══ OTHER TOOLS ═══',
  '• "take_photo" — see what\'s in front of you (image returned to you).',
  '• "show_text" — put short text on the screen (auto-wrapped).',
  '• "show_image" — display an image URL on the screen.',
  '• "show_eye" — restore the default eye animation on screen.',
  '• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
  '• "set_volume" — 0.0 to 1.0.',
  '• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
  '• "fetch_url" — read a specific page (often follows web_search).',
  '',
  '═══ STYLE ═══',
  '• Be personable, concise, expressive — a few sentences, not an essay.',
  '• Animate every emotional line; vary your reactions so they feel alive.',
  '• If a tool errors, acknowledge it briefly and adapt.',
  '• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;
|
|
|
|
// Hard cap on LLM round-trips per conversation; agentLoop injects a wrap-up
// nudge two turns before this and force-exits when it is reached.
const MAX_AGENT_TURNS = 25; // safety limit
const MAX_IMAGES_IN_CONTEXT = 2; // prune older photo messages to control cost
// Extra attempts after the first call for 429/5xx/network errors (see callLLM).
const LLM_MAX_RETRIES = 2;
|
|
|
|
// ── Abort helper ───────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Synchronous abort check.
 * @param {AbortSignal|undefined} signal
 * @throws {Error} with code 'CONVERSATION_ABORTED' when the signal has
 *   already fired, so callers can distinguish cancellation from failures.
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
|
|
|
|
/**
 * Build a promise that rejects (code 'CONVERSATION_ABORTED') when the signal
 * fires — immediately if it already has. With no signal, returns a promise
 * that never settles, so it is inert inside Promise.race().
 * @param {AbortSignal|undefined} signal
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) return new Promise(() => { });
  return new Promise((_, reject) => {
    const fail = () => {
      const abortErr = new Error('Conversation aborted');
      abortErr.code = 'CONVERSATION_ABORTED';
      reject(abortErr);
    };
    if (signal.aborted) {
      fail();
      return;
    }
    signal.addEventListener('abort', fail, { once: true });
  });
}
|
|
|
|
/**
 * Sleep for `ms` milliseconds, rejecting early with code
 * 'CONVERSATION_ABORTED' if the signal fires — or immediately if it has
 * already fired.
 *
 * Fixes two defects in the previous version:
 *  1. An already-aborted signal never emits another 'abort' event, so the
 *     old code slept the full duration instead of rejecting at once.
 *  2. The abort listener was never detached after a normal resolve, so
 *     repeated sleeps on one long-lived signal accumulated listeners.
 *
 * @param {number} ms
 * @param {AbortSignal} [signal]
 * @returns {Promise<void>}
 */
function sleep(ms, signal) {
  return new Promise((resolve, reject) => {
    const makeAbortErr = () => {
      const err = new Error('Conversation aborted');
      err.code = 'CONVERSATION_ABORTED';
      return err;
    };
    // Already aborted: reject now — no 'abort' event will ever arrive.
    if (signal?.aborted) {
      reject(makeAbortErr());
      return;
    }
    const onAbortEvent = () => {
      clearTimeout(timer);
      reject(makeAbortErr());
    };
    const timer = setTimeout(() => {
      // Detach so long-lived signals don't accumulate stale listeners.
      signal?.removeEventListener('abort', onAbortEvent);
      resolve();
    }, ms);
    signal?.addEventListener('abort', onAbortEvent, { once: true });
  });
}
|
|
|
|
/**
 * Decide whether an LLM call failure is worth retrying.
 * Transient: HTTP 429, any 5xx, or a network-level errno.
 * Never transient: falsy errors and deliberate conversation aborts.
 * @param {any} err
 * @returns {boolean}
 */
function isTransientLLMError(err) {
  if (!err) return false;
  if (err.code === 'CONVERSATION_ABORTED') return false;
  const status = err.status ?? err.response?.status;
  const httpTransient =
    status === 429 || (typeof status === 'number' && status >= 500);
  if (httpTransient) return true;
  // network-class
  const NETWORK_ERRNOS = new Set(['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN']);
  return NETWORK_ERRNOS.has(err.code);
}
|
|
|
|
/**
 * Strip image_url blocks out of older user messages, keeping only the most
 * recent `keep` photo-bearing messages intact. Stripped messages collapse to
 * their text parts (or a placeholder) so the history stays coherent while
 * token cost stays bounded. Mutates `messages` in place.
 * @param {Array} messages Chat history
 * @param {number} keep How many recent photo messages to preserve
 */
function pruneOldImages(messages, keep) {
  // Indices of user messages whose multipart content includes an image.
  const photoIndices = messages
    .map((msg, idx) => ({ msg, idx }))
    .filter(({ msg }) =>
      msg.role === 'user' &&
      Array.isArray(msg.content) &&
      msg.content.some((part) => part?.type === 'image_url'))
    .map(({ idx }) => idx);

  const excess = photoIndices.length - keep;
  if (excess <= 0) return;

  // Oldest first: everything before the last `keep` entries gets stripped.
  for (const idx of photoIndices.slice(0, excess)) {
    const text = messages[idx].content
      .filter((part) => part?.type === 'text')
      .map((part) => part.text)
      .join(' ');
    messages[idx] = {
      role: 'user',
      content: text || '[earlier photo omitted to save context]',
    };
  }
}
|
|
|
|
/**
 * Call the chat-completions endpoint with retry on transient errors.
 * Retries 429/5xx/network failures up to LLM_MAX_RETRIES times with
 * exponential backoff; aborts and non-transient errors propagate at once.
 * @param {Array} messages Chat history to send
 * @param {AbortSignal} signal Cancellation signal (also passed to the SDK)
 * @returns {Promise<object>} The raw chat completion response
 */
async function callLLM(messages, signal) {
  let finalErr;
  let attempt = 0;
  while (attempt <= LLM_MAX_RETRIES) {
    throwIfAborted(signal);
    try {
      const request = {
        model: LLM_MODEL_ID,
        messages,
        tools: TOOL_SCHEMAS,
        temperature: 0.8,
      };
      return await openai.chat.completions.create(request, { signal });
    } catch (err) {
      finalErr = err;
      const outOfRetries = attempt === LLM_MAX_RETRIES;
      if (outOfRetries || !isTransientLLMError(err)) throw err;
      // 500ms, 1s, 2s, … — abortable backoff between attempts.
      const backoff = 500 * 2 ** attempt;
      console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
      await sleep(backoff, signal);
    }
    attempt++;
  }
  throw finalErr; // unreachable in practice; keeps the contract explicit
}
|
|
|
|
// ── Agent loop ─────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Run the tool-calling agent loop until the LLM stops calling tools.
 * Aborts immediately when `signal` fires.
 *
 * Each turn: prune stale photos from the history, show a "Processing…"
 * screen, call the LLM, then execute its tool calls in priority order
 * (say → other tools → listen/end_conversation). The loop exits when the
 * model returns no tool calls, requests end_conversation, or the
 * MAX_AGENT_TURNS safety limit is hit.
 *
 * @param {import('rom-control').Client} client
 * @param {Array} messages Chat history (mutated in place)
 * @param {AbortSignal} signal Cancellation signal
 * @param {string} [initialHeard] The user's opening utterance (shown on screen)
 */
async function agentLoop(client, messages, signal, initialHeard) {
  let wrapUpInjected = false;
  // ctx is shared with executeTool: speechChain serializes `say` playback;
  // lastHeard feeds the on-screen "Heard: …" status line.
  const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };

  for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
    throwIfAborted(signal);
    pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
    console.log(`[agent] turn ${turn + 1} — calling LLM…`);

    let response;
    // Best-effort status display; a display failure must not kill the turn.
    try {
      const heard = (ctx.lastHeard || '').trim();
      const raw = heard
        ? `Heard: "${heard}"\n\nProcessing...`
        : 'Processing...';
      client.display.showText(wrapForScreen(raw, 40, 10));
    } catch (_) { }
    try {
      response = await callLLM(messages, signal);
    } finally {
      // Always restore the eye, even when the LLM call throws or aborts.
      try { client.display.showEye(); } catch (_) { }
    }
    const assistantMsg = response.choices[0].message;
    messages.push(assistantMsg);

    // Surface any inner-monologue text the model emitted alongside tool calls.
    if (assistantMsg.content && typeof assistantMsg.content === 'string') {
      console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
    }

    const toolCalls = assistantMsg.tool_calls;

    // ── No tool calls → conversation turn complete ────────────────────────
    if (!toolCalls || toolCalls.length === 0) {
      console.log('[agent] loop complete (no tool calls).');
      // Drain queued speech before returning; playback errors are non-fatal.
      await ctx.speechChain.catch(() => { });
      return;
    }

    // ── Execute tool calls sequentially ──────────────────────────────────
    // Order: say → other actions → listen/end_conversation last.
    // (Array.prototype.sort is stable, so same-priority calls keep order.)
    const sorted = [...toolCalls].sort((a, b) => {
      const priority = (tc) => {
        const n = tc.function.name;
        if (n === 'say') return 0;
        if (n === 'listen' || n === 'end_conversation') return 2;
        return 1;
      };
      return priority(a) - priority(b);
    });

    let endRequested = false;

    for (const tc of sorted) {
      throwIfAborted(signal);

      // Models occasionally emit malformed JSON args; report it back to the
      // model as a tool error rather than crashing the conversation.
      let args;
      let parseError = null;
      try {
        args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
      } catch (e) {
        parseError = e.message;
        args = {};
      }

      let result;
      if (parseError) {
        console.error(` [tool:${tc.function.name}] bad JSON args:`, parseError);
        result = {
          content: `Error: tool arguments were not valid JSON (${parseError}). ` +
            `Please retry with well-formed arguments.`,
        };
      } else {
        try {
          result = await executeTool(client, tc.function.name, args, signal, ctx);
        } catch (err) {
          // Aborts propagate; any other tool failure is fed back to the model.
          if (err.code === 'CONVERSATION_ABORTED') throw err;
          console.error(` [tool:${tc.function.name}] error:`, err.message);
          result = { content: `Error: ${err.message}` };
        }
      }

      // Every tool_call must be answered with a matching tool message.
      messages.push({
        role: 'tool',
        tool_call_id: tc.id,
        content: result.content,
      });

      // Photo: emit as a follow-up user message (tool messages can't carry images).
      if (result.image) {
        messages.push({
          role: 'user',
          content: [
            { type: 'text', text: "Photo from Jibo's camera:" },
            {
              type: 'image_url',
              image_url: { url: `data:image/jpeg;base64,${result.image}` },
            },
          ],
        });
      }

      // Deferred until after the batch so remaining tool calls still run.
      if (result.endConversation) endRequested = true;
    }

    if (endRequested) {
      console.log('[agent] end_conversation requested — exiting loop.');
      await ctx.speechChain.catch(() => { });
      return;
    }

    // Approaching the safety limit: nudge the model to wrap up gracefully
    // on its next turn instead of getting cut off mid-thought.
    if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
      messages.push({
        role: 'system',
        content:
          'You are about to hit the turn limit. On your next turn, give a brief ' +
          'farewell via "say" and call "end_conversation". Do not call "listen".',
      });
      wrapUpInjected = true;
    }
  }

  // Safety-limit fallout: say a fixed goodbye so the robot doesn't go silent.
  console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
  await ctx.speechChain.catch(() => { });
  try {
    await client.behavior.say("Let's pick this up another time. Bye!");
  } catch (_) { }
}
|
|
|
|
// ── Main ───────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Program entry: connect to the robot, start wakeword detection, and run a
 * hotword → listen → agentLoop conversation each time "Hey Jibo" is heard.
 * A new hotword aborts any in-flight conversation via its AbortController.
 */
async function main() {
  const client = new Client({ host: JIBO_IP, autoSubscribe: false });

  client.once('ready', () => {
    console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
  });

  client.on('error', (err) => {
    console.error('[jibo-llm] Client error:', err.message);
  });

  // ── Connect ────────────────────────────────────────────────────────────────
  console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}…`);
  await client.connect();
  await client.behavior.setAttention(AttentionMode.Engaged);

  // Start wakeword listener
  client.audio.watchWakeword();
  console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');

  // ── Hotword → agent conversation ───────────────────────────────────────────
  // Controller of the conversation currently in flight, or null when idle.
  /** @type {AbortController|null} */
  let activeController = null;

  client.on('hotword', async (event) => {
    // ── Cancel any running conversation ──────────────────────────────────
    // Barge-in: a fresh "Hey Jibo" always preempts the previous conversation.
    if (activeController) {
      console.log('[hotword] Aborting previous conversation…');
      activeController.abort();
      activeController = null;
    }

    const controller = new AbortController();
    activeController = controller;
    const { signal } = controller;

    console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);

    try {
      // Acknowledge
      throwIfAborted(signal);
      // Race against abort so a new hotword cancels the acknowledgement too.
      await Promise.race([
        client.behavior.playAnimCat('excited', { nonBlocking: true }),
        onAbort(signal),
      ]);

      // Listen for the user's initial speech
      throwIfAborted(signal);
      let userText;
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
          onAbort(signal),
        ]);
        userText = speech.content;
        console.log(`[jibo-llm] User said: "${userText}"`);
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // Silence: bail out politely rather than starting an agent loop.
        if (err.code === 'SPEECH_TIMEOUT') {
          throwIfAborted(signal);
          await client.behavior.say("I didn't hear anything. Talk to me anytime!");
          return;
        }
        throw err;
      } finally {
        client.display.showEye();
      }

      // Build initial message history and run the agent
      const messages = [
        { role: 'system', content: SYSTEM_PROMPT },
        { role: 'user', content: userText },
      ];

      await agentLoop(client, messages, signal, userText);
    } catch (err) {
      // Expected when a new hotword preempted us — not an error condition.
      if (err.code === 'CONVERSATION_ABORTED') {
        console.log('[jibo-llm] Conversation was interrupted by new hotword.');
        return;
      }
      console.error('[jibo-llm] Agent error:', err.message);
      try { await client.behavior.say("Sorry, something went wrong."); } catch (_) { }
    } finally {
      // Only clear if we're still the active conversation
      // (a newer hotword may have already replaced activeController).
      if (activeController === controller) {
        activeController = null;
        console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
      }
    }
  });
}
|
|
|
|
// Kick off the program; any unrecovered setup/runtime error is fatal.
main().catch((err) => {
  console.error('[jibo-llm] Fatal:', err);
  process.exit(1);
});
|