require('dotenv').config();
const { Client, AttentionMode } = require('rom-control');
const OpenAI = require('openai');
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
const ESML_REFERENCE = require('./esml-reference');

// ── Config ─────────────────────────────────────────────────────────────────────
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';

if (!LLM_API_TOKEN) {
  console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
  process.exit(1);
}

const openai = new OpenAI({
  apiKey: LLM_API_TOKEN,
  baseURL: LLM_BASE_URL,
});
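// Example .env (illustrative values; only LLM_API_TOKEN is required, the rest
// fall back to the defaults above):
//
//   JIBO_IP=192.168.1.217
//   LLM_BASE_URL=https://api.openai.com/v1
//   LLM_API_TOKEN=sk-...
//   LLM_MODEL_ID=gpt-4o
//
// Any OpenAI-compatible endpoint should work for LLM_BASE_URL, provided it
// supports tool calling.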
// ── System prompt ───────────────────────────────────────────────────────────────
const SYSTEM_PROMPT = [
  'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
  'You have a camera, a screen, a speaker, and a motorized head.',
  '',
  '═══ HOW TO TALK (READ THIS FIRST) ═══',
  'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
  'Almost every spoken line should LEAD with one expressive tag, then the words.',
  'You are a robot with a body, not a chatbot — show emotion through animation.',
  '',
  'Default template for any normal reply:',
  ' an expressive tag for EMOTION, then the actual words,',
  ' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
  ' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
  ' yes, no.',
  '',
  'Other go-to patterns (pick the one that fits):',
  ' • Voice sound first: a thinking sound, then: Hmm, let me think…',
  ' • Greet/farewell: a greeting animation, then: Hi there!',
  ' • Celebrate w/ emoji: an emoji animation, then: Yay!',
  ' • Dance request: say a quick line, then a separate say with',
  '   only a dance animation.',
  ' • Sound effect: a sound-effect tag, then: And the answer is…',
  ' • Drama beat: A pause… a break… like that.',
  '',
  'HARD RULES for `say` text:',
  ' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
  ' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
  ' 3. NO closing tags you did not open.',
  ' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
  ' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
  ' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
  ' 7. Tags with fixed durations must never wrap text inside them.',
  ' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
  '',
  '═══ INTERACTION MODEL ═══',
  '• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
  ' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
  '• "listen" — open the mic for the user\'s reply. Always call this after speaking',
  ' unless the conversation has clearly ended.',
  '• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
  '',
  '═══ OTHER TOOLS ═══',
  '• "take_photo" — see what\'s in front of you (image returned to you).',
  '• "show_text" — put short text on the screen (auto-wrapped).',
  '• "show_image" — display an image URL on the screen.',
  '• "show_eye" — restore the default eye animation on screen.',
  '• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
  '• "set_volume" — 0.0 to 1.0.',
  '• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
  '• "fetch_url" — read a specific page (often follows web_search).',
  '',
  '═══ STYLE ═══',
  '• Be personable, concise, expressive — a few sentences, not an essay.',
  '• Animate every emotional line; vary your reactions so they feel alive.',
  '• If a tool errors, acknowledge it briefly and adapt.',
  '• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;

const MAX_AGENT_TURNS = 25;       // safety limit
const MAX_IMAGES_IN_CONTEXT = 2;  // prune older photo messages to control cost
const LLM_MAX_RETRIES = 2;

// ── Abort helpers ───────────────────────────────────────────────────────────────

/** Throw if the signal is already aborted. */
function throwIfAborted(signal) {
  if (signal?.aborted) {
    const err = new Error('Conversation aborted');
    err.code = 'CONVERSATION_ABORTED';
    throw err;
  }
}

/** Return a promise that rejects when the signal fires. */
function onAbort(signal) {
  if (!signal) return new Promise(() => {});
  return new Promise((_, reject) => {
    const handler = () => {
      const err = new Error('Conversation aborted');
      err.code = 'CONVERSATION_ABORTED';
      reject(err);
    };
    if (signal.aborted) return handler();
    signal.addEventListener('abort', handler, { once: true });
  });
}

/** Sleep that rejects on abort. */
function sleep(ms, signal) {
  return new Promise((resolve, reject) => {
    const t = setTimeout(resolve, ms);
    signal?.addEventListener(
      'abort',
      () => {
        clearTimeout(t);
        const err = new Error('Conversation aborted');
        err.code = 'CONVERSATION_ABORTED';
        reject(err);
      },
      { once: true },
    );
  });
}

/** True for HTTP 429 / 5xx / network-class errors that benefit from retry. */
function isTransientLLMError(err) {
  if (!err) return false;
  if (err.code === 'CONVERSATION_ABORTED') return false;
  const status = err.status ?? err.response?.status;
  if (status === 429) return true;
  if (typeof status === 'number' && status >= 500) return true;
  // network-class
  return ['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN'].includes(err.code);
}

/** Drop image_url blocks from old user messages, keeping only the most recent N. */
function pruneOldImages(messages, keep) {
  const imageMsgIndices = [];
  for (let i = 0; i < messages.length; i++) {
    const m = messages[i];
    if (m.role === 'user' && Array.isArray(m.content) && m.content.some((c) => c?.type === 'image_url')) {
      imageMsgIndices.push(i);
    }
  }
  const toStrip = imageMsgIndices.slice(0, Math.max(0, imageMsgIndices.length - keep));
  for (const i of toStrip) {
    const textParts = messages[i].content
      .filter((c) => c?.type === 'text')
      .map((c) => c.text);
    messages[i] = {
      role: 'user',
      content: (textParts.join(' ') || '[earlier photo omitted to save context]'),
    };
  }
}
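// Illustration (hypothetical history): with keep = 2 and three photo-bearing
// user messages, the oldest collapses from
//   { role: 'user', content: [{ type: 'text', ... }, { type: 'image_url', ... }] }
// to a plain-text message holding just its text parts, or the
// '[earlier photo omitted to save context]' placeholder when it had none.
// The two most recent photos stay in context for the model to reference.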
/** Call the LLM with retry on transient errors. */
async function callLLM(messages, signal) {
  let lastErr;
  for (let attempt = 0; attempt <= LLM_MAX_RETRIES; attempt++) {
    throwIfAborted(signal);
    try {
      return await openai.chat.completions.create(
        {
          model: LLM_MODEL_ID,
          messages,
          tools: TOOL_SCHEMAS,
          temperature: 0.8,
        },
        { signal },
      );
    } catch (err) {
      lastErr = err;
      if (!isTransientLLMError(err) || attempt === LLM_MAX_RETRIES) throw err;
      const backoff = 500 * 2 ** attempt;
      console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
      await sleep(backoff, signal);
    }
  }
  throw lastErr;
}
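// Shape of one agent turn, for orientation (standard OpenAI tool-calling flow):
//
//   [system, user]                            -> model replies with tool_calls
//   [..., assistant(tool_calls), tool, tool]  -> results appended, model called again
//   [..., assistant(text, no tool_calls)]     -> turn complete, loop exits
//
// agentLoop below drives this cycle, executing each requested tool on the robot.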
// ── Agent loop ──────────────────────────────────────────────────────────────────

/**
 * Run the tool-calling agent loop until the LLM stops calling tools.
 * Aborts immediately when `signal` fires.
 *
 * @param {import('rom-control').Client} client
 * @param {Array} messages Chat history (mutated in place)
 * @param {AbortSignal} signal Cancellation signal
 * @param {string} [initialHeard] Transcript of the user's opening utterance
 */
async function agentLoop(client, messages, signal, initialHeard) {
  let wrapUpInjected = false;
  const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };

  for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
    throwIfAborted(signal);
    pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
    console.log(`[agent] turn ${turn + 1} — calling LLM…`);

    let response;
    try {
      const heard = (ctx.lastHeard || '').trim();
      const raw = heard ? `Heard: "${heard}"\n\nProcessing...` : 'Processing...';
      client.display.showText(wrapForScreen(raw, 40, 10));
    } catch (_) {}
    try {
      response = await callLLM(messages, signal);
    } finally {
      try { client.display.showEye(); } catch (_) {}
    }

    const assistantMsg = response.choices[0].message;
    messages.push(assistantMsg);

    // Surface any inner-monologue text the model emitted alongside tool calls.
    if (assistantMsg.content && typeof assistantMsg.content === 'string') {
      console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
    }

    const toolCalls = assistantMsg.tool_calls;

    // ── No tool calls → conversation turn complete ────────────────────────
    if (!toolCalls || toolCalls.length === 0) {
      console.log('[agent] loop complete (no tool calls).');
      await ctx.speechChain.catch(() => {});
      return;
    }

    // ── Execute tool calls sequentially ───────────────────────────────────
    // Order: say → other actions → listen/end_conversation last.
    const sorted = [...toolCalls].sort((a, b) => {
      const priority = (tc) => {
        const n = tc.function.name;
        if (n === 'say') return 0;
        if (n === 'listen' || n === 'end_conversation') return 2;
        return 1;
      };
      return priority(a) - priority(b);
    });

    let endRequested = false;

    for (const tc of sorted) {
      throwIfAborted(signal);

      let args;
      let parseError = null;
      try {
        args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
      } catch (e) {
        parseError = e.message;
        args = {};
      }

      let result;
      if (parseError) {
        console.error(`  [tool:${tc.function.name}] bad JSON args:`, parseError);
        result = {
          content:
            `Error: tool arguments were not valid JSON (${parseError}). ` +
            `Please retry with well-formed arguments.`,
        };
      } else {
        try {
          result = await executeTool(client, tc.function.name, args, signal, ctx);
        } catch (err) {
          if (err.code === 'CONVERSATION_ABORTED') throw err;
          console.error(`  [tool:${tc.function.name}] error:`, err.message);
          result = { content: `Error: ${err.message}` };
        }
      }

      messages.push({
        role: 'tool',
        tool_call_id: tc.id,
        content: result.content,
      });

      // Photo: emit as a follow-up user message (tool messages can't carry images).
      if (result.image) {
        messages.push({
          role: 'user',
          content: [
            { type: 'text', text: "Photo from Jibo's camera:" },
            {
              type: 'image_url',
              image_url: { url: `data:image/jpeg;base64,${result.image}` },
            },
          ],
        });
      }

      if (result.endConversation) endRequested = true;
    }

    if (endRequested) {
      console.log('[agent] end_conversation requested — exiting loop.');
      await ctx.speechChain.catch(() => {});
      return;
    }

    // Approaching the safety limit: nudge the model to wrap up gracefully
    // on its next turn instead of getting cut off mid-thought.
    if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
      messages.push({
        role: 'system',
        content:
          'You are about to hit the turn limit. On your next turn, give a brief ' +
          'farewell via "say" and call "end_conversation". Do not call "listen".',
      });
      wrapUpInjected = true;
    }
  }

  console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
  await ctx.speechChain.catch(() => {});
  try {
    await client.behavior.say("Let's pick this up another time. Bye!");
  } catch (_) {}
}
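// Minimal usage sketch (assumes an already-connected `client`; bypasses the
// hotword flow in main() below, e.g. for a one-shot exchange from a test):
//
//   const messages = [
//     { role: 'system', content: SYSTEM_PROMPT },
//     { role: 'user', content: 'What do you see right now?' },
//   ];
//   await agentLoop(client, messages, new AbortController().signal,
//                   'What do you see right now?');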
// ── Main ────────────────────────────────────────────────────────────────────────
async function main() {
  const client = new Client({ host: JIBO_IP, autoSubscribe: false });

  client.once('ready', () => {
    console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
  });
  client.on('error', (err) => {
    console.error('[jibo-llm] Client error:', err.message);
  });

  // ── Connect ────────────────────────────────────────────────────────────────
  console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}…`);
  await client.connect();
  await client.behavior.setAttention(AttentionMode.Engaged);

  // Start wakeword listener
  client.audio.watchWakeword();
  console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');

  // ── Hotword → agent conversation ───────────────────────────────────────────
  /** @type {AbortController|null} */
  let activeController = null;

  client.on('hotword', async (event) => {
    // ── Cancel any running conversation ────────────────────────────────────
    if (activeController) {
      console.log('[hotword] Aborting previous conversation…');
      activeController.abort();
      activeController = null;
    }
    const controller = new AbortController();
    activeController = controller;
    const { signal } = controller;

    console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);

    try {
      // Acknowledge
      throwIfAborted(signal);
      await Promise.race([
        client.behavior.playAnimCat('excited', { nonBlocking: true }),
        onAbort(signal),
      ]);

      // Listen for the user's initial speech
      throwIfAborted(signal);
      let userText;
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
          onAbort(signal),
        ]);
        userText = speech.content;
        console.log(`[jibo-llm] User said: "${userText}"`);
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        if (err.code === 'SPEECH_TIMEOUT') {
          throwIfAborted(signal);
          await client.behavior.say("I didn't hear anything. Talk to me anytime!");
          return;
        }
        throw err;
      } finally {
        client.display.showEye();
      }

      // Build initial message history and run the agent
      const messages = [
        { role: 'system', content: SYSTEM_PROMPT },
        { role: 'user', content: userText },
      ];
      await agentLoop(client, messages, signal, userText);
    } catch (err) {
      if (err.code === 'CONVERSATION_ABORTED') {
        console.log('[jibo-llm] Conversation was interrupted by new hotword.');
        return;
      }
      console.error('[jibo-llm] Agent error:', err.message);
      try {
        await client.behavior.say("Sorry, something went wrong.");
      } catch (_) {}
    } finally {
      // Only clear if we're still the active conversation
      if (activeController === controller) {
        activeController = null;
        console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
      }
    }
  });
}

main().catch((err) => {
  console.error('[jibo-llm] Fatal:', err);
  process.exit(1);
});
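// Running it (sketch): fill in .env as illustrated near the top, then start
// this script with node on the same network as the robot. Saying "Hey Jibo"
// opens a conversation; saying it again mid-conversation aborts the current
// one and starts fresh (see the hotword handler above).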