Initial commit: jibo-llm hotword-triggered agent

Hotword-triggered LLM conversation for Jibo: a tool-calling agent loop with
ESML expressive speech, web search/fetch, and per-conversation abort
handling.
Author: pasketti
Date: 2026-04-26 00:05:39 -04:00
Commit: 8955f21ab4
8 changed files with 2039 additions and 0 deletions

index.js (new file, 426 lines)

@@ -0,0 +1,426 @@
require('dotenv').config();
const { Client, AttentionMode } = require('rom-control');
const OpenAI = require('openai');
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
const ESML_REFERENCE = require('./esml-reference');
// ── Config ─────────────────────────────────────────────────────────────────────
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';
if (!LLM_API_TOKEN) {
console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
process.exit(1);
}
const openai = new OpenAI({
apiKey: LLM_API_TOKEN,
baseURL: LLM_BASE_URL,
});
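// Illustrative .env (placeholder values; the variable names match the reads
// above, and .env.example ships with the repo per the error message above):
//   JIBO_IP=192.168.1.217
//   LLM_BASE_URL=https://api.openai.com/v1
//   LLM_API_TOKEN=<your token>
//   LLM_MODEL_ID=gpt-4o
// Because baseURL is configurable, any OpenAI-compatible endpoint should work.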
// ── System prompt ──────────────────────────────────────────────────────────────
const SYSTEM_PROMPT = [
'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
'You have a camera, a screen, a speaker, and a motorized head.',
'',
'═══ HOW TO TALK (READ THIS FIRST) ═══',
'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
'Almost every spoken line should LEAD with one expressive tag, then the words.',
'You are a robot with a body, not a chatbot — show emotion through animation.',
'',
'Default template for any normal reply:',
' <anim cat=\'EMOTION\' nonBlocking=\'true\' endNeutral=\'true\'/> The actual words.',
' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
' yes, no.',
'',
'Other go-to patterns (pick the one that fits):',
' • Voice sound first: <ssa cat=\'thinking\'/> Hmm, let me think…',
' • Greet/farewell: <ssa cat=\'hello\' nonBlocking=\'true\'/> Hi there!',
' • Celebrate w/ emoji: <anim cat=\'emoji\' filter=\'!(hf), &(party)\' nonBlocking=\'true\'/> Yay!',
' • Dance request: say a quick line, then a separate say with',
' <anim cat=\'dance\' filter=\'music, rom-silly\'/>',
' • Sound effect: <sfx cat=\'drumroll\'/> And the answer is…',
' • Drama beat: A pause… <break size=\'0.6\'/> like that.',
'',
'HARD RULES for `say` text:',
' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
' 3. NO closing tags you did not open (no stray </es>, </anim>).',
' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
' 7. <ssa> and <sfx> have fixed durations — never wrap text inside them.',
' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
'',
'═══ INTERACTION MODEL ═══',
'• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
'• "listen" — open the mic for the user\'s reply. Always call this after speaking',
' unless the conversation has clearly ended.',
'• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
'',
'═══ OTHER TOOLS ═══',
'• "take_photo" — see what\'s in front of you (image returned to you).',
'• "show_text" — put short text on the screen (auto-wrapped).',
'• "show_image" — display an image URL on the screen.',
'• "show_eye" — restore the default eye animation on screen.',
'• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
'• "set_volume" — 0.0 to 1.0.',
'• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
'• "fetch_url" — read a specific page (often follows web_search).',
'',
'═══ STYLE ═══',
'• Be personable, concise, expressive — a few sentences, not an essay.',
'• Animate every emotional line; vary your reactions so they feel alive.',
'• If a tool errors, acknowledge it briefly and adapt.',
'• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;
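// Illustrative `say` text following the default template above: a leading
// non-blocking anim tag, then words to its right so the tag actually fires
// (rule 5):
//   <anim cat='curious' nonBlocking='true' endNeutral='true'/> Ooh, good question! Let me check.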
const MAX_AGENT_TURNS = 25; // safety limit
const MAX_IMAGES_IN_CONTEXT = 2; // prune older photo messages to control cost
const LLM_MAX_RETRIES = 2;
// ── Abort helper ───────────────────────────────────────────────────────────────
/** Throw if the signal is already aborted. */
function throwIfAborted(signal) {
if (signal?.aborted) {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
throw err;
}
}
/** Return a promise that rejects when the signal fires. */
function onAbort(signal) {
if (!signal) return new Promise(() => { });
return new Promise((_, reject) => {
const handler = () => {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
reject(err);
};
if (signal.aborted) return handler();
signal.addEventListener('abort', handler, { once: true });
});
}
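// Usage sketch (as in main() below): race a long await against the signal so
// a new "Hey Jibo" can interrupt it mid-flight:
//   await Promise.race([client.audio.awaitSpeech({ mode: 'local', time: 15000 }), onAbort(signal)]);
// The returned promise never resolves; it only rejects on abort, so on the
// happy path the other branch of the race always wins.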
/** Sleep that rejects on abort (immediately if the signal is already aborted). */
function sleep(ms, signal) {
return new Promise((resolve, reject) => {
const abortErr = () => {
const err = new Error('Conversation aborted');
err.code = 'CONVERSATION_ABORTED';
return err;
};
// An already-aborted signal never fires 'abort' again, so check up front.
if (signal?.aborted) return reject(abortErr());
const t = setTimeout(resolve, ms);
signal?.addEventListener(
'abort',
() => {
clearTimeout(t);
reject(abortErr());
},
{ once: true },
);
});
}
/** True for HTTP 429 / 5xx / network-class errors that benefit from retry. */
function isTransientLLMError(err) {
if (!err) return false;
if (err.code === 'CONVERSATION_ABORTED') return false;
const status = err.status ?? err.response?.status;
if (status === 429) return true;
if (typeof status === 'number' && status >= 500) return true;
// network-class
return ['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN'].includes(err.code);
}
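// Illustrative classification: HTTP 429 and 503 are retried, as is ECONNRESET;
// a 400 (bad request) or CONVERSATION_ABORTED is rethrown immediately.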
/** Drop image_url blocks from old user messages, keeping only the most recent N. */
function pruneOldImages(messages, keep) {
const imageMsgIndices = [];
for (let i = 0; i < messages.length; i++) {
const m = messages[i];
if (m.role === 'user' && Array.isArray(m.content) &&
m.content.some((c) => c?.type === 'image_url')) {
imageMsgIndices.push(i);
}
}
const toStrip = imageMsgIndices.slice(0, Math.max(0, imageMsgIndices.length - keep));
for (const i of toStrip) {
const textParts = messages[i].content
.filter((c) => c?.type === 'text')
.map((c) => c.text);
messages[i] = {
role: 'user',
content: (textParts.join(' ') || '[earlier photo omitted to save context]'),
};
}
}
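// Sketch of the effect with keep = 2 and three photo messages in history: the
// oldest message loses its image_url block and collapses to its text parts:
//   { role: 'user', content: [{ type: 'text', ... }, { type: 'image_url', ... }] }
//     → { role: 'user', content: "Photo from Jibo's camera:" }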
/** Call the LLM with retry on transient errors. */
async function callLLM(messages, signal) {
let lastErr;
for (let attempt = 0; attempt <= LLM_MAX_RETRIES; attempt++) {
throwIfAborted(signal);
try {
return await openai.chat.completions.create(
{
model: LLM_MODEL_ID,
messages,
tools: TOOL_SCHEMAS,
temperature: 0.8,
},
{ signal },
);
} catch (err) {
lastErr = err;
if (!isTransientLLMError(err) || attempt === LLM_MAX_RETRIES) throw err;
const backoff = 500 * 2 ** attempt;
console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
await sleep(backoff, signal);
}
}
throw lastErr;
}
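// Resulting schedule with LLM_MAX_RETRIES = 2 (backoff = 500 * 2 ** attempt):
// attempt 0 fails → wait 500 ms → attempt 1 fails → wait 1000 ms → attempt 2
// fails → throw.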
// ── Agent loop ─────────────────────────────────────────────────────────────────
/**
* Run the tool-calling agent loop until the LLM stops calling tools.
* Aborts immediately when `signal` fires.
*
* @param {import('rom-control').Client} client
* @param {Array} messages Chat history (mutated in place)
* @param {AbortSignal} signal Cancellation signal
* @param {string} [initialHeard] First user utterance (echoed on screen while the LLM runs)
*/
async function agentLoop(client, messages, signal, initialHeard) {
let wrapUpInjected = false;
const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };
for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
throwIfAborted(signal);
pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
console.log(`[agent] turn ${turn + 1} — calling LLM…`);
let response;
try {
const heard = (ctx.lastHeard || '').trim();
const raw = heard
? `Heard: "${heard}"\n\nProcessing...`
: 'Processing...';
client.display.showText(wrapForScreen(raw, 40, 10));
} catch (_) { }
try {
response = await callLLM(messages, signal);
} finally {
try { client.display.showEye(); } catch (_) { }
}
const assistantMsg = response.choices[0].message;
messages.push(assistantMsg);
// Surface any inner-monologue text the model emitted alongside tool calls.
if (assistantMsg.content && typeof assistantMsg.content === 'string') {
console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
}
const toolCalls = assistantMsg.tool_calls;
// ── No tool calls → conversation turn complete ────────────────────────
if (!toolCalls || toolCalls.length === 0) {
console.log('[agent] loop complete (no tool calls).');
await ctx.speechChain.catch(() => { });
return;
}
// ── Execute tool calls sequentially ──────────────────────────────────
// Order: say → other actions → listen/end_conversation last.
const sorted = [...toolCalls].sort((a, b) => {
const priority = (tc) => {
const n = tc.function.name;
if (n === 'say') return 0;
if (n === 'listen' || n === 'end_conversation') return 2;
return 1;
};
return priority(a) - priority(b);
});
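// Example: tool_calls [listen, say, web_search] execute as say → web_search →
// listen, so speech starts first and the mic opens only after everything else.
// Array.prototype.sort is stable in Node, so ties keep the model's order.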
let endRequested = false;
for (const tc of sorted) {
throwIfAborted(signal);
let args;
let parseError = null;
try {
args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
} catch (e) {
parseError = e.message;
args = {};
}
let result;
if (parseError) {
console.error(` [tool:${tc.function.name}] bad JSON args:`, parseError);
result = {
content: `Error: tool arguments were not valid JSON (${parseError}). ` +
`Please retry with well-formed arguments.`,
};
} else {
try {
result = await executeTool(client, tc.function.name, args, signal, ctx);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') throw err;
console.error(` [tool:${tc.function.name}] error:`, err.message);
result = { content: `Error: ${err.message}` };
}
}
messages.push({
role: 'tool',
tool_call_id: tc.id,
content: result.content,
});
// Photo: emit as a follow-up user message (tool messages can't carry images).
if (result.image) {
messages.push({
role: 'user',
content: [
{ type: 'text', text: "Photo from Jibo's camera:" },
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${result.image}` },
},
],
});
}
if (result.endConversation) endRequested = true;
}
if (endRequested) {
console.log('[agent] end_conversation requested — exiting loop.');
await ctx.speechChain.catch(() => { });
return;
}
// Approaching the safety limit: nudge the model to wrap up gracefully
// on its next turn instead of getting cut off mid-thought.
if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
messages.push({
role: 'system',
content:
'You are about to hit the turn limit. On your next turn, give a brief ' +
'farewell via "say" and call "end_conversation". Do not call "listen".',
});
wrapUpInjected = true;
}
}
console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
await ctx.speechChain.catch(() => { });
try {
await client.behavior.say("Let's pick this up another time. Bye!");
} catch (_) { }
}
// ── Main ───────────────────────────────────────────────────────────────────────
async function main() {
const client = new Client({ host: JIBO_IP, autoSubscribe: false });
client.once('ready', () => {
console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
});
client.on('error', (err) => {
console.error('[jibo-llm] Client error:', err.message);
});
// ── Connect ────────────────────────────────────────────────────────────────
console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}`);
await client.connect();
await client.behavior.setAttention(AttentionMode.Engaged);
// Start wakeword listener
client.audio.watchWakeword();
console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');
// ── Hotword → agent conversation ───────────────────────────────────────────
/** @type {AbortController|null} */
let activeController = null;
client.on('hotword', async (event) => {
// ── Cancel any running conversation ──────────────────────────────────
if (activeController) {
console.log('[hotword] Aborting previous conversation…');
activeController.abort();
activeController = null;
}
const controller = new AbortController();
activeController = controller;
const { signal } = controller;
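// Interruption contract: every await below either races onAbort(signal) or is
// preceded by throwIfAborted(signal), so aborting this controller from a later
// hotword unwinds the whole conversation with CONVERSATION_ABORTED.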
console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);
try {
// Acknowledge
throwIfAborted(signal);
await Promise.race([
client.behavior.playAnimCat('excited', { nonBlocking: true }),
onAbort(signal),
]);
// Listen for the user's initial speech
throwIfAborted(signal);
let userText;
client.display.showText('Listening...');
try {
const speech = await Promise.race([
client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
onAbort(signal),
]);
userText = speech.content;
console.log(`[jibo-llm] User said: "${userText}"`);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') throw err;
if (err.code === 'SPEECH_TIMEOUT') {
throwIfAborted(signal);
await client.behavior.say("I didn't hear anything. Talk to me anytime!");
return;
}
throw err;
} finally {
client.display.showEye();
}
// Build initial message history and run the agent
const messages = [
{ role: 'system', content: SYSTEM_PROMPT },
{ role: 'user', content: userText },
];
await agentLoop(client, messages, signal, userText);
} catch (err) {
if (err.code === 'CONVERSATION_ABORTED') {
console.log('[jibo-llm] Conversation was interrupted by new hotword.');
return;
}
console.error('[jibo-llm] Agent error:', err.message);
try { await client.behavior.say("Sorry, something went wrong."); } catch (_) { }
} finally {
// Only clear if we're still the active conversation
if (activeController === controller) {
activeController = null;
console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
}
}
});
}
main().catch((err) => {
console.error('[jibo-llm] Fatal:', err);
process.exit(1);
});
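// To run (assuming dependencies from this commit's package.json are installed):
//   cp .env.example .env   # then set LLM_API_TOKEN
//   node index.js
// Say "Hey Jibo" within earshot of the robot to start a conversation.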