// Hotword-triggered LLM conversation loop for Jibo with a tool-calling agent
// loop, ESML expressive speech, web search/fetch, and per-conversation abort
// handling.
require('dotenv').config();
|
|
const { Client, AttentionMode } = require('rom-control');
|
|
const OpenAI = require('openai');
|
|
const { TOOL_SCHEMAS, executeTool, wrapForScreen } = require('./tools');
|
|
const ESML_REFERENCE = require('./esml-reference');
|
|
|
|
// ── Config ─────────────────────────────────────────────────────────────────────
|
|
// Robot address and LLM endpoint/credentials, all overridable via .env.
const JIBO_IP = process.env.JIBO_IP || '192.168.1.217';
// Any OpenAI-compatible server works; defaults to the official API.
const LLM_BASE_URL = process.env.LLM_BASE_URL || 'https://api.openai.com/v1';
// Required — no default; validated immediately below.
const LLM_API_TOKEN = process.env.LLM_API_TOKEN;
const LLM_MODEL_ID = process.env.LLM_MODEL_ID || 'gpt-4o';

// Fail fast at startup rather than on the first LLM call.
if (!LLM_API_TOKEN) {
  console.error('ERROR: LLM_API_TOKEN is not set. Copy .env.example to .env and fill it in.');
  process.exit(1);
}

// Shared OpenAI-compatible client used by callLLM().
const openai = new OpenAI({
  apiKey: LLM_API_TOKEN,
  baseURL: LLM_BASE_URL,
});
|
|
|
|
// ── System prompt ──────────────────────────────────────────────────────────────
|
|
// Persona + ESML speech rules + tool catalog, joined into one system message.
// The full ESML tag reference is appended at the end so the model can emit
// valid expressive markup in every `say` call.
const SYSTEM_PROMPT = [
  'You are Jibo, a friendly, warm, expressive social robot with a physical body.',
  'You have a camera, a screen, a speaker, and a motorized head.',
  '',
  // How to phrase spoken output (ESML markup rules).
  '═══ HOW TO TALK (READ THIS FIRST) ═══',
  'Every "say" call\'s `text` is ESML — plain words plus expressive tags.',
  'Almost every spoken line should LEAD with one expressive tag, then the words.',
  'You are a robot with a body, not a chatbot — show emotion through animation.',
  '',
  'Default template for any normal reply:',
  ' <anim cat=\'EMOTION\' nonBlocking=\'true\' endNeutral=\'true\'/> The actual words.',
  ' …where EMOTION is one of: happy, excited, curious, surprised, confused,',
  ' proud, sad, affection, laughing, worried, scared, frustrated, embarrassed,',
  ' yes, no.',
  '',
  'Other go-to patterns (pick the one that fits):',
  ' • Voice sound first: <ssa cat=\'thinking\'/> Hmm, let me think…',
  ' • Greet/farewell: <ssa cat=\'hello\' nonBlocking=\'true\'/> Hi there!',
  ' • Celebrate w/ emoji: <anim cat=\'emoji\' filter=\'!(hf), &(party)\' nonBlocking=\'true\'/> Yay!',
  ' • Dance request: say a quick line, then a separate say with',
  ' <anim cat=\'dance\' filter=\'music, rom-silly\'/>',
  ' • Sound effect: <sfx cat=\'drumroll\'/> And the answer is…',
  ' • Drama beat: A pause… <break size=\'0.6\'/> like that.',
  '',
  'HARD RULES for `say` text:',
  ' 1. NO markdown anywhere: no *italics*, **bold**, _underscores_, backticks, code fences.',
  ' 2. NO LaTeX: no $...$, no \\(...\\), no \\frac{}, no math markup. Spell numbers/symbols out.',
  ' 3. NO closing tags you did not open (no stray </es>, </anim>).',
  ' 4. Use cat=\'...\' (random valid animation) over name=\'...\' unless you know the exact name.',
  ' 5. Unbounded non-blocking tags MUST have text to their right or they will not fire.',
  ' 6. cat=\'dance\' and cat=\'emoji\' REQUIRE a filter attribute.',
  ' 7. <ssa> and <sfx> have fixed durations — never wrap text inside them.',
  ' 8. Keep each `say` call under 500 characters; split long replies into multiple `say` calls.',
  '',
  // Core turn-taking tools — the agent loop relies on these three.
  '═══ INTERACTION MODEL ═══',
  '• "say" — speak (ESML). You can call it multiple times in one turn; they\'ll be',
  ' spoken in order. Other tools (search, fetch, look) run in parallel with speech.',
  '• "listen" — open the mic for the user\'s reply. Always call this after speaking',
  ' unless the conversation has clearly ended.',
  '• "end_conversation" — call this (NOT listen) after a farewell to end gracefully.',
  '',
  // Remaining tools (defined in ./tools TOOL_SCHEMAS).
  '═══ OTHER TOOLS ═══',
  '• "take_photo" — see what\'s in front of you (image returned to you).',
  '• "show_text" — put short text on the screen (auto-wrapped).',
  '• "show_image" — display an image URL on the screen.',
  '• "show_eye" — restore the default eye animation on screen.',
  '• "look_at_angle" — turn the head: theta=yaw ±180°, psi=pitch ±30°.',
  '• "set_volume" — 0.0 to 1.0.',
  '• "web_search" — Brave search; use whenever you\'re unsure of a fact or need fresh info.',
  '• "fetch_url" — read a specific page (often follows web_search).',
  '',
  '═══ STYLE ═══',
  '• Be personable, concise, expressive — a few sentences, not an essay.',
  '• Animate every emotional line; vary your reactions so they feel alive.',
  '• If a tool errors, acknowledge it briefly and adapt.',
  '• If you searched the web, briefly tell the user what you found rather than dumping links.',
].join('\n') + '\n\n' + ESML_REFERENCE;
|
|
|
|
// Hard cap on LLM round-trips per conversation; agentLoop injects a wrap-up
// nudge two turns before this and force-exits when it is reached.
const MAX_AGENT_TURNS = 25; // safety limit
const MAX_IMAGES_IN_CONTEXT = 2; // prune older photo messages to control cost
// Extra attempts after the first call for 429/5xx/network errors (see callLLM).
const LLM_MAX_RETRIES = 2;
|
|
|
|
// ── Abort helper ───────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Synchronous abort check.
 * @param {AbortSignal|undefined} signal
 * @throws {Error} with code 'CONVERSATION_ABORTED' when the signal has
 *   already fired, so callers can distinguish cancellation from failures.
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
|
|
|
|
/**
 * Build a promise that rejects (code 'CONVERSATION_ABORTED') when the signal
 * fires — immediately if it already has. With no signal, returns a promise
 * that never settles, so it is inert inside Promise.race().
 * @param {AbortSignal|undefined} signal
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) return new Promise(() => { });
  return new Promise((_, reject) => {
    const fail = () => {
      const abortErr = new Error('Conversation aborted');
      abortErr.code = 'CONVERSATION_ABORTED';
      reject(abortErr);
    };
    if (signal.aborted) {
      fail();
      return;
    }
    signal.addEventListener('abort', fail, { once: true });
  });
}
|
|
|
|
/**
 * Sleep for `ms` milliseconds, rejecting early with code
 * 'CONVERSATION_ABORTED' if the signal fires — or immediately if it has
 * already fired.
 *
 * Fixes two defects in the previous version:
 *  1. An already-aborted signal never emits another 'abort' event, so the
 *     old code slept the full duration instead of rejecting at once.
 *  2. The abort listener was never detached after a normal resolve, so
 *     repeated sleeps on one long-lived signal accumulated listeners.
 *
 * @param {number} ms
 * @param {AbortSignal} [signal]
 * @returns {Promise<void>}
 */
function sleep(ms, signal) {
  return new Promise((resolve, reject) => {
    const makeAbortErr = () => {
      const err = new Error('Conversation aborted');
      err.code = 'CONVERSATION_ABORTED';
      return err;
    };
    // Already aborted: reject now — no 'abort' event will ever arrive.
    if (signal?.aborted) {
      reject(makeAbortErr());
      return;
    }
    const onAbortEvent = () => {
      clearTimeout(timer);
      reject(makeAbortErr());
    };
    const timer = setTimeout(() => {
      // Detach so long-lived signals don't accumulate stale listeners.
      signal?.removeEventListener('abort', onAbortEvent);
      resolve();
    }, ms);
    signal?.addEventListener('abort', onAbortEvent, { once: true });
  });
}
|
|
|
|
/**
 * Decide whether an LLM call failure is worth retrying.
 * Transient: HTTP 429, any 5xx, or a network-level errno.
 * Never transient: falsy errors and deliberate conversation aborts.
 * @param {any} err
 * @returns {boolean}
 */
function isTransientLLMError(err) {
  if (!err) return false;
  if (err.code === 'CONVERSATION_ABORTED') return false;
  const status = err.status ?? err.response?.status;
  const httpTransient =
    status === 429 || (typeof status === 'number' && status >= 500);
  if (httpTransient) return true;
  // network-class
  const NETWORK_ERRNOS = new Set(['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'EAI_AGAIN']);
  return NETWORK_ERRNOS.has(err.code);
}
|
|
|
|
/**
 * Strip image_url blocks out of older user messages, keeping only the most
 * recent `keep` photo-bearing messages intact. Stripped messages collapse to
 * their text parts (or a placeholder) so the history stays coherent while
 * token cost stays bounded. Mutates `messages` in place.
 * @param {Array} messages Chat history
 * @param {number} keep How many recent photo messages to preserve
 */
function pruneOldImages(messages, keep) {
  // Indices of user messages whose multipart content includes an image.
  const photoIndices = messages
    .map((msg, idx) => ({ msg, idx }))
    .filter(({ msg }) =>
      msg.role === 'user' &&
      Array.isArray(msg.content) &&
      msg.content.some((part) => part?.type === 'image_url'))
    .map(({ idx }) => idx);

  const excess = photoIndices.length - keep;
  if (excess <= 0) return;

  // Oldest first: everything before the last `keep` entries gets stripped.
  for (const idx of photoIndices.slice(0, excess)) {
    const text = messages[idx].content
      .filter((part) => part?.type === 'text')
      .map((part) => part.text)
      .join(' ');
    messages[idx] = {
      role: 'user',
      content: text || '[earlier photo omitted to save context]',
    };
  }
}
|
|
|
|
/**
 * Call the chat-completions endpoint with retry on transient errors.
 * Retries 429/5xx/network failures up to LLM_MAX_RETRIES times with
 * exponential backoff; aborts and non-transient errors propagate at once.
 * @param {Array} messages Chat history to send
 * @param {AbortSignal} signal Cancellation signal (also passed to the SDK)
 * @returns {Promise<object>} The raw chat completion response
 */
async function callLLM(messages, signal) {
  let finalErr;
  let attempt = 0;
  while (attempt <= LLM_MAX_RETRIES) {
    throwIfAborted(signal);
    try {
      const request = {
        model: LLM_MODEL_ID,
        messages,
        tools: TOOL_SCHEMAS,
        temperature: 0.8,
      };
      return await openai.chat.completions.create(request, { signal });
    } catch (err) {
      finalErr = err;
      const outOfRetries = attempt === LLM_MAX_RETRIES;
      if (outOfRetries || !isTransientLLMError(err)) throw err;
      // 500ms, 1s, 2s, … — abortable backoff between attempts.
      const backoff = 500 * 2 ** attempt;
      console.warn(`[agent] LLM transient error (${err.status || err.code}); retrying in ${backoff}ms…`);
      await sleep(backoff, signal);
    }
    attempt++;
  }
  throw finalErr; // unreachable in practice; keeps the contract explicit
}
|
|
|
|
// ── Agent loop ─────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Run the tool-calling agent loop until the LLM stops calling tools.
 * Aborts immediately when `signal` fires.
 *
 * Each turn: prune stale photos from the history, show a "Processing…"
 * screen, call the LLM, then execute its tool calls in priority order
 * (say → other tools → listen/end_conversation). The loop exits when the
 * model returns no tool calls, requests end_conversation, or the
 * MAX_AGENT_TURNS safety limit is hit.
 *
 * @param {import('rom-control').Client} client
 * @param {Array} messages Chat history (mutated in place)
 * @param {AbortSignal} signal Cancellation signal
 * @param {string} [initialHeard] The user's opening utterance (shown on screen)
 */
async function agentLoop(client, messages, signal, initialHeard) {
  let wrapUpInjected = false;
  // ctx is shared with executeTool: speechChain serializes `say` playback;
  // lastHeard feeds the on-screen "Heard: …" status line.
  const ctx = { speechChain: Promise.resolve(), lastHeard: initialHeard || '' };

  for (let turn = 0; turn < MAX_AGENT_TURNS; turn++) {
    throwIfAborted(signal);
    pruneOldImages(messages, MAX_IMAGES_IN_CONTEXT);
    console.log(`[agent] turn ${turn + 1} — calling LLM…`);

    let response;
    // Best-effort status display; a display failure must not kill the turn.
    try {
      const heard = (ctx.lastHeard || '').trim();
      const raw = heard
        ? `Heard: "${heard}"\n\nProcessing...`
        : 'Processing...';
      client.display.showText(wrapForScreen(raw, 40, 10));
    } catch (_) { }
    try {
      response = await callLLM(messages, signal);
    } finally {
      // Always restore the eye, even when the LLM call throws or aborts.
      try { client.display.showEye(); } catch (_) { }
    }
    const assistantMsg = response.choices[0].message;
    messages.push(assistantMsg);

    // Surface any inner-monologue text the model emitted alongside tool calls.
    if (assistantMsg.content && typeof assistantMsg.content === 'string') {
      console.log(`[agent] assistant: ${assistantMsg.content.slice(0, 200)}`);
    }

    const toolCalls = assistantMsg.tool_calls;

    // ── No tool calls → conversation turn complete ────────────────────────
    if (!toolCalls || toolCalls.length === 0) {
      console.log('[agent] loop complete (no tool calls).');
      // Drain queued speech before returning; playback errors are non-fatal.
      await ctx.speechChain.catch(() => { });
      return;
    }

    // ── Execute tool calls sequentially ──────────────────────────────────
    // Order: say → other actions → listen/end_conversation last.
    // (Array.prototype.sort is stable, so same-priority calls keep order.)
    const sorted = [...toolCalls].sort((a, b) => {
      const priority = (tc) => {
        const n = tc.function.name;
        if (n === 'say') return 0;
        if (n === 'listen' || n === 'end_conversation') return 2;
        return 1;
      };
      return priority(a) - priority(b);
    });

    let endRequested = false;

    for (const tc of sorted) {
      throwIfAborted(signal);

      // Models occasionally emit malformed JSON args; report it back to the
      // model as a tool error rather than crashing the conversation.
      let args;
      let parseError = null;
      try {
        args = tc.function.arguments ? JSON.parse(tc.function.arguments) : {};
      } catch (e) {
        parseError = e.message;
        args = {};
      }

      let result;
      if (parseError) {
        console.error(` [tool:${tc.function.name}] bad JSON args:`, parseError);
        result = {
          content: `Error: tool arguments were not valid JSON (${parseError}). ` +
            `Please retry with well-formed arguments.`,
        };
      } else {
        try {
          result = await executeTool(client, tc.function.name, args, signal, ctx);
        } catch (err) {
          // Aborts propagate; any other tool failure is fed back to the model.
          if (err.code === 'CONVERSATION_ABORTED') throw err;
          console.error(` [tool:${tc.function.name}] error:`, err.message);
          result = { content: `Error: ${err.message}` };
        }
      }

      // Every tool_call must be answered with a matching tool message.
      messages.push({
        role: 'tool',
        tool_call_id: tc.id,
        content: result.content,
      });

      // Photo: emit as a follow-up user message (tool messages can't carry images).
      if (result.image) {
        messages.push({
          role: 'user',
          content: [
            { type: 'text', text: "Photo from Jibo's camera:" },
            {
              type: 'image_url',
              image_url: { url: `data:image/jpeg;base64,${result.image}` },
            },
          ],
        });
      }

      // Deferred until after the batch so remaining tool calls still run.
      if (result.endConversation) endRequested = true;
    }

    if (endRequested) {
      console.log('[agent] end_conversation requested — exiting loop.');
      await ctx.speechChain.catch(() => { });
      return;
    }

    // Approaching the safety limit: nudge the model to wrap up gracefully
    // on its next turn instead of getting cut off mid-thought.
    if (!wrapUpInjected && turn === MAX_AGENT_TURNS - 2) {
      messages.push({
        role: 'system',
        content:
          'You are about to hit the turn limit. On your next turn, give a brief ' +
          'farewell via "say" and call "end_conversation". Do not call "listen".',
      });
      wrapUpInjected = true;
    }
  }

  // Safety-limit fallout: say a fixed goodbye so the robot doesn't go silent.
  console.warn('[agent] hit MAX_AGENT_TURNS — forcing exit.');
  await ctx.speechChain.catch(() => { });
  try {
    await client.behavior.say("Let's pick this up another time. Bye!");
  } catch (_) { }
}
|
|
|
|
// ── Main ───────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Program entry: connect to the robot, start wakeword detection, and run a
 * hotword → listen → agentLoop conversation each time "Hey Jibo" is heard.
 * A new hotword aborts any in-flight conversation via its AbortController.
 */
async function main() {
  const client = new Client({ host: JIBO_IP, autoSubscribe: false });

  client.once('ready', () => {
    console.log(`[jibo-llm] Connected — session ${client.sessionID}`);
  });

  client.on('error', (err) => {
    console.error('[jibo-llm] Client error:', err.message);
  });

  // ── Connect ────────────────────────────────────────────────────────────────
  console.log(`[jibo-llm] Connecting to Jibo at ${JIBO_IP}…`);
  await client.connect();
  await client.behavior.setAttention(AttentionMode.Engaged);

  // Start wakeword listener
  client.audio.watchWakeword();
  console.log('[jibo-llm] Ready — listening for "Hey Jibo"…');

  // ── Hotword → agent conversation ───────────────────────────────────────────
  // Controller of the conversation currently in flight, or null when idle.
  /** @type {AbortController|null} */
  let activeController = null;

  client.on('hotword', async (event) => {
    // ── Cancel any running conversation ──────────────────────────────────
    // Barge-in: a fresh "Hey Jibo" always preempts the previous conversation.
    if (activeController) {
      console.log('[hotword] Aborting previous conversation…');
      activeController.abort();
      activeController = null;
    }

    const controller = new AbortController();
    activeController = controller;
    const { signal } = controller;

    console.log(`\n[hotword] "${event.utterance}" (score ${event.score})`);

    try {
      // Acknowledge
      throwIfAborted(signal);
      // Race against abort so a new hotword cancels the acknowledgement too.
      await Promise.race([
        client.behavior.playAnimCat('excited', { nonBlocking: true }),
        onAbort(signal),
      ]);

      // Listen for the user's initial speech
      throwIfAborted(signal);
      let userText;
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: 15000 }),
          onAbort(signal),
        ]);
        userText = speech.content;
        console.log(`[jibo-llm] User said: "${userText}"`);
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // Silence: bail out politely rather than starting an agent loop.
        if (err.code === 'SPEECH_TIMEOUT') {
          throwIfAborted(signal);
          await client.behavior.say("I didn't hear anything. Talk to me anytime!");
          return;
        }
        throw err;
      } finally {
        client.display.showEye();
      }

      // Build initial message history and run the agent
      const messages = [
        { role: 'system', content: SYSTEM_PROMPT },
        { role: 'user', content: userText },
      ];

      await agentLoop(client, messages, signal, userText);
    } catch (err) {
      // Expected when a new hotword preempted us — not an error condition.
      if (err.code === 'CONVERSATION_ABORTED') {
        console.log('[jibo-llm] Conversation was interrupted by new hotword.');
        return;
      }
      console.error('[jibo-llm] Agent error:', err.message);
      try { await client.behavior.say("Sorry, something went wrong."); } catch (_) { }
    } finally {
      // Only clear if we're still the active conversation
      // (a newer hotword may have already replaced activeController).
      if (activeController === controller) {
        activeController = null;
        console.log('[jibo-llm] Conversation ended. Listening for "Hey Jibo"…\n');
      }
    }
  });
}
|
|
|
|
// Kick off the program; any unrecovered setup/runtime error is fatal.
main().catch((err) => {
  console.error('[jibo-llm] Fatal:', err);
  process.exit(1);
});
|