/**
 * Tool definitions and executor for the Jibo LLM agent.
 *
 * Each tool maps to a rom-control capability the LLM can invoke.
 */

// ── OpenAI function-tool schemas ───────────────────────────────────────────────

const TOOL_SCHEMAS = [
  {
    type: 'function',
    function: {
      name: 'say',
      description:
        "Speak text aloud through Jibo's speaker. Plain text plus valid ESML tags only " +
        '(e.g. <anim/>, <break/>). ' +
        'NEVER include markdown (no *italics*, **bold**, backticks), LaTeX ($...$), ' +
        'unmatched/closing tags like </p>, or other symbols Jibo cannot pronounce. ' +
        'Malformed input can hang the TTS engine. Keep each call under 200 chars.',
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text (or ESML) to speak.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'listen',
      description:
        "Listen for the user's speech and return a transcript. " +
        'Call this after speaking if you want to continue the conversation.',
      parameters: {
        type: 'object',
        properties: {
          timeout: {
            type: 'number',
            description: 'Max seconds to wait. Default 15.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'take_photo',
      description:
        "Take a photo with Jibo's camera. The image is returned so you can see what's in front of you.",
      parameters: {
        type: 'object',
        properties: {
          resolution: {
            type: 'string',
            enum: ['medium', 'low'],
            description: 'Default: medium.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_text',
      description: "Display text on Jibo's screen.",
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text to show.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_image',
      description: "Display an image on Jibo's screen from a URL.",
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Image URL.' },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_eye',
      description: "Reset Jibo's screen to the default eye animation.",
      parameters: { type: 'object', properties: {} },
    },
  },
  {
    type: 'function',
    function: {
      name: 'look_at_angle',
      description:
        "Turn Jibo's head. theta = yaw (±180°, positive right), psi = pitch (±30°, positive up).",
      parameters: {
        type: 'object',
        properties: {
          theta: { type: 'number', description: 'Yaw degrees.' },
          psi: { type: 'number', description: 'Pitch degrees.' },
        },
        required: ['theta', 'psi'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'set_volume',
      description: "Set Jibo's speaker volume (0.0 – 1.0).",
      parameters: {
        type: 'object',
        properties: {
          level: { type: 'number', description: 'Volume 0.0 to 1.0.' },
        },
        required: ['level'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'web_search',
      description:
        'Search the web via Brave Search. Use for current events, facts you are unsure of, ' +
        'or anything that may have changed since training. Returns titles, URLs, and snippets.',
      parameters: {
        type: 'object',
        properties: {
          query: { type: 'string', description: 'The search query.' },
          count: {
            type: 'number',
            description: 'How many results to return (1–10). Default 5.',
          },
          freshness: {
            type: 'string',
            enum: ['pd', 'pw', 'pm', 'py'],
            description:
              'Optional recency filter: pd=past day, pw=past week, pm=past month, py=past year.',
          },
        },
        required: ['query'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'fetch_url',
      description:
        'Fetch the contents of a web page by URL. Prefers markdown via content ' +
        'negotiation (Cloudflare Markdown for Agents) and falls back to HTML→text. ' +
        'Use after web_search to read a result, or to traverse linked pages.',
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Absolute http(s) URL to fetch.' },
          max_chars: {
            type: 'number',
            description: 'Truncate the body to this many characters. Default 4000.',
          },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'end_conversation',
      description:
        'Call this when the conversation has reached a natural end and you do NOT want to ' +
        'listen for another reply. Pair it with a final "say" in the same turn for a farewell.',
      parameters: { type: 'object', properties: {} },
    },
  },
];
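// Illustrative sketch (not part of this module): how TOOL_SCHEMAS and
// executeTool are typically wired into an OpenAI-style tool-call loop. The
// `openai` client, model name, and the `messages`/`signal`/`ctx` bookkeeping
// below are assumptions for the example, not definitions from this file.
//
//   const completion = await openai.chat.completions.create({
//     model: 'gpt-4o-mini',
//     messages,
//     tools: TOOL_SCHEMAS,
//   });
//   const msg = completion.choices[0].message;
//   for (const call of msg.tool_calls ?? []) {
//     const result = await executeTool(
//       client, call.function.name, JSON.parse(call.function.arguments), signal, ctx);
//     messages.push({ role: 'tool', tool_call_id: call.id, content: result.content });
//   }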
// ── Resolution map ─────────────────────────────────────────────────────────────

const RES_MAP = { high: 'highRes', medium: 'medRes', low: 'lowRes' };

// ── Screen text helpers ────────────────────────────────────────────────────────

/**
 * Word-wrap text for Jibo's small screen. Breaks oversized words, respects
 * existing newlines, and truncates with an ellipsis past `maxLines`.
 */
function wrapForScreen(text, width = 40, maxLines = 10) {
  const out = [];
  for (const para of String(text).split('\n')) {
    if (para === '') {
      out.push('');
      continue;
    }
    let line = '';
    for (const word of para.split(/\s+/).filter(Boolean)) {
      if (word.length > width) {
        if (line) {
          out.push(line);
          line = '';
        }
        for (let i = 0; i < word.length; i += width) {
          const chunk = word.slice(i, i + width);
          if (chunk.length === width) out.push(chunk);
          else line = chunk;
        }
        continue;
      }
      const candidate = line ? `${line} ${word}` : word;
      if (candidate.length > width) {
        out.push(line);
        line = word;
      } else {
        line = candidate;
      }
    }
    if (line) out.push(line);
  }
  if (out.length > maxLines) {
    return out.slice(0, maxLines - 1).concat('…').join('\n');
  }
  return out.join('\n');
}

/**
 * Strip markup the Jibo TTS engine chokes on (markdown, LaTeX, unmatched
 * closing tags). Preserves valid ESML self-closing tags like <anim/> and
 * <break/>. Defense-in-depth against models that ignore the instructions.
 */
function sanitizeForTTS(text) {
  const ESML_TAGS = /^(anim|break|prosody|emph|phoneme|phrase|style|voice)\b/i;
  return text
    // Remove LaTeX inline math: $...$ and $$...$$
    .replace(/\${1,2}[^$]{0,200}\${1,2}/g, '')
    // Strip code fences and inline backticks
    .replace(/```[\s\S]*?```/g, '')
    .replace(/`+/g, '')
    // Strip markdown emphasis markers but keep the words
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(?=\S)(.+?)(?<=\S)\1/g, '$2')
    // Drop any tag that isn't a known ESML tag (e.g. <b>, </p>, etc.)
    .replace(/<\/?([a-zA-Z][^\s>/]*)\b[^>]*\/?>/g, (m, name) =>
      ESML_TAGS.test(name) ? m : '')
    // Collapse extra whitespace
    .replace(/[ \t]+/g, ' ')
    .trim();
}
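// Illustrative examples (assumed inputs; outputs traced from the code above):
//
//   wrapForScreen('the quick brown fox', 10)
//     // → 'the quick\nbrown fox'
//
//   sanitizeForTTS('Say **hi** <b>loudly</b> <break/>')
//     // → 'Say hi loudly <break/>'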
// ── Abort helpers ──────────────────────────────────────────────────────────────

function throwIfAborted(signal) {
  if (signal?.aborted) {
    const err = new Error('Conversation aborted');
    err.code = 'CONVERSATION_ABORTED';
    throw err;
  }
}

function onAbort(signal) {
  if (!signal) return new Promise(() => { }); // never resolves
  return new Promise((_, reject) => {
    const handler = () => {
      const err = new Error('Conversation aborted');
      err.code = 'CONVERSATION_ABORTED';
      reject(err);
    };
    if (signal.aborted) return handler();
    signal.addEventListener('abort', handler, { once: true });
  });
}
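// Illustrative sketch of the cancellation pattern (caller code assumed, not
// defined here): tool work is raced against onAbort(signal), so aborting the
// controller surfaces as err.code === 'CONVERSATION_ABORTED'.
//
//   const controller = new AbortController();
//   const ctx = {};
//   // elsewhere: controller.abort() when the user interrupts
//   try {
//     await executeTool(client, 'listen', { timeout: 15 }, controller.signal, ctx);
//   } catch (err) {
//     if (err.code !== 'CONVERSATION_ABORTED') throw err; // real failure
//   }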
// ── Tool executor ──────────────────────────────────────────────────────────────

/**
 * Execute a single tool call against the Jibo client.
 *
 * Returns { content, image?, endConversation? }.
 *   - content — text string for the tool-result message
 *   - image — optional base64 JPEG (only for take_photo)
 *   - endConversation — optional flag (only for end_conversation)
 *
 * @param {import('rom-control').Client} client
 * @param {string} name Tool function name
 * @param {object} args Parsed arguments
 * @param {AbortSignal} [signal] Cancellation signal
 * @param {object} [ctx] Shared per-conversation state (speechChain, lastHeard)
 * @returns {Promise<{ content: string, image?: string, endConversation?: boolean }>}
 */
async function executeTool(client, name, args, signal, ctx) {
  throwIfAborted(signal);
  ctx = ctx || {};
  if (!ctx.speechChain) ctx.speechChain = Promise.resolve();

  switch (name) {
    // ── Communication ──────────────────────────────────────────────────────
    case 'say': {
      const text = sanitizeForTTS(String(args.text || ''));
      console.log(` [tool:say] "${text}" (queued)`);
      // Estimate ~80ms per char + 5s base, capped at 60s. Anything longer
      // is almost certainly Jibo's TTS hung on bad ESML/markup; we'd rather
      // log a warning and unblock the conversation than deadlock listen.
      const estimateMs = Math.min(60000, 5000 + text.length * 80);
      ctx.speechChain = ctx.speechChain
        .then(() => {
          const started = Date.now();
          console.log(` [tool:say] speaking… (timeout ${estimateMs}ms)`);
          let timer;
          const timeout = new Promise((resolve) => {
            timer = setTimeout(() => {
              console.warn(` [tool:say] timed out after ${estimateMs}ms — continuing.`);
              resolve();
            }, estimateMs);
          });
          return Promise.race([
            client.behavior.say(text, { signal }),
            onAbort(signal),
            timeout,
          ]).finally(() => {
            clearTimeout(timer);
            console.log(` [tool:say] done in ${Date.now() - started}ms`);
          });
        })
        .catch((err) => {
          if (err.code === 'CONVERSATION_ABORTED') return;
          console.error(' [tool:say] error:', err.message);
        });
      return { content: 'Speech queued — Jibo will speak it shortly. Continue with other tools; listen will wait for it.' };
    }

    case 'listen': {
      const ms = (args.timeout || 15) * 1000;
      // Make sure pending speech finishes before we open the mic, otherwise
      // Jibo will hear his own voice.
      console.log(' [tool:listen] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      throwIfAborted(signal);
      console.log(` [tool:listen] waiting ${ms}ms…`);
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: ms }),
          onAbort(signal),
        ]);
        console.log(` [tool:listen] heard: "${speech.content}"`);
        ctx.lastHeard = speech.content;
        return { content: `User said: "${speech.content}"` };
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        if (err.code === 'SPEECH_TIMEOUT') {
          console.log(' [tool:listen] timed out');
          return { content: 'No speech detected — user did not respond.' };
        }
        throw err;
      } finally {
        client.display.showEye();
      }
    }

    // ── Camera ─────────────────────────────────────────────────────────────
    case 'take_photo': {
      const res = RES_MAP[args.resolution] || 'medRes';
      console.log(` [tool:take_photo] ${res}…`);
      const photo = await Promise.race([
        client.camera.takePhoto({ resolution: res, timeout: 30000 }),
        onAbort(signal),
      ]);
      const buf = await photo.fetchBuffer();
      console.log(` [tool:take_photo] ${buf.length} bytes captured`);
      return {
        content: "Photo captured from Jibo's camera.",
        image: buf.toString('base64'),
      };
    }

    // ── Display ────────────────────────────────────────────────────────────
    case 'show_text': {
      console.log(` [tool:show_text] "${args.text}"`);
      client.display.showText(wrapForScreen(args.text, 40, 10));
      return { content: 'Text displayed on screen.' };
    }

    case 'show_image': {
      console.log(` [tool:show_image] ${args.url}`);
      client.display.showImage(args.url);
      return { content: 'Image displayed on screen.' };
    }

    case 'show_eye': {
      console.log(' [tool:show_eye]');
      client.display.showEye();
      return { content: 'Eye animation restored on screen.' };
    }

    case 'look_at_angle': {
      console.log(` [tool:look_at_angle] θ=${args.theta}° ψ=${args.psi}°`);
      await client.behavior.lookAtAngle(args.theta, args.psi);
      return { content: `Now looking at θ=${args.theta}°, ψ=${args.psi}°.` };
    }

    case 'set_volume': {
      console.log(` [tool:set_volume] ${args.level}`);
      await client.audio.setVolume(args.level);
      return { content: `Volume set to ${args.level}.` };
    }

    // ── Web search ─────────────────────────────────────────────────────────
    case 'web_search': {
      const apiKey = process.env.BRAVE_API_KEY;
      if (!apiKey) {
        return {
          content: 'web_search is unavailable: BRAVE_API_KEY environment variable is not set.',
        };
      }
      const query = String(args.query || '').trim();
      if (!query) {
        return { content: 'web_search error: query is required.' };
      }
      const count = Math.max(1, Math.min(10, Number(args.count) || 5));
      const params = new URLSearchParams({
        q: query,
        count: String(count),
        extra_snippets: 'true',
        safesearch: 'moderate',
      });
      if (args.freshness) params.set('freshness', String(args.freshness));

      console.log(` [tool:web_search] "${query}" (count=${count})`);
      const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      try {
        const res = await fetch(url, {
          headers: {
            Accept: 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': apiKey,
          },
          signal: ac.signal,
        });
        if (!res.ok) {
          const body = await res.text().catch(() => '');
          return {
            content: `web_search error: ${res.status} ${res.statusText}. ${body.slice(0, 200)}`,
          };
        }
        const data = await res.json();
        const results = data?.web?.results || [];
        if (results.length === 0) {
          return { content: `No web results found for "${query}".` };
        }
        const lines = results.slice(0, count).map((r, i) => {
          const title = r.title || '(untitled)';
          const u = r.url || '';
          const desc = (r.description || '').replace(/\s+/g, ' ').trim();
          const extras = Array.isArray(r.extra_snippets)
            ? r.extra_snippets.slice(0, 2).map((s) => s.replace(/\s+/g, ' ').trim())
            : [];
          const tail = extras.length ? `\n • ${extras.join('\n • ')}` : '';
          return `${i + 1}. ${title}\n ${u}\n ${desc}${tail}`;
        });
        return {
          content: `Web results for "${query}":\n\n${lines.join('\n\n')}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
        }
        return { content: `web_search error: ${err.message}` };
      } finally {
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }
    case 'fetch_url': {
      const target = String(args.url || '').trim();
      if (!/^https?:\/\//i.test(target)) {
        return { content: 'fetch_url error: url must be an absolute http(s) URL.' };
      }
      const maxChars = Math.max(200, Math.min(20000, Number(args.max_chars) || 4000));
      console.log(` [tool:fetch_url] ${target}`);

      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), 20000);
      try {
        const res = await fetch(target, {
          headers: {
            // Prefer markdown (Cloudflare Markdown for Agents); accept HTML/text fallback.
            Accept: 'text/markdown, text/plain;q=0.9, text/html;q=0.8, */*;q=0.1',
            'Accept-Encoding': 'gzip',
            'User-Agent': 'jibo-llm/1.0 (+agent)',
          },
          redirect: 'follow',
          signal: ac.signal,
        });
        if (!res.ok) {
          return {
            content: `fetch_url error: ${res.status} ${res.statusText} from ${target}`,
          };
        }
        const ctype = (res.headers.get('content-type') || '').toLowerCase();
        if (!/^(text\/|application\/(json|xml|xhtml))/.test(ctype) && ctype) {
          return {
            content: `fetch_url: refusing non-text content (${ctype}) from ${target}`,
          };
        }
        let body = await res.text();
        const isMarkdown = ctype.includes('markdown');
        const isHtml = ctype.includes('html') || /<html[\s>]/i.test(body.slice(0, 500));
        if (!isMarkdown && isHtml) {
          // Lightweight HTML→text: strip scripts/styles/tags, collapse whitespace.
          body = body
            .replace(/<script[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
            .replace(/<!--[\s\S]*?-->/g, ' ')
            .replace(/<\/(p|div|li|h[1-6]|br|tr)>/gi, '\n')
            .replace(/<[^>]+>/g, ' ')
            .replace(/&nbsp;/g, ' ')
            .replace(/&amp;/g, '&')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        }
        const truncated = body.length > maxChars;
        const out = truncated ? body.slice(0, maxChars) + '\n…[truncated]' : body;
        const finalUrl = res.url || target;
        const fmt = isMarkdown ? 'markdown' : isHtml ? 'html→text' : 'text';
        return {
          content: `Fetched ${finalUrl} (${fmt}, ${body.length} chars${truncated ? `, truncated to ${maxChars}` : ''}):\n\n${out}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          if (signal?.aborted) {
            throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
          }
          return { content: `fetch_url error: timeout fetching ${target}` };
        }
        return { content: `fetch_url error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }

    case 'end_conversation': {
      console.log(' [tool:end_conversation] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      return { content: 'Conversation ended.', endConversation: true };
    }

    default:
      return { content: `Unknown tool "${name}".` };
  }
}

module.exports = { TOOL_SCHEMAS, executeTool, wrapForScreen };
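// Illustrative sketch of the speech-queue contract (the `client`, `signal`,
// and `ctx` values are assumed to come from the caller): `say` returns
// immediately after queueing TTS on ctx.speechChain, while `listen` and
// `end_conversation` await that chain, so a farewell can be paired with
// end_conversation in the same turn without being cut off.
//
//   const ctx = {};
//   await executeTool(client, 'say', { text: 'Goodbye!' }, signal, ctx); // queued, returns immediately
//   await executeTool(client, 'end_conversation', {}, signal, ctx);      // waits for the speech to finish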