Files
jibo-llm/tools.js
pasketti 8955f21ab4 Initial commit: jibo-llm hotword-triggered agent
Hotword-triggered LLM conversation loop for Jibo with tool-calling agent
loop, ESML expressive speech, web search/fetch, and per-conversation
abort handling.
2026-04-26 00:05:39 -04:00

570 lines
20 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Tool definitions and executor for the Jibo LLM agent.
*
* Each tool maps to a rom-control capability the LLM can invoke.
*/
// ── OpenAI function-tool schemas ───────────────────────────────────────────────
// Tool manifest handed to the LLM. Descriptions double as prompt guidance:
// they steer the model away from input that can hang Jibo's TTS or misuse
// the rom-control APIs.
const TOOL_SCHEMAS = [
  {
    type: 'function',
    function: {
      name: 'say',
      description:
        "Speak text aloud through Jibo's speaker. Plain text plus valid ESML tags only " +
        '(e.g. <anim cat="happy" nonBlocking="true"/>, <break size="0.3"/>). ' +
        'NEVER include markdown (no *italics*, **bold**, backticks), LaTeX ($...$), ' +
        'unmatched/closing tags like </es>, or other symbols Jibo cannot pronounce. ' +
        'Malformed input can hang the TTS engine. Keep each call under 200 chars.',
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text (or ESML) to speak.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'listen',
      description:
        "Listen for the user's speech and return a transcript. " +
        'Call this after speaking if you want to continue the conversation.',
      parameters: {
        type: 'object',
        properties: {
          timeout: {
            type: 'number',
            description: 'Max seconds to wait. Default 15.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'take_photo',
      description:
        "Take a photo with Jibo's camera. The image is returned so you can see what's in front of you.",
      parameters: {
        type: 'object',
        properties: {
          resolution: {
            type: 'string',
            enum: ['medium', 'low'],
            description: 'Default: medium.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_text',
      description: "Display text on Jibo's screen.",
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text to show.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_image',
      description: "Display an image on Jibo's screen from a URL.",
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Image URL.' },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_eye',
      description: "Reset Jibo's screen to the default eye animation.",
      parameters: { type: 'object', properties: {} },
    },
  },
  {
    type: 'function',
    function: {
      name: 'look_at_angle',
      description: "Turn Jibo's head. theta = yaw (±180°, positive right), psi = pitch (±30°, positive up).",
      parameters: {
        type: 'object',
        properties: {
          theta: { type: 'number', description: 'Yaw degrees.' },
          psi: { type: 'number', description: 'Pitch degrees.' },
        },
        required: ['theta', 'psi'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'set_volume',
      // Fixed garbled range (en dash lost in transit): volume is 0.0–1.0.
      description: "Set Jibo's speaker volume (0.0–1.0).",
      parameters: {
        type: 'object',
        properties: {
          level: { type: 'number', description: 'Volume 0.0 to 1.0.' },
        },
        required: ['level'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'web_search',
      description:
        'Search the web via Brave Search. Use for current events, facts you are unsure of, ' +
        'or anything that may have changed since training. Returns titles, URLs, and snippets.',
      parameters: {
        type: 'object',
        properties: {
          query: { type: 'string', description: 'The search query.' },
          count: {
            type: 'number',
            // Fixed garbled range (en dash lost in transit): valid counts are 1–10,
            // matching the clamp in executeTool's web_search case.
            description: 'How many results to return (1–10). Default 5.',
          },
          freshness: {
            type: 'string',
            enum: ['pd', 'pw', 'pm', 'py'],
            description:
              'Optional recency filter: pd=past day, pw=past week, pm=past month, py=past year.',
          },
        },
        required: ['query'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'fetch_url',
      description:
        'Fetch the contents of a web page by URL. Prefers markdown via content ' +
        'negotiation (Cloudflare Markdown for Agents) and falls back to HTML→text. ' +
        'Use after web_search to read a result, or to traverse linked pages.',
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Absolute http(s) URL to fetch.' },
          max_chars: {
            type: 'number',
            description: 'Truncate the body to this many characters. Default 4000.',
          },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'end_conversation',
      description:
        'Call this when the conversation has reached a natural end and you do NOT want to ' +
        'listen for another reply. Pair it with a final "say" in the same turn for a farewell.',
      parameters: { type: 'object', properties: {} },
    },
  },
];
// ── Resolution map ─────────────────────────────────────────────────────────────
// Maps the tool-facing resolution names to rom-control's camera constants.
// NOTE(review): 'high' is mapped here but absent from the take_photo schema
// enum — presumably a tolerant fallback if the model guesses it; confirm.
const RES_MAP = { high: 'highRes', medium: 'medRes', low: 'lowRes' };
// ── Screen text helpers ────────────────────────────────────────────────────────
/**
 * Word-wrap `text` to fit Jibo's small display.
 *
 * Existing newlines are kept as paragraph breaks, words longer than `width`
 * are hard-split into chunks, and output beyond `maxLines` is replaced by a
 * single ellipsis line.
 *
 * @param {*} text Value to render (coerced to string).
 * @param {number} [width=40] Maximum characters per line.
 * @param {number} [maxLines=10] Maximum lines before truncation.
 * @returns {string} Newline-joined wrapped text.
 */
function wrapForScreen(text, width = 40, maxLines = 10) {
  const lines = [];
  for (const paragraph of String(text).split('\n')) {
    if (paragraph === '') {
      lines.push(''); // preserve blank lines between paragraphs
      continue;
    }
    let current = '';
    const flush = () => {
      if (current) {
        lines.push(current);
        current = '';
      }
    };
    for (const word of paragraph.split(/\s+/).filter(Boolean)) {
      if (word.length > width) {
        // Oversized word: emit the pending line, then hard-split the word.
        // A final partial chunk becomes the new pending line so later words
        // can join it.
        flush();
        for (let i = 0; i < word.length; i += width) {
          const piece = word.slice(i, i + width);
          if (piece.length === width) lines.push(piece);
          else current = piece;
        }
      } else if (!current) {
        current = word;
      } else if (current.length + 1 + word.length > width) {
        lines.push(current);
        current = word;
      } else {
        current = `${current} ${word}`;
      }
    }
    flush();
  }
  if (lines.length <= maxLines) return lines.join('\n');
  return [...lines.slice(0, maxLines - 1), '…'].join('\n');
}
/**
 * Scrub markup that can hang Jibo's TTS engine: markdown emphasis and code,
 * LaTeX math, and any tag other than the whitelisted self-closing ESML tags
 * (<anim .../>, <break .../>, etc.). Defense-in-depth for models that ignore
 * the "say" tool's instructions.
 *
 * @param {string} text Raw text proposed by the model.
 * @returns {string} Sanitized, whitespace-collapsed text safe to speak.
 */
function sanitizeForTTS(text) {
  // Tag names the Jibo TTS engine understands; everything else is stripped.
  const ESML_TAGS = /^(anim|break|prosody|emph|phoneme|phrase|style|voice)\b/i;

  let clean = text;
  // Drop LaTeX inline/display math ($...$, $$...$$) entirely.
  clean = clean.replace(/\${1,2}[^$]{0,200}\${1,2}/g, '');
  // Remove fenced code blocks, then any stray backticks.
  clean = clean.replace(/```[\s\S]*?```/g, '');
  clean = clean.replace(/`+/g, '');
  // Unwrap markdown bold/italic markers, keeping the inner words.
  clean = clean.replace(/(\*\*|__)(.*?)\1/g, '$2');
  clean = clean.replace(/(\*|_)(?=\S)(.+?)(?<=\S)\1/g, '$2');
  // Delete any tag whose name is not a known ESML tag (e.g. </es>, <br>).
  clean = clean.replace(/<\/?([a-zA-Z][^\s>/]*)\b[^>]*\/?>/g, (tag, tagName) =>
    ESML_TAGS.test(tagName) ? tag : '');
  // Squash runs of spaces/tabs and trim the ends.
  return clean.replace(/[ \t]+/g, ' ').trim();
}
// ── Abort helpers ──────────────────────────────────────────────────────────────
/**
 * Throw a CONVERSATION_ABORTED error if the signal has already fired.
 * A missing signal is treated as "never aborted".
 *
 * @param {AbortSignal} [signal]
 * @throws {Error} with `code === 'CONVERSATION_ABORTED'` when aborted.
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
/**
 * Promise that rejects with a CONVERSATION_ABORTED error when `signal`
 * aborts; if the signal already fired, it rejects immediately. With no
 * signal it never settles, which makes it a no-op arm in Promise.race.
 *
 * @param {AbortSignal} [signal]
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) return new Promise(() => { }); // no signal: never settles
  const abortError = () =>
    Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
  return new Promise((_, reject) => {
    if (signal.aborted) {
      reject(abortError());
      return;
    }
    signal.addEventListener('abort', () => reject(abortError()), { once: true });
  });
}
// ── Tool executor ──────────────────────────────────────────────────────────────
/**
 * Execute a single tool call against the Jibo client.
 *
 * Returns { content, image?, endConversation? }.
 * - content — text string for the tool-result message
 * - image — optional base64 JPEG (only for take_photo)
 * - endConversation — true only for the end_conversation tool
 *
 * @param {import('rom-control').Client} client
 * @param {string} name Tool function name
 * @param {object} args Parsed arguments
 * @param {AbortSignal} [signal] Cancellation signal
 * @param {object} [ctx] Mutable per-conversation state shared across calls.
 *   This function lazily creates `ctx.speechChain` (a promise that
 *   serializes queued `say` calls) and records `ctx.lastHeard` after a
 *   successful listen.
 * @returns {Promise<{ content: string, image?: string, endConversation?: boolean }>}
 */
async function executeTool(client, name, args, signal, ctx) {
  throwIfAborted(signal);
  ctx = ctx || {};
  // speechChain serializes all `say` calls so speech plays strictly in
  // order; `listen` and `end_conversation` await it before proceeding.
  if (!ctx.speechChain) ctx.speechChain = Promise.resolve();
  switch (name) {
    // ── Communication ──────────────────────────────────────────────────────
    case 'say': {
      const text = sanitizeForTTS(String(args.text || ''));
      console.log(` [tool:say] "${text}" (queued)`);
      // Estimate ~80ms per char + 5s base, capped at 60s. Anything longer
      // is almost certainly Jibo's TTS hung on bad ESML/markup; we'd rather
      // log a warning and unblock the conversation than deadlock listen.
      const estimateMs = Math.min(60000, 5000 + text.length * 80);
      ctx.speechChain = ctx.speechChain
        .then(() => {
          const started = Date.now();
          console.log(` [tool:say] speaking… (timeout ${estimateMs}ms)`);
          let timer;
          // Watchdog arm: resolves (never rejects) after estimateMs so a
          // hung say() cannot block the chain forever.
          const timeout = new Promise((resolve) => {
            timer = setTimeout(() => {
              console.warn(` [tool:say] timed out after ${estimateMs}ms — continuing.`);
              resolve();
            }, estimateMs);
          });
          return Promise.race([
            client.behavior.say(text, { signal }),
            onAbort(signal),
            timeout,
          ]).finally(() => {
            clearTimeout(timer);
            console.log(` [tool:say] done in ${Date.now() - started}ms`);
          });
        })
        .catch((err) => {
          // Log-and-swallow: one failed utterance must not poison the chain
          // for every later say/listen in this conversation.
          if (err.code === 'CONVERSATION_ABORTED') return;
          console.error(' [tool:say] error:', err.message);
        });
      // Return immediately — speech is queued, not awaited, by design.
      return { content: 'Speech queued — Jibo will speak it shortly. Continue with other tools; listen will wait for it.' };
    }
    case 'listen': {
      const ms = (args.timeout || 15) * 1000;
      // Make sure pending speech finishes before we open the mic, otherwise
      // Jibo will hear his own voice.
      console.log(' [tool:listen] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      throwIfAborted(signal);
      console.log(` [tool:listen] waiting ${ms}ms…`);
      // Not awaited — display update is best-effort while the mic opens.
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: ms }),
          onAbort(signal),
        ]);
        console.log(` [tool:listen] heard: "${speech.content}"`);
        ctx.lastHeard = speech.content;
        return { content: `User said: "${speech.content}"` };
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // A listen timeout is a normal outcome, not an error.
        if (err.code === 'SPEECH_TIMEOUT') {
          console.log(' [tool:listen] timed out');
          return { content: 'No speech detected — user did not respond.' };
        }
        throw err;
      } finally {
        // Always restore the idle eye, even on abort/error.
        client.display.showEye();
      }
    }
    // ── Camera ─────────────────────────────────────────────────────────────
    case 'take_photo': {
      // Unknown/missing resolution falls back to medium.
      const res = RES_MAP[args.resolution] || 'medRes';
      console.log(` [tool:take_photo] ${res}`);
      const photo = await Promise.race([
        client.camera.takePhoto({ resolution: res, timeout: 30000 }),
        onAbort(signal),
      ]);
      const buf = await photo.fetchBuffer();
      console.log(` [tool:take_photo] ${buf.length} bytes captured`);
      return {
        content: "Photo captured from Jibo's camera.",
        image: buf.toString('base64'),
      };
    }
    // ── Display ────────────────────────────────────────────────────────────
    case 'show_text': {
      console.log(` [tool:show_text] "${args.text}"`);
      client.display.showText(wrapForScreen(args.text, 40, 10));
      return { content: 'Text displayed on screen.' };
    }
    case 'show_image': {
      console.log(` [tool:show_image] ${args.url}`);
      client.display.showImage(args.url);
      return { content: 'Image displayed on screen.' };
    }
    case 'show_eye': {
      console.log(' [tool:show_eye]');
      client.display.showEye();
      return { content: 'Eye animation restored on screen.' };
    }
    case 'look_at_angle': {
      console.log(` [tool:look_at_angle] θ=${args.theta}° ψ=${args.psi}°`);
      await client.behavior.lookAtAngle(args.theta, args.psi);
      return { content: `Now looking at θ=${args.theta}°, ψ=${args.psi}°.` };
    }
    case 'set_volume': {
      console.log(` [tool:set_volume] ${args.level}`);
      await client.audio.setVolume(args.level);
      return { content: `Volume set to ${args.level}.` };
    }
    // ── Web search ─────────────────────────────────────────────────────────
    case 'web_search': {
      const apiKey = process.env.BRAVE_API_KEY;
      // Missing key is reported to the model as tool output, not thrown.
      if (!apiKey) {
        return {
          content:
            'web_search is unavailable: BRAVE_API_KEY environment variable is not set.',
        };
      }
      const query = String(args.query || '').trim();
      if (!query) {
        return { content: 'web_search error: query is required.' };
      }
      // Clamp to Brave's supported 1–10 result range.
      const count = Math.max(1, Math.min(10, Number(args.count) || 5));
      const params = new URLSearchParams({
        q: query,
        count: String(count),
        extra_snippets: 'true',
        safesearch: 'moderate',
      });
      if (args.freshness) params.set('freshness', String(args.freshness));
      console.log(` [tool:web_search] "${query}" (count=${count})`);
      const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;
      // Bridge the conversation signal onto a fetch-local controller so an
      // abort cancels the in-flight HTTP request.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      try {
        const res = await fetch(url, {
          headers: {
            Accept: 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': apiKey,
          },
          signal: ac.signal,
        });
        if (!res.ok) {
          const body = await res.text().catch(() => '');
          return {
            content: `web_search error: ${res.status} ${res.statusText}. ${body.slice(0, 200)}`,
          };
        }
        const data = await res.json();
        const results = data?.web?.results || [];
        if (results.length === 0) {
          return { content: `No web results found for "${query}".` };
        }
        // Format each hit as: "N. title / url / snippet [+ up to 2 extras]".
        const lines = results.slice(0, count).map((r, i) => {
          const title = r.title || '(untitled)';
          const u = r.url || '';
          const desc = (r.description || '').replace(/\s+/g, ' ').trim();
          const extras = Array.isArray(r.extra_snippets)
            ? r.extra_snippets.slice(0, 2).map((s) => s.replace(/\s+/g, ' ').trim())
            : [];
          const tail = extras.length ? `\n${extras.join('\n • ')}` : '';
          return `${i + 1}. ${title}\n ${u}\n ${desc}${tail}`;
        });
        return {
          content: `Web results for "${query}":\n\n${lines.join('\n\n')}`,
        };
      } catch (err) {
        // The only way ac aborts here is via the conversation signal, so an
        // AbortError is always a conversation abort.
        if (err.name === 'AbortError') throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
        return { content: `web_search error: ${err.message}` };
      } finally {
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }
    case 'fetch_url': {
      const target = String(args.url || '').trim();
      if (!/^https?:\/\//i.test(target)) {
        return { content: 'fetch_url error: url must be an absolute http(s) URL.' };
      }
      // Clamp truncation to a sane 200–20000 char window (default 4000).
      const maxChars = Math.max(200, Math.min(20000, Number(args.max_chars) || 4000));
      console.log(` [tool:fetch_url] ${target}`);
      // One controller, two triggers: conversation abort and a 20s hard
      // timeout. The catch block tells them apart via signal.aborted.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), 20000);
      try {
        const res = await fetch(target, {
          headers: {
            // Prefer markdown (Cloudflare Markdown for Agents); accept HTML/text fallback.
            Accept: 'text/markdown, text/plain;q=0.9, text/html;q=0.8, */*;q=0.1',
            'Accept-Encoding': 'gzip',
            'User-Agent': 'jibo-llm/1.0 (+agent)',
          },
          redirect: 'follow',
          signal: ac.signal,
        });
        if (!res.ok) {
          return {
            content: `fetch_url error: ${res.status} ${res.statusText} from ${target}`,
          };
        }
        const ctype = (res.headers.get('content-type') || '').toLowerCase();
        // Refuse binary payloads (images, PDFs, …); empty content-type is allowed.
        if (!/^(text\/|application\/(json|xml|xhtml))/.test(ctype) && ctype) {
          return {
            content: `fetch_url: refusing non-text content (${ctype}) from ${target}`,
          };
        }
        let body = await res.text();
        const isMarkdown = ctype.includes('markdown');
        // Sniff the body too in case the server mislabels HTML.
        const isHtml = ctype.includes('html') || /<html[\s>]/i.test(body.slice(0, 500));
        if (!isMarkdown && isHtml) {
          // Lightweight HTML→text: strip scripts/styles/tags, collapse whitespace.
          body = body
            .replace(/<script[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
            .replace(/<!--[\s\S]*?-->/g, ' ')
            .replace(/<\/(p|div|li|h[1-6]|br|tr)>/gi, '\n')
            .replace(/<[^>]+>/g, ' ')
            .replace(/&nbsp;/g, ' ')
            .replace(/&amp;/g, '&')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        }
        const truncated = body.length > maxChars;
        const out = truncated ? body.slice(0, maxChars) + '\n…[truncated]' : body;
        // res.url reflects the final URL after redirects.
        const finalUrl = res.url || target;
        const fmt = isMarkdown ? 'markdown' : isHtml ? 'html→text' : 'text';
        return {
          content: `Fetched ${finalUrl} (${fmt}, ${body.length} chars${truncated ? `, truncated to ${maxChars}` : ''}):\n\n${out}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          // signal.aborted distinguishes a conversation abort (rethrow)
          // from the 20s timeout (report as tool output).
          if (signal?.aborted) {
            throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
          }
          return { content: `fetch_url error: timeout fetching ${target}` };
        }
        return { content: `fetch_url error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }
    case 'end_conversation': {
      // Let any queued farewell finish speaking before the loop shuts down.
      console.log(' [tool:end_conversation] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      return { content: 'Conversation ended.', endConversation: true };
    }
    default:
      // Unknown tool names are reported back to the model, not thrown.
      return { content: `Unknown tool "${name}".` };
  }
}
module.exports = { TOOL_SCHEMAS, executeTool, wrapForScreen };