Initial commit: jibo-llm hotword-triggered agent

Hotword-triggered LLM conversation loop for Jibo, with a tool-calling agent,
ESML expressive speech, web search/fetch, and per-conversation
abort handling.
This commit is contained in:
pasketti
2026-04-26 00:05:39 -04:00
commit 8955f21ab4
8 changed files with 2039 additions and 0 deletions

569
tools.js Normal file
View File

@@ -0,0 +1,569 @@
/**
 * Tool definitions and executor for the Jibo LLM agent.
 *
 * Each tool maps to a rom-control capability the LLM can invoke.
 */
// ── OpenAI function-tool schemas ───────────────────────────────────────────────
// These descriptions are read by the model, so they must be accurate: the
// numeric ranges below match the clamping done in executeTool.
const TOOL_SCHEMAS = [
  {
    type: 'function',
    function: {
      name: 'say',
      description:
        "Speak text aloud through Jibo's speaker. Plain text plus valid ESML tags only " +
        '(e.g. <anim cat="happy" nonBlocking="true"/>, <break size="0.3"/>). ' +
        'NEVER include markdown (no *italics*, **bold**, backticks), LaTeX ($...$), ' +
        'unmatched/closing tags like </es>, or other symbols Jibo cannot pronounce. ' +
        'Malformed input can hang the TTS engine. Keep each call under 200 chars.',
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text (or ESML) to speak.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'listen',
      description:
        "Listen for the user's speech and return a transcript. " +
        'Call this after speaking if you want to continue the conversation.',
      parameters: {
        type: 'object',
        properties: {
          timeout: {
            type: 'number',
            description: 'Max seconds to wait. Default 15.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'take_photo',
      description:
        "Take a photo with Jibo's camera. The image is returned so you can see what's in front of you.",
      parameters: {
        type: 'object',
        properties: {
          resolution: {
            type: 'string',
            // NOTE(review): RES_MAP also supports 'high' — presumably kept out
            // of the enum to limit payload size; confirm before widening.
            enum: ['medium', 'low'],
            description: 'Default: medium.',
          },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_text',
      description: "Display text on Jibo's screen.",
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text to show.' },
        },
        required: ['text'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_image',
      description: "Display an image on Jibo's screen from a URL.",
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Image URL.' },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'show_eye',
      description: "Reset Jibo's screen to the default eye animation.",
      parameters: { type: 'object', properties: {} },
    },
  },
  {
    type: 'function',
    function: {
      name: 'look_at_angle',
      description: "Turn Jibo's head. theta = yaw (±180°, positive right), psi = pitch (±30°, positive up).",
      parameters: {
        type: 'object',
        properties: {
          theta: { type: 'number', description: 'Yaw degrees.' },
          psi: { type: 'number', description: 'Pitch degrees.' },
        },
        required: ['theta', 'psi'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'set_volume',
      // Fixed garbled range text: was "(0.0 1.0)".
      description: "Set Jibo's speaker volume (0.0 to 1.0).",
      parameters: {
        type: 'object',
        properties: {
          level: { type: 'number', description: 'Volume 0.0 to 1.0.' },
        },
        required: ['level'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'web_search',
      description:
        'Search the web via Brave Search. Use for current events, facts you are unsure of, ' +
        'or anything that may have changed since training. Returns titles, URLs, and snippets.',
      parameters: {
        type: 'object',
        properties: {
          query: { type: 'string', description: 'The search query.' },
          count: {
            type: 'number',
            // Fixed garbled range text: was "(110)"; executor clamps to 1-10.
            description: 'How many results to return (1-10). Default 5.',
          },
          freshness: {
            type: 'string',
            enum: ['pd', 'pw', 'pm', 'py'],
            description:
              'Optional recency filter: pd=past day, pw=past week, pm=past month, py=past year.',
          },
        },
        required: ['query'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'fetch_url',
      description:
        'Fetch the contents of a web page by URL. Prefers markdown via content ' +
        'negotiation (Cloudflare Markdown for Agents) and falls back to HTML→text. ' +
        'Use after web_search to read a result, or to traverse linked pages.',
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Absolute http(s) URL to fetch.' },
          max_chars: {
            type: 'number',
            description: 'Truncate the body to this many characters. Default 4000.',
          },
        },
        required: ['url'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'end_conversation',
      description:
        'Call this when the conversation has reached a natural end and you do NOT want to ' +
        'listen for another reply. Pair it with a final "say" in the same turn for a farewell.',
      parameters: { type: 'object', properties: {} },
    },
  },
];
// ── Resolution map ─────────────────────────────────────────────────────────────
// Maps schema resolution names to rom-control camera resolution ids.
// Includes 'high' → 'highRes' even though the take_photo schema enum only
// exposes medium/low — presumably a deliberate cap; confirm before widening.
const RES_MAP = { high: 'highRes', medium: 'medRes', low: 'lowRes' };
// ── Screen text helpers ────────────────────────────────────────────────────────
/**
 * Word-wrap text for Jibo's small screen.
 *
 * Preserves blank lines from the input, hard-breaks words longer than
 * `width`, and caps output at `maxLines` lines (the last visible line is
 * replaced by an ellipsis when truncating).
 *
 * @param {string} text     Input text (coerced to string).
 * @param {number} width    Maximum characters per line. Default 40.
 * @param {number} maxLines Maximum lines before truncation. Default 10.
 * @returns {string} Wrapped text joined with newlines.
 */
function wrapForScreen(text, width = 40, maxLines = 10) {
  const lines = [];
  for (const paragraph of String(text).split('\n')) {
    if (paragraph === '') {
      lines.push(''); // keep deliberate blank lines
      continue;
    }
    let current = '';
    const words = paragraph.split(/\s+/).filter(Boolean);
    for (const word of words) {
      if (word.length > width) {
        // Flush what we have, then hard-break the oversized word.
        if (current !== '') {
          lines.push(current);
          current = '';
        }
        let i = 0;
        while (i < word.length) {
          const piece = word.slice(i, i + width);
          if (piece.length === width) {
            lines.push(piece);
          } else {
            current = piece; // trailing remainder continues the line
          }
          i += width;
        }
        continue;
      }
      const joined = current === '' ? word : `${current} ${word}`;
      if (joined.length > width) {
        lines.push(current);
        current = word;
      } else {
        current = joined;
      }
    }
    if (current !== '') lines.push(current);
  }
  if (lines.length > maxLines) {
    return [...lines.slice(0, maxLines - 1), '…'].join('\n');
  }
  return lines.join('\n');
}
/**
 * Strip markup the Jibo TTS engine chokes on (markdown, LaTeX, unmatched
 * closing tags). Valid self-closing ESML tags such as <anim .../> and
 * <break .../> pass through untouched. Defense-in-depth against models
 * that ignore the prompt instructions.
 *
 * @param {string} text Raw text the model asked Jibo to say.
 * @returns {string} Text safe to hand to the TTS engine.
 */
function sanitizeForTTS(text) {
  const KNOWN_ESML = /^(anim|break|prosody|emph|phoneme|phrase|style|voice)\b/i;
  let cleaned = text;
  // Drop LaTeX inline/display math: $...$ and $$...$$.
  cleaned = cleaned.replace(/\${1,2}[^$]{0,200}\${1,2}/g, '');
  // Remove fenced code blocks wholesale, then any stray backticks.
  cleaned = cleaned.replace(/```[\s\S]*?```/g, '');
  cleaned = cleaned.replace(/`+/g, '');
  // Unwrap markdown bold/italic, keeping the inner words.
  cleaned = cleaned.replace(/(\*\*|__)(.*?)\1/g, '$2');
  cleaned = cleaned.replace(/(\*|_)(?=\S)(.+?)(?<=\S)\1/g, '$2');
  // Delete any tag whose name is not a known ESML tag (</es>, <br>, …).
  cleaned = cleaned.replace(/<\/?([a-zA-Z][^\s>/]*)\b[^>]*\/?>/g, (tag, tagName) =>
    KNOWN_ESML.test(tagName) ? tag : '');
  // Normalize runs of spaces/tabs and trim the edges.
  return cleaned.replace(/[ \t]+/g, ' ').trim();
}
// ── Abort helpers ──────────────────────────────────────────────────────────────
/**
 * Synchronously raise if the conversation's AbortSignal has already fired.
 *
 * @param {AbortSignal} [signal] Optional cancellation signal.
 * @throws {Error} Error with code 'CONVERSATION_ABORTED' when aborted.
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
/**
 * Promise that rejects (code 'CONVERSATION_ABORTED') when `signal` fires.
 * With no signal it returns a promise that never settles, so callers can
 * always use it as the losing side of a Promise.race.
 *
 * NOTE(review): the 'abort' listener is never removed when the raced
 * operation wins, so repeated calls against one long-lived signal
 * accumulate listeners — confirm this is acceptable for conversation
 * lifetimes.
 *
 * @param {AbortSignal} [signal] Optional cancellation signal.
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) {
    return new Promise(() => { }); // intentionally never settles
  }
  return new Promise((_resolve, reject) => {
    const rejectAborted = () => {
      const abortErr = new Error('Conversation aborted');
      abortErr.code = 'CONVERSATION_ABORTED';
      reject(abortErr);
    };
    if (signal.aborted) {
      rejectAborted(); // already aborted: reject immediately
      return;
    }
    signal.addEventListener('abort', rejectAborted, { once: true });
  });
}
// ── Tool executor ──────────────────────────────────────────────────────────────
/**
 * Execute a single tool call against the Jibo client.
 *
 * Returns { content, image?, endConversation? }.
 * - content         — text string for the tool-result message
 * - image           — optional base64 JPEG (only for take_photo)
 * - endConversation — true only from the end_conversation tool
 *
 * @param {import('rom-control').Client} client
 * @param {string} name Tool function name
 * @param {object} args Parsed arguments
 * @param {AbortSignal} [signal] Cancellation signal
 * @param {object} [ctx] Per-conversation state: `ctx.speechChain` serializes
 *   queued `say` calls; `ctx.lastHeard` records the latest transcript.
 * @returns {Promise<{ content: string, image?: string, endConversation?: boolean }>}
 */
async function executeTool(client, name, args, signal, ctx) {
  throwIfAborted(signal);
  ctx = ctx || {};
  // speechChain serializes queued speech; `say` appends to it and
  // `listen`/`end_conversation` await it before opening the mic / ending.
  if (!ctx.speechChain) ctx.speechChain = Promise.resolve();
  switch (name) {
    // ── Communication ──────────────────────────────────────────────────────
    case 'say': {
      const text = sanitizeForTTS(String(args.text || ''));
      console.log(` [tool:say] "${text}" (queued)`);
      // Estimate ~80ms per char + 5s base, capped at 60s. Anything longer
      // is almost certainly Jibo's TTS hung on bad ESML/markup; we'd rather
      // log a warning and unblock the conversation than deadlock listen.
      const estimateMs = Math.min(60000, 5000 + text.length * 80);
      ctx.speechChain = ctx.speechChain
        .then(() => {
          const started = Date.now();
          console.log(` [tool:say] speaking… (timeout ${estimateMs}ms)`);
          let timer;
          const timeout = new Promise((resolve) => {
            timer = setTimeout(() => {
              console.warn(` [tool:say] timed out after ${estimateMs}ms — continuing.`);
              resolve();
            }, estimateMs);
          });
          // First of: speech finishes, conversation aborts, or the estimate
          // elapses; the timer is always cleared in finally.
          return Promise.race([
            client.behavior.say(text, { signal }),
            onAbort(signal),
            timeout,
          ]).finally(() => {
            clearTimeout(timer);
            console.log(` [tool:say] done in ${Date.now() - started}ms`);
          });
        })
        .catch((err) => {
          // Swallow errors so one failed utterance can't poison the chain
          // for later say/listen calls; aborts are expected and stay quiet.
          if (err.code === 'CONVERSATION_ABORTED') return;
          console.error(' [tool:say] error:', err.message);
        });
      return { content: 'Speech queued — Jibo will speak it shortly. Continue with other tools; listen will wait for it.' };
    }
    case 'listen': {
      const ms = (args.timeout || 15) * 1000;
      // Make sure pending speech finishes before we open the mic, otherwise
      // Jibo will hear his own voice.
      console.log(' [tool:listen] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      throwIfAborted(signal);
      console.log(` [tool:listen] waiting ${ms}ms…`);
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: ms }),
          onAbort(signal),
        ]);
        console.log(` [tool:listen] heard: "${speech.content}"`);
        ctx.lastHeard = speech.content;
        return { content: `User said: "${speech.content}"` };
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // A listen timeout is a normal outcome, not a failure — report it
        // to the model as content so it can decide what to do next.
        if (err.code === 'SPEECH_TIMEOUT') {
          console.log(' [tool:listen] timed out');
          return { content: 'No speech detected — user did not respond.' };
        }
        throw err;
      } finally {
        // Always restore the idle eye, even on abort/error.
        client.display.showEye();
      }
    }
    // ── Camera ─────────────────────────────────────────────────────────────
    case 'take_photo': {
      const res = RES_MAP[args.resolution] || 'medRes';
      console.log(` [tool:take_photo] ${res}`);
      const photo = await Promise.race([
        client.camera.takePhoto({ resolution: res, timeout: 30000 }),
        onAbort(signal),
      ]);
      const buf = await photo.fetchBuffer();
      console.log(` [tool:take_photo] ${buf.length} bytes captured`);
      return {
        content: "Photo captured from Jibo's camera.",
        image: buf.toString('base64'),
      };
    }
    // ── Display ────────────────────────────────────────────────────────────
    case 'show_text': {
      console.log(` [tool:show_text] "${args.text}"`);
      client.display.showText(wrapForScreen(args.text, 40, 10));
      return { content: 'Text displayed on screen.' };
    }
    case 'show_image': {
      console.log(` [tool:show_image] ${args.url}`);
      client.display.showImage(args.url);
      return { content: 'Image displayed on screen.' };
    }
    case 'show_eye': {
      console.log(' [tool:show_eye]');
      client.display.showEye();
      return { content: 'Eye animation restored on screen.' };
    }
    case 'look_at_angle': {
      console.log(` [tool:look_at_angle] θ=${args.theta}° ψ=${args.psi}°`);
      await client.behavior.lookAtAngle(args.theta, args.psi);
      return { content: `Now looking at θ=${args.theta}°, ψ=${args.psi}°.` };
    }
    case 'set_volume': {
      console.log(` [tool:set_volume] ${args.level}`);
      await client.audio.setVolume(args.level);
      return { content: `Volume set to ${args.level}.` };
    }
    // ── Web search ─────────────────────────────────────────────────────────
    case 'web_search': {
      const apiKey = process.env.BRAVE_API_KEY;
      // Missing key is reported to the model as content, not thrown — the
      // agent can still answer from its own knowledge.
      if (!apiKey) {
        return {
          content:
            'web_search is unavailable: BRAVE_API_KEY environment variable is not set.',
        };
      }
      const query = String(args.query || '').trim();
      if (!query) {
        return { content: 'web_search error: query is required.' };
      }
      // Clamp to Brave's supported 1-10 result range.
      const count = Math.max(1, Math.min(10, Number(args.count) || 5));
      const params = new URLSearchParams({
        q: query,
        count: String(count),
        extra_snippets: 'true',
        safesearch: 'moderate',
      });
      if (args.freshness) params.set('freshness', String(args.freshness));
      console.log(` [tool:web_search] "${query}" (count=${count})`);
      const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;
      // Bridge the conversation signal onto fetch's own controller so an
      // abort cancels the HTTP request in flight.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      try {
        const res = await fetch(url, {
          headers: {
            Accept: 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': apiKey,
          },
          signal: ac.signal,
        });
        if (!res.ok) {
          const body = await res.text().catch(() => '');
          return {
            content: `web_search error: ${res.status} ${res.statusText}. ${body.slice(0, 200)}`,
          };
        }
        const data = await res.json();
        const results = data?.web?.results || [];
        if (results.length === 0) {
          return { content: `No web results found for "${query}".` };
        }
        // Format each hit as: "N. title / url / description [+extra snippets]".
        const lines = results.slice(0, count).map((r, i) => {
          const title = r.title || '(untitled)';
          const u = r.url || '';
          const desc = (r.description || '').replace(/\s+/g, ' ').trim();
          const extras = Array.isArray(r.extra_snippets)
            ? r.extra_snippets.slice(0, 2).map((s) => s.replace(/\s+/g, ' ').trim())
            : [];
          const tail = extras.length ? `\n${extras.join('\n • ')}` : '';
          return `${i + 1}. ${title}\n ${u}\n ${desc}${tail}`;
        });
        return {
          content: `Web results for "${query}":\n\n${lines.join('\n\n')}`,
        };
      } catch (err) {
        // An AbortError here means the conversation was cancelled.
        if (err.name === 'AbortError') throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
        return { content: `web_search error: ${err.message}` };
      } finally {
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }
    case 'fetch_url': {
      const target = String(args.url || '').trim();
      if (!/^https?:\/\//i.test(target)) {
        return { content: 'fetch_url error: url must be an absolute http(s) URL.' };
      }
      // Clamp body truncation to 200..20000 chars.
      const maxChars = Math.max(200, Math.min(20000, Number(args.max_chars) || 4000));
      console.log(` [tool:fetch_url] ${target}`);
      // Same signal-bridging as web_search, plus a hard 20s request timeout
      // driven through the same controller.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), 20000);
      try {
        const res = await fetch(target, {
          headers: {
            // Prefer markdown (Cloudflare Markdown for Agents); accept HTML/text fallback.
            Accept: 'text/markdown, text/plain;q=0.9, text/html;q=0.8, */*;q=0.1',
            'Accept-Encoding': 'gzip',
            'User-Agent': 'jibo-llm/1.0 (+agent)',
          },
          redirect: 'follow',
          signal: ac.signal,
        });
        if (!res.ok) {
          return {
            content: `fetch_url error: ${res.status} ${res.statusText} from ${target}`,
          };
        }
        // Refuse binary payloads (images, PDFs, …) up front.
        const ctype = (res.headers.get('content-type') || '').toLowerCase();
        if (!/^(text\/|application\/(json|xml|xhtml))/.test(ctype) && ctype) {
          return {
            content: `fetch_url: refusing non-text content (${ctype}) from ${target}`,
          };
        }
        let body = await res.text();
        const isMarkdown = ctype.includes('markdown');
        const isHtml = ctype.includes('html') || /<html[\s>]/i.test(body.slice(0, 500));
        if (!isMarkdown && isHtml) {
          // Lightweight HTML→text: strip scripts/styles/tags, collapse whitespace.
          body = body
            .replace(/<script[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
            .replace(/<!--[\s\S]*?-->/g, ' ')
            .replace(/<\/(p|div|li|h[1-6]|br|tr)>/gi, '\n')
            .replace(/<[^>]+>/g, ' ')
            .replace(/&nbsp;/g, ' ')
            .replace(/&amp;/g, '&')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        }
        const truncated = body.length > maxChars;
        const out = truncated ? body.slice(0, maxChars) + '\n…[truncated]' : body;
        // res.url reflects the final URL after redirects.
        const finalUrl = res.url || target;
        const fmt = isMarkdown ? 'markdown' : isHtml ? 'html→text' : 'text';
        return {
          content: `Fetched ${finalUrl} (${fmt}, ${body.length} chars${truncated ? `, truncated to ${maxChars}` : ''}):\n\n${out}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          // Distinguish conversation abort from the 20s request timeout.
          if (signal?.aborted) {
            throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
          }
          return { content: `fetch_url error: timeout fetching ${target}` };
        }
        return { content: `fetch_url error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }
    case 'end_conversation': {
      // Let any queued farewell finish speaking before the loop shuts down.
      console.log(' [tool:end_conversation] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      // endConversation: true tells the agent loop to stop after this turn.
      return { content: 'Conversation ended.', endConversation: true };
    }
    default:
      return { content: `Unknown tool "${name}".` };
  }
}
// Also export sanitizeForTTS: like wrapForScreen it is a pure text helper,
// and exporting it makes it unit-testable (backward-compatible addition).
module.exports = { TOOL_SCHEMAS, executeTool, wrapForScreen, sanitizeForTTS };