Initial commit: jibo-llm hotword-triggered agent
Hotword-triggered LLM conversation loop for Jibo with tool-calling agent loop, ESML expressive speech, web search/fetch, and per-conversation abort handling.
This commit is contained in:
569
tools.js
Normal file
569
tools.js
Normal file
@@ -0,0 +1,569 @@
|
||||
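/**
 * Tool definitions and executor for the Jibo LLM agent.
 *
 * Most tools map to a rom-control capability the LLM can invoke (speech,
 * listening, camera, display, head movement, volume). web_search, fetch_url
 * and end_conversation are handled locally without calling the robot client.
 */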
|
||||
|
||||
// ── OpenAI function-tool schemas ───────────────────────────────────────────────
|
||||
|
||||
/**
 * Build one OpenAI function-tool schema entry.
 *
 * @param {string} name Tool name the model calls.
 * @param {string} description Guidance text shown to the model.
 * @param {object} [properties] JSON-schema properties of the arguments object.
 * @param {string[]} [required] Required argument names; the key is omitted when absent.
 * @returns {object} `{ type: 'function', function: { name, description, parameters } }`
 */
const defineTool = (name, description, properties = {}, required) => {
  const parameters = { type: 'object', properties };
  if (required) parameters.required = required;
  return { type: 'function', function: { name, description, parameters } };
};

/**
 * Function-tool schemas advertised to the LLM. Each `name` here is matched by
 * a `case` in `executeTool`.
 */
const TOOL_SCHEMAS = [
  defineTool(
    'say',
    [
      "Speak text aloud through Jibo's speaker. Plain text plus valid ESML tags only ",
      '(e.g. <anim cat="happy" nonBlocking="true"/>, <break size="0.3"/>). ',
      'NEVER include markdown (no *italics*, **bold**, backticks), LaTeX ($...$), ',
      'unmatched/closing tags like </es>, or other symbols Jibo cannot pronounce. ',
      'Malformed input can hang the TTS engine. Keep each call under 200 chars.',
    ].join(''),
    {
      text: { type: 'string', description: 'Text (or ESML) to speak.' },
    },
    ['text'],
  ),

  defineTool(
    'listen',
    [
      "Listen for the user's speech and return a transcript. ",
      'Call this after speaking if you want to continue the conversation.',
    ].join(''),
    {
      timeout: {
        type: 'number',
        description: 'Max seconds to wait. Default 15.',
      },
    },
  ),

  defineTool(
    'take_photo',
    "Take a photo with Jibo's camera. The image is returned so you can see what's in front of you.",
    {
      resolution: {
        type: 'string',
        enum: ['medium', 'low'],
        description: 'Default: medium.',
      },
    },
  ),

  defineTool(
    'show_text',
    "Display text on Jibo's screen.",
    {
      text: { type: 'string', description: 'Text to show.' },
    },
    ['text'],
  ),

  defineTool(
    'show_image',
    "Display an image on Jibo's screen from a URL.",
    {
      url: { type: 'string', description: 'Image URL.' },
    },
    ['url'],
  ),

  defineTool('show_eye', "Reset Jibo's screen to the default eye animation."),

  defineTool(
    'look_at_angle',
    "Turn Jibo's head. theta = yaw (±180°, positive right), psi = pitch (±30°, positive up).",
    {
      theta: { type: 'number', description: 'Yaw degrees.' },
      psi: { type: 'number', description: 'Pitch degrees.' },
    },
    ['theta', 'psi'],
  ),

  defineTool(
    'set_volume',
    "Set Jibo's speaker volume (0.0 – 1.0).",
    {
      level: { type: 'number', description: 'Volume 0.0 to 1.0.' },
    },
    ['level'],
  ),

  defineTool(
    'web_search',
    [
      'Search the web via Brave Search. Use for current events, facts you are unsure of, ',
      'or anything that may have changed since training. Returns titles, URLs, and snippets.',
    ].join(''),
    {
      query: { type: 'string', description: 'The search query.' },
      count: {
        type: 'number',
        description: 'How many results to return (1–10). Default 5.',
      },
      freshness: {
        type: 'string',
        enum: ['pd', 'pw', 'pm', 'py'],
        description:
          'Optional recency filter: pd=past day, pw=past week, pm=past month, py=past year.',
      },
    },
    ['query'],
  ),

  defineTool(
    'fetch_url',
    [
      'Fetch the contents of a web page by URL. Prefers markdown via content ',
      'negotiation (Cloudflare Markdown for Agents) and falls back to HTML→text. ',
      'Use after web_search to read a result, or to traverse linked pages.',
    ].join(''),
    {
      url: { type: 'string', description: 'Absolute http(s) URL to fetch.' },
      max_chars: {
        type: 'number',
        description: 'Truncate the body to this many characters. Default 4000.',
      },
    },
    ['url'],
  ),

  defineTool(
    'end_conversation',
    [
      'Call this when the conversation has reached a natural end and you do NOT want to ',
      'listen for another reply. Pair it with a final "say" in the same turn for a farewell.',
    ].join(''),
  ),
];
|
||||
|
||||
// ── Resolution map ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Translates the resolution names used in tool arguments into the resolution
// identifiers handed to the camera call in the take_photo tool.
const RES_MAP = {
  high: 'highRes',
  medium: 'medRes',
  low: 'lowRes',
};
|
||||
|
||||
// ── Screen text helpers ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Word-wrap text for Jibo's small screen.
 *
 * Lines are split on `\n` or `\r\n`; blank (empty or whitespace-only) lines
 * are kept as empty lines. Words longer than `width` are broken into
 * `width`-sized chunks. Output longer than `maxLines` is cut to
 * `maxLines - 1` lines followed by a line holding a single ellipsis.
 *
 * @param {*} text Text to wrap; `null`/`undefined` are treated as empty,
 *   anything else is stringified.
 * @param {number} [width=40] Maximum characters per line (values below 1 are
 *   treated as 1 so the chunking loop always advances).
 * @param {number} [maxLines=10] Maximum number of output lines (minimum 1).
 * @returns {string} Wrapped text with lines joined by `\n`.
 */
function wrapForScreen(text, width = 40, maxLines = 10) {
  // Guard against non-positive sizes: width 0 would never advance the chunk
  // loop, and maxLines 0 would make slice(0, -1) keep almost everything.
  const cols = width >= 1 ? Math.floor(width) : 1;
  const rows = maxLines >= 1 ? Math.floor(maxLines) : 1;

  const out = [];
  for (const para of String(text ?? '').split(/\r?\n/)) {
    const words = para.split(/\s+/).filter(Boolean);
    if (words.length === 0) {
      // Preserve the author's blank line.
      out.push('');
      continue;
    }

    let line = '';
    for (const word of words) {
      if (word.length > cols) {
        // Oversized word: flush what we have, emit full-width chunks, and
        // leave the short remainder open so following words can join it.
        if (line) {
          out.push(line);
          line = '';
        }
        for (let i = 0; i < word.length; i += cols) {
          const chunk = word.slice(i, i + cols);
          if (chunk.length === cols) out.push(chunk);
          else line = chunk;
        }
        continue;
      }

      const candidate = line ? `${line} ${word}` : word;
      if (candidate.length > cols) {
        out.push(line);
        line = word;
      } else {
        line = candidate;
      }
    }
    if (line) out.push(line);
  }

  if (out.length > rows) {
    return out.slice(0, rows - 1).concat('…').join('\n');
  }
  return out.join('\n');
}
|
||||
|
||||
/**
 * Strip markup the Jibo TTS engine chokes on (markdown, LaTeX, unknown tags).
 * Defense-in-depth against models that ignore the tool instructions.
 *
 * Tags whose name is in the ESML allow-list (anim, break, prosody, emph,
 * phoneme, phrase, style, voice) are kept verbatim; every other tag
 * (e.g. </es>, <br>) is dropped. Kept tags are swapped for placeholders while
 * the text rules run, so underscores or asterisks inside attribute values are
 * never mistaken for markdown emphasis.
 *
 * LaTeX removal requires the opening `$` to be followed by a non-space and
 * the closing `$` to be preceded by a non-space and not followed by a digit,
 * so currency such as "$5 and $10" is left alone.
 *
 * @param {*} text Raw text from the model; nullish is treated as empty.
 * @returns {string} Cleaned text, whitespace-collapsed and trimmed.
 */
function sanitizeForTTS(text) {
  const ESML_TAGS = /^(anim|break|prosody|emph|phoneme|phrase|style|voice)\b/i;
  // Private-use code points delimit placeholders for stashed ESML tags.
  const OPEN = '\uE000';
  const CLOSE = '\uE001';
  const keptTags = [];

  return String(text ?? '')
    // Make sure the input cannot forge a placeholder.
    .replace(/[\uE000\uE001]/g, '')
    // Strip code fences and inline backticks
    .replace(/```[\s\S]*?```/g, '')
    .replace(/`+/g, '')
    // Drop any tag that isn't a known ESML tag; stash the known ones.
    .replace(/<\/?([a-zA-Z][^\s>/]*)\b[^>]*\/?>/g, (m, name) => {
      if (!ESML_TAGS.test(name)) return '';
      keptTags.push(m);
      return `${OPEN}${keptTags.length - 1}${CLOSE}`;
    })
    // Remove LaTeX display math $$...$$, then inline math $...$
    .replace(/\$\$[^$]{0,200}\$\$/g, '')
    .replace(/\$(?=\S)[^$]{1,200}?(?<=\S)\$(?!\d)/g, '')
    // Strip markdown emphasis markers but keep the words
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(?=\S)(.+?)(?<=\S)\1/g, '$2')
    // Put the stashed ESML tags back, untouched.
    .replace(/\uE000(\d+)\uE001/g, (_, index) => keptTags[Number(index)])
    // Collapse extra whitespace
    .replace(/[ \t]+/g, ' ')
    .trim();
}
|
||||
|
||||
// ── Abort helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Throw the conversation-abort error if `signal` has already been aborted.
 *
 * @param {AbortSignal} [signal] Cancellation signal; a missing signal never throws.
 * @throws {Error} With `code === 'CONVERSATION_ABORTED'` when the signal is aborted.
 */
function throwIfAborted(signal) {
  if (!signal || !signal.aborted) return;
  throw Object.assign(new Error('Conversation aborted'), {
    code: 'CONVERSATION_ABORTED',
  });
}
|
||||
|
||||
/**
 * Promise that rejects with the conversation-abort error once `signal`
 * aborts (immediately if it already has). Intended as one arm of a
 * `Promise.race`; it never fulfills.
 *
 * @param {AbortSignal} [signal] Cancellation signal. Without one, the
 *   returned promise stays pending forever.
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) return new Promise(() => { }); // never resolves

  return new Promise((_resolve, reject) => {
    const fail = () => {
      reject(Object.assign(new Error('Conversation aborted'), {
        code: 'CONVERSATION_ABORTED',
      }));
    };

    if (signal.aborted) {
      fail();
      return;
    }
    signal.addEventListener('abort', fail, { once: true });
  });
}
|
||||
|
||||
// ── Tool executor ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reduce an HTML document to rough plain text: drop scripts, styles,
 * noscript blocks and comments, turn block-level closers and <br> into
 * newlines, strip remaining tags, decode the common entities, and collapse
 * whitespace.
 *
 * @param {string} html Raw HTML.
 * @returns {string} Plain-text approximation.
 */
function htmlToText(html) {
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<\/(p|div|li|h[1-6]|br|tr)>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(#39|#x27|apos);/gi, "'")
    // Decode &amp; last so "&amp;lt;" becomes "&lt;", not "<".
    .replace(/&amp;/g, '&')
    .replace(/[ \t]+/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Execute a single tool call against the Jibo client.
 *
 * Returns { content, image?, endConversation? }.
 *   - content         — text string for the tool-result message
 *   - image           — optional base64 JPEG (only for take_photo)
 *   - endConversation — `true` only for end_conversation
 *
 * Speech is queued rather than awaited: `say` appends to `ctx.speechChain`
 * and returns immediately; `listen` and `end_conversation` wait for that
 * chain to drain first.
 *
 * @param {import('rom-control').Client} client
 * @param {string} name Tool function name
 * @param {object} [args] Parsed arguments (a missing object is treated as `{}`)
 * @param {AbortSignal} [signal] Cancellation signal
 * @param {object} [ctx] Mutable state shared between calls of one
 *   conversation: `speechChain` (tail promise of queued speech) and
 *   `lastHeard` (most recent transcript). Pass the same object on every call
 *   so `listen` can see speech queued by earlier `say` calls.
 * @returns {Promise<{ content: string, image?: string, endConversation?: boolean }>}
 * @throws {Error} With `code === 'CONVERSATION_ABORTED'` when `signal` aborts.
 *   Errors from the client in listen (other than a speech timeout),
 *   take_photo, look_at_angle and set_volume propagate unchanged.
 */
async function executeTool(client, name, args, signal, ctx) {
  throwIfAborted(signal);
  // The model may emit a call with no arguments object at all.
  const input = args ?? {};
  ctx = ctx || {};
  if (!ctx.speechChain) ctx.speechChain = Promise.resolve();

  // Upper bound for each outbound HTTP request (connect + body read).
  const FETCH_TIMEOUT_MS = 20000;
  const abortedError = () =>
    Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });

  switch (name) {
    // ── Communication ──────────────────────────────────────────────────────
    case 'say': {
      const text = sanitizeForTTS(String(input.text || ''));
      console.log(` [tool:say] "${text}" (queued)`);
      // Estimate ~80ms per char + 5s base, capped at 60s. Anything longer
      // is almost certainly Jibo's TTS hung on bad ESML/markup; we'd rather
      // log a warning and unblock the conversation than deadlock listen.
      const estimateMs = Math.min(60000, 5000 + text.length * 80);

      ctx.speechChain = ctx.speechChain
        .then(() => {
          const started = Date.now();
          console.log(` [tool:say] speaking… (timeout ${estimateMs}ms)`);
          let timer;
          const timeout = new Promise((resolve) => {
            timer = setTimeout(() => {
              console.warn(` [tool:say] timed out after ${estimateMs}ms — continuing.`);
              resolve();
            }, estimateMs);
          });
          return Promise.race([
            client.behavior.say(text, { signal }),
            onAbort(signal),
            timeout,
          ]).finally(() => {
            clearTimeout(timer);
            console.log(` [tool:say] done in ${Date.now() - started}ms`);
          });
        })
        .catch((err) => {
          // Keep the chain alive: a failed utterance must not poison later
          // say/listen calls that chain onto this promise.
          if (err.code === 'CONVERSATION_ABORTED') return;
          console.error(' [tool:say] error:', err.message);
        });
      return { content: 'Speech queued — Jibo will speak it shortly. Continue with other tools; listen will wait for it.' };
    }

    case 'listen': {
      const requested = Number(input.timeout);
      const ms = (requested > 0 ? requested : 15) * 1000;
      // Make sure pending speech finishes before we open the mic, otherwise
      // Jibo will hear his own voice.
      console.log(' [tool:listen] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      throwIfAborted(signal);
      console.log(` [tool:listen] waiting ${ms}ms…`);
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: ms }),
          onAbort(signal),
        ]);
        console.log(` [tool:listen] heard: "${speech.content}"`);
        ctx.lastHeard = speech.content;
        return { content: `User said: "${speech.content}"` };
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        if (err.code === 'SPEECH_TIMEOUT') {
          console.log(' [tool:listen] timed out');
          return { content: 'No speech detected — user did not respond.' };
        }
        throw err;
      } finally {
        // Always put the eye back, whether we heard something or not.
        client.display.showEye();
      }
    }

    // ── Camera ─────────────────────────────────────────────────────────────
    case 'take_photo': {
      const res = RES_MAP[input.resolution] || 'medRes';
      console.log(` [tool:take_photo] ${res}…`);
      const photo = await Promise.race([
        client.camera.takePhoto({ resolution: res, timeout: 30000 }),
        onAbort(signal),
      ]);
      // The download is a second round-trip; keep it abortable too.
      const buf = await Promise.race([photo.fetchBuffer(), onAbort(signal)]);
      console.log(` [tool:take_photo] ${buf.length} bytes captured`);
      return {
        content: "Photo captured from Jibo's camera.",
        image: buf.toString('base64'),
      };
    }

    // ── Display ────────────────────────────────────────────────────────────
    case 'show_text': {
      const text = String(input.text ?? '');
      console.log(` [tool:show_text] "${text}"`);
      client.display.showText(wrapForScreen(text, 40, 10));
      return { content: 'Text displayed on screen.' };
    }

    case 'show_image': {
      console.log(` [tool:show_image] ${input.url}`);
      client.display.showImage(input.url);
      return { content: 'Image displayed on screen.' };
    }

    case 'show_eye': {
      console.log(' [tool:show_eye]');
      client.display.showEye();
      return { content: 'Eye animation restored on screen.' };
    }

    case 'look_at_angle': {
      console.log(` [tool:look_at_angle] θ=${input.theta}° ψ=${input.psi}°`);
      await client.behavior.lookAtAngle(input.theta, input.psi);
      return { content: `Now looking at θ=${input.theta}°, ψ=${input.psi}°.` };
    }

    case 'set_volume': {
      console.log(` [tool:set_volume] ${input.level}`);
      await client.audio.setVolume(input.level);
      return { content: `Volume set to ${input.level}.` };
    }

    // ── Web search ─────────────────────────────────────────────────────────
    case 'web_search': {
      const apiKey = process.env.BRAVE_API_KEY;
      if (!apiKey) {
        return {
          content:
            'web_search is unavailable: BRAVE_API_KEY environment variable is not set.',
        };
      }
      const query = String(input.query || '').trim();
      if (!query) {
        return { content: 'web_search error: query is required.' };
      }
      const count = Math.max(1, Math.min(10, Math.floor(Number(input.count)) || 5));
      const params = new URLSearchParams({
        q: query,
        count: String(count),
        extra_snippets: 'true',
        safesearch: 'moderate',
      });
      if (input.freshness) params.set('freshness', String(input.freshness));

      console.log(` [tool:web_search] "${query}" (count=${count})`);
      const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;
      // Local controller so both the conversation signal and our own
      // timeout can cancel the request.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), FETCH_TIMEOUT_MS);
      try {
        const res = await fetch(url, {
          headers: {
            Accept: 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': apiKey,
          },
          signal: ac.signal,
        });
        if (!res.ok) {
          const body = await res.text().catch(() => '');
          return {
            content: `web_search error: ${res.status} ${res.statusText}. ${body.slice(0, 200)}`,
          };
        }
        const data = await res.json();
        const results = data?.web?.results || [];
        if (results.length === 0) {
          return { content: `No web results found for "${query}".` };
        }
        const lines = results.slice(0, count).map((r, i) => {
          const title = r.title || '(untitled)';
          const u = r.url || '';
          const desc = (r.description || '').replace(/\s+/g, ' ').trim();
          const extras = Array.isArray(r.extra_snippets)
            ? r.extra_snippets.slice(0, 2).map((s) => s.replace(/\s+/g, ' ').trim())
            : [];
          const tail = extras.length ? `\n • ${extras.join('\n • ')}` : '';
          return `${i + 1}. ${title}\n ${u}\n ${desc}${tail}`;
        });
        return {
          content: `Web results for "${query}":\n\n${lines.join('\n\n')}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          // Distinguish a cancelled conversation from our own timeout.
          if (signal?.aborted) throw abortedError();
          return { content: 'web_search error: timeout contacting Brave Search.' };
        }
        return { content: `web_search error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }

    case 'fetch_url': {
      const target = String(input.url || '').trim();
      if (!/^https?:\/\//i.test(target)) {
        return { content: 'fetch_url error: url must be an absolute http(s) URL.' };
      }
      // NOTE(review): the model chooses the URL, so this can reach hosts on
      // the local network (SSRF). Consider a deny-list for private ranges.
      const maxChars = Math.max(200, Math.min(20000, Number(input.max_chars) || 4000));
      console.log(` [tool:fetch_url] ${target}`);

      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), FETCH_TIMEOUT_MS);
      try {
        const res = await fetch(target, {
          headers: {
            // Prefer markdown (Cloudflare Markdown for Agents); accept HTML/text fallback.
            Accept: 'text/markdown, text/plain;q=0.9, text/html;q=0.8, */*;q=0.1',
            'Accept-Encoding': 'gzip',
            'User-Agent': 'jibo-llm/1.0 (+agent)',
          },
          redirect: 'follow',
          signal: ac.signal,
        });
        if (!res.ok) {
          return {
            content: `fetch_url error: ${res.status} ${res.statusText} from ${target}`,
          };
        }
        const ctype = (res.headers.get('content-type') || '').toLowerCase();
        // A missing content-type is let through; a declared non-text one is refused.
        if (!/^(text\/|application\/(json|xml|xhtml))/.test(ctype) && ctype) {
          return {
            content: `fetch_url: refusing non-text content (${ctype}) from ${target}`,
          };
        }
        let body = await res.text();
        const isMarkdown = ctype.includes('markdown');
        const isHtml = ctype.includes('html') || /<html[\s>]/i.test(body.slice(0, 500));

        if (!isMarkdown && isHtml) {
          // Lightweight HTML→text: strip scripts/styles/tags, collapse whitespace.
          body = htmlToText(body);
        }

        const truncated = body.length > maxChars;
        const out = truncated ? body.slice(0, maxChars) + '\n…[truncated]' : body;
        const finalUrl = res.url || target;
        const fmt = isMarkdown ? 'markdown' : isHtml ? 'html→text' : 'text';
        return {
          content: `Fetched ${finalUrl} (${fmt}, ${body.length} chars${truncated ? `, truncated to ${maxChars}` : ''}):\n\n${out}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          if (signal?.aborted) throw abortedError();
          return { content: `fetch_url error: timeout fetching ${target}` };
        }
        return { content: `fetch_url error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }

    case 'end_conversation': {
      // Let any queued farewell finish before the caller tears things down.
      console.log(' [tool:end_conversation] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      return { content: 'Conversation ended.', endConversation: true };
    }

    default:
      return { content: `Unknown tool "${name}".` };
  }
}
|
||||
|
||||
module.exports = { TOOL_SCHEMAS, executeTool, wrapForScreen };
|
||||
Reference in New Issue
Block a user