// Hotword-triggered LLM conversation loop for Jibo with tool-calling agent loop,
// ESML expressive speech, web search/fetch, and per-conversation abort handling.
/**
 * Tool definitions and executor for the Jibo LLM agent.
 *
 * Each tool maps to a rom-control capability the LLM can invoke.
 */
// ── OpenAI function-tool schemas ───────────────────────────────────────────────

// Declarative list of tools exposed to the model (OpenAI "function"-tool
// format). Each entry's `name` must match a `switch` case in `executeTool`.
const TOOL_SCHEMAS = [
  // Speak via TTS. Input is restricted to plain text + ESML because malformed
  // markup can hang the TTS engine (see sanitizeForTTS for the server-side net).
  {
    type: 'function',
    function: {
      name: 'say',
      description:
        "Speak text aloud through Jibo's speaker. Plain text plus valid ESML tags only " +
        '(e.g. <anim cat="happy" nonBlocking="true"/>, <break size="0.3"/>). ' +
        'NEVER include markdown (no *italics*, **bold**, backticks), LaTeX ($...$), ' +
        'unmatched/closing tags like </es>, or other symbols Jibo cannot pronounce. ' +
        'Malformed input can hang the TTS engine. Keep each call under 200 chars.',
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text (or ESML) to speak.' },
        },
        required: ['text'],
      },
    },
  },
  // Open the mic and return a transcript (or a timeout notice).
  {
    type: 'function',
    function: {
      name: 'listen',
      description:
        "Listen for the user's speech and return a transcript. " +
        'Call this after speaking if you want to continue the conversation.',
      parameters: {
        type: 'object',
        properties: {
          timeout: {
            type: 'number',
            description: 'Max seconds to wait. Default 15.',
          },
        },
      },
    },
  },
  // Capture a camera frame; the executor returns it as base64 for vision input.
  // NOTE(review): RES_MAP also supports 'high', which is not offered here —
  // presumably to keep image payloads small; confirm before adding it.
  {
    type: 'function',
    function: {
      name: 'take_photo',
      description:
        "Take a photo with Jibo's camera. The image is returned so you can see what's in front of you.",
      parameters: {
        type: 'object',
        properties: {
          resolution: {
            type: 'string',
            enum: ['medium', 'low'],
            description: 'Default: medium.',
          },
        },
      },
    },
  },
  // Screen: show arbitrary text (word-wrapped by the executor).
  {
    type: 'function',
    function: {
      name: 'show_text',
      description: "Display text on Jibo's screen.",
      parameters: {
        type: 'object',
        properties: {
          text: { type: 'string', description: 'Text to show.' },
        },
        required: ['text'],
      },
    },
  },
  // Screen: show a remote image by URL.
  {
    type: 'function',
    function: {
      name: 'show_image',
      description: "Display an image on Jibo's screen from a URL.",
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Image URL.' },
        },
        required: ['url'],
      },
    },
  },
  // Screen: restore the default idle eye animation.
  {
    type: 'function',
    function: {
      name: 'show_eye',
      description: "Reset Jibo's screen to the default eye animation.",
      parameters: { type: 'object', properties: {} },
    },
  },
  // Movement: orient the head by yaw/pitch angles.
  {
    type: 'function',
    function: {
      name: 'look_at_angle',
      description: "Turn Jibo's head. theta = yaw (±180°, positive right), psi = pitch (±30°, positive up).",
      parameters: {
        type: 'object',
        properties: {
          theta: { type: 'number', description: 'Yaw degrees.' },
          psi: { type: 'number', description: 'Pitch degrees.' },
        },
        required: ['theta', 'psi'],
      },
    },
  },
  // Audio output level.
  {
    type: 'function',
    function: {
      name: 'set_volume',
      description: "Set Jibo's speaker volume (0.0 – 1.0).",
      parameters: {
        type: 'object',
        properties: {
          level: { type: 'number', description: 'Volume 0.0 to 1.0.' },
        },
        required: ['level'],
      },
    },
  },
  // Web search via the Brave Search API (requires BRAVE_API_KEY at runtime).
  {
    type: 'function',
    function: {
      name: 'web_search',
      description:
        'Search the web via Brave Search. Use for current events, facts you are unsure of, ' +
        'or anything that may have changed since training. Returns titles, URLs, and snippets.',
      parameters: {
        type: 'object',
        properties: {
          query: { type: 'string', description: 'The search query.' },
          count: {
            type: 'number',
            description: 'How many results to return (1–10). Default 5.',
          },
          freshness: {
            type: 'string',
            enum: ['pd', 'pw', 'pm', 'py'],
            description:
              'Optional recency filter: pd=past day, pw=past week, pm=past month, py=past year.',
          },
        },
        required: ['query'],
      },
    },
  },
  // Fetch and read a web page (markdown preferred, HTML→text fallback).
  {
    type: 'function',
    function: {
      name: 'fetch_url',
      description:
        'Fetch the contents of a web page by URL. Prefers markdown via content ' +
        'negotiation (Cloudflare Markdown for Agents) and falls back to HTML→text. ' +
        'Use after web_search to read a result, or to traverse linked pages.',
      parameters: {
        type: 'object',
        properties: {
          url: { type: 'string', description: 'Absolute http(s) URL to fetch.' },
          max_chars: {
            type: 'number',
            description: 'Truncate the body to this many characters. Default 4000.',
          },
        },
        required: ['url'],
      },
    },
  },
  // Signals the agent loop to stop; the executor flags it via endConversation.
  {
    type: 'function',
    function: {
      name: 'end_conversation',
      description:
        'Call this when the conversation has reached a natural end and you do NOT want to ' +
        'listen for another reply. Pair it with a final "say" in the same turn for a farewell.',
      parameters: { type: 'object', properties: {} },
    },
  },
];
// ── Resolution map ─────────────────────────────────────────────────────────────

// Maps schema-facing resolution names to rom-control camera constants.
// NOTE(review): 'high' is mapped here but not offered in the take_photo schema
// enum, so it is unreachable via the LLM — confirm whether that is intentional.
const RES_MAP = { high: 'highRes', medium: 'medRes', low: 'lowRes' };
// ── Screen text helpers ────────────────────────────────────────────────────────

/**
 * Word-wrap text for Jibo's small screen.
 *
 * Splits on existing newlines (blank paragraphs are preserved as empty lines),
 * wraps each paragraph at `width` columns, hard-breaks words longer than one
 * full line, and truncates with an ellipsis once output exceeds `maxLines`.
 *
 * @param {string} text       Input text (coerced to string).
 * @param {number} [width]    Maximum characters per line. Default 40.
 * @param {number} [maxLines] Maximum lines before truncation. Default 10.
 * @returns {string} Newline-joined wrapped text.
 */
function wrapForScreen(text, width = 40, maxLines = 10) {
  const lines = [];
  for (const paragraph of String(text).split('\n')) {
    if (paragraph === '') {
      lines.push('');
      continue;
    }
    let current = '';
    for (const word of paragraph.split(/\s+/).filter(Boolean)) {
      if (word.length > width) {
        // Oversized word: flush the partial line, emit full-width chunks, and
        // let a short trailing chunk start the next line.
        if (current) {
          lines.push(current);
          current = '';
        }
        for (let pos = 0; pos < word.length; pos += width) {
          const piece = word.slice(pos, pos + width);
          if (piece.length === width) {
            lines.push(piece);
          } else {
            current = piece;
          }
        }
        continue;
      }
      const joined = current === '' ? word : `${current} ${word}`;
      if (joined.length > width) {
        lines.push(current);
        current = word;
      } else {
        current = joined;
      }
    }
    if (current) lines.push(current);
  }
  if (lines.length > maxLines) {
    return lines.slice(0, maxLines - 1).concat('…').join('\n');
  }
  return lines.join('\n');
}
/**
 * Strip markup the Jibo TTS engine chokes on (markdown, LaTeX, unmatched
 * closing tags). Preserves valid ESML self-closing tags like <anim .../> and
 * <break .../>. Defense-in-depth against models that ignore the instructions.
 *
 * @param {string} text Raw model-produced speech text.
 * @returns {string} Sanitized, whitespace-collapsed text safe to hand to TTS.
 */
function sanitizeForTTS(text) {
  const ESML_TAGS = /^(anim|break|prosody|emph|phoneme|phrase|style|voice)\b/i;
  // Remove LaTeX inline math: $...$ and $$...$$
  let cleaned = text.replace(/\${1,2}[^$]{0,200}\${1,2}/g, '');
  // Strip code fences and inline backticks
  cleaned = cleaned.replace(/```[\s\S]*?```/g, '');
  cleaned = cleaned.replace(/`+/g, '');
  // Strip markdown emphasis markers but keep the words
  cleaned = cleaned.replace(/(\*\*|__)(.*?)\1/g, '$2');
  cleaned = cleaned.replace(/(\*|_)(?=\S)(.+?)(?<=\S)\1/g, '$2');
  // Drop any tag that isn't a known ESML tag (e.g. </es>, <br>, etc.)
  cleaned = cleaned.replace(/<\/?([a-zA-Z][^\s>/]*)\b[^>]*\/?>/g, (tag, tagName) =>
    ESML_TAGS.test(tagName) ? tag : '');
  // Collapse extra whitespace
  return cleaned.replace(/[ \t]+/g, ' ').trim();
}
// ── Abort helpers ──────────────────────────────────────────────────────────────

/**
 * Throw a CONVERSATION_ABORTED error if the signal has already fired.
 * No-op when the signal is absent or not yet aborted.
 *
 * @param {AbortSignal} [signal]
 * @throws {Error} with `code === 'CONVERSATION_ABORTED'`
 */
function throwIfAborted(signal) {
  if (!signal?.aborted) return;
  const abortErr = new Error('Conversation aborted');
  abortErr.code = 'CONVERSATION_ABORTED';
  throw abortErr;
}
/**
 * Return a promise that rejects with a CONVERSATION_ABORTED error when the
 * signal fires (immediately if it already has). With no signal, returns a
 * promise that never settles — safe to use as a Promise.race participant.
 *
 * @param {AbortSignal} [signal]
 * @returns {Promise<never>}
 */
function onAbort(signal) {
  if (!signal) return new Promise(() => { }); // never resolves
  return new Promise((_, reject) => {
    const fail = () => {
      const abortErr = new Error('Conversation aborted');
      abortErr.code = 'CONVERSATION_ABORTED';
      reject(abortErr);
    };
    if (signal.aborted) {
      fail();
      return;
    }
    signal.addEventListener('abort', fail, { once: true });
  });
}
// ── Tool executor ──────────────────────────────────────────────────────────────

/**
 * Execute a single tool call against the Jibo client.
 *
 * Returns { content, image?, endConversation? }.
 * - content         — text string for the tool-result message
 * - image           — optional base64 JPEG (only for take_photo)
 * - endConversation — true only from end_conversation; signals the agent loop to stop
 *
 * @param {import('rom-control').Client} client
 * @param {string} name   Tool function name (must match a TOOL_SCHEMAS entry)
 * @param {object} args   Parsed arguments
 * @param {AbortSignal} [signal] Cancellation signal
 * @param {object} [ctx]  Mutable per-conversation state shared across calls:
 *                        `speechChain` (promise serializing queued speech) and
 *                        `lastHeard` (most recent transcript from listen).
 * @returns {Promise<{ content: string, image?: string, endConversation?: boolean }>}
 */
async function executeTool(client, name, args, signal, ctx) {
  throwIfAborted(signal);
  ctx = ctx || {};
  // speechChain serializes say() calls so utterances play in order and listen
  // can wait for all pending speech before opening the mic.
  if (!ctx.speechChain) ctx.speechChain = Promise.resolve();
  switch (name) {
    // ── Communication ──────────────────────────────────────────────────────
    case 'say': {
      const text = sanitizeForTTS(String(args.text || ''));
      console.log(` [tool:say] "${text}" (queued)`);
      // Estimate ~80ms per char + 5s base, capped at 60s. Anything longer
      // is almost certainly Jibo's TTS hung on bad ESML/markup; we'd rather
      // log a warning and unblock the conversation than deadlock listen.
      const estimateMs = Math.min(60000, 5000 + text.length * 80);

      ctx.speechChain = ctx.speechChain
        .then(() => {
          const started = Date.now();
          console.log(` [tool:say] speaking… (timeout ${estimateMs}ms)`);
          let timer;
          // Watchdog: resolves (not rejects) on timeout so the race simply
          // moves on instead of failing the chain.
          const timeout = new Promise((resolve) => {
            timer = setTimeout(() => {
              console.warn(` [tool:say] timed out after ${estimateMs}ms — continuing.`);
              resolve();
            }, estimateMs);
          });
          return Promise.race([
            client.behavior.say(text, { signal }),
            onAbort(signal),
            timeout,
          ]).finally(() => {
            clearTimeout(timer);
            console.log(` [tool:say] done in ${Date.now() - started}ms`);
          });
        })
        .catch((err) => {
          // Swallow per-utterance failures so one bad say() doesn't poison the
          // chain for later speech; aborts end the conversation silently.
          if (err.code === 'CONVERSATION_ABORTED') return;
          console.error(' [tool:say] error:', err.message);
        });
      // Fire-and-forget: report "queued" immediately so the model can keep
      // issuing tool calls while Jibo is still speaking.
      return { content: 'Speech queued — Jibo will speak it shortly. Continue with other tools; listen will wait for it.' };
    }

    case 'listen': {
      const ms = (args.timeout || 15) * 1000;
      // Make sure pending speech finishes before we open the mic, otherwise
      // Jibo will hear his own voice.
      console.log(' [tool:listen] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      throwIfAborted(signal);
      console.log(` [tool:listen] waiting ${ms}ms…`);
      // NOTE(review): showText/showEye results are not awaited here —
      // presumably fire-and-forget display calls; confirm against rom-control.
      client.display.showText('Listening...');
      try {
        const speech = await Promise.race([
          client.audio.awaitSpeech({ mode: 'local', time: ms }),
          onAbort(signal),
        ]);
        console.log(` [tool:listen] heard: "${speech.content}"`);
        ctx.lastHeard = speech.content;
        return { content: `User said: "${speech.content}"` };
      } catch (err) {
        if (err.code === 'CONVERSATION_ABORTED') throw err;
        // A silent user is a normal outcome, not an error — report it to the
        // model as content so it can decide how to proceed.
        if (err.code === 'SPEECH_TIMEOUT') {
          console.log(' [tool:listen] timed out');
          return { content: 'No speech detected — user did not respond.' };
        }
        throw err;
      } finally {
        // Always restore the idle eye, even on abort/timeout.
        client.display.showEye();
      }
    }

    // ── Camera ─────────────────────────────────────────────────────────────
    case 'take_photo': {
      const res = RES_MAP[args.resolution] || 'medRes';
      console.log(` [tool:take_photo] ${res}…`);
      const photo = await Promise.race([
        client.camera.takePhoto({ resolution: res, timeout: 30000 }),
        onAbort(signal),
      ]);
      const buf = await photo.fetchBuffer();
      console.log(` [tool:take_photo] ${buf.length} bytes captured`);
      return {
        content: "Photo captured from Jibo's camera.",
        // Base64 JPEG for the model's vision input.
        image: buf.toString('base64'),
      };
    }

    // ── Display ────────────────────────────────────────────────────────────
    case 'show_text': {
      console.log(` [tool:show_text] "${args.text}"`);
      // Wrap to the screen's 40-col × 10-line budget before display.
      client.display.showText(wrapForScreen(args.text, 40, 10));
      return { content: 'Text displayed on screen.' };
    }

    case 'show_image': {
      console.log(` [tool:show_image] ${args.url}`);
      client.display.showImage(args.url);
      return { content: 'Image displayed on screen.' };
    }

    case 'show_eye': {
      console.log(' [tool:show_eye]');
      client.display.showEye();
      return { content: 'Eye animation restored on screen.' };
    }

    // ── Movement / audio ───────────────────────────────────────────────────
    case 'look_at_angle': {
      console.log(` [tool:look_at_angle] θ=${args.theta}° ψ=${args.psi}°`);
      await client.behavior.lookAtAngle(args.theta, args.psi);
      return { content: `Now looking at θ=${args.theta}°, ψ=${args.psi}°.` };
    }

    case 'set_volume': {
      console.log(` [tool:set_volume] ${args.level}`);
      // NOTE(review): level is passed through unclamped — confirm rom-control
      // rejects out-of-range values.
      await client.audio.setVolume(args.level);
      return { content: `Volume set to ${args.level}.` };
    }

    // ── Web search ─────────────────────────────────────────────────────────
    case 'web_search': {
      const apiKey = process.env.BRAVE_API_KEY;
      // Missing key is reported as tool content, not thrown, so the model can
      // tell the user rather than crashing the turn.
      if (!apiKey) {
        return {
          content:
            'web_search is unavailable: BRAVE_API_KEY environment variable is not set.',
        };
      }
      const query = String(args.query || '').trim();
      if (!query) {
        return { content: 'web_search error: query is required.' };
      }
      // Clamp result count to Brave's 1–10 range.
      const count = Math.max(1, Math.min(10, Number(args.count) || 5));
      const params = new URLSearchParams({
        q: query,
        count: String(count),
        extra_snippets: 'true',
        safesearch: 'moderate',
      });
      if (args.freshness) params.set('freshness', String(args.freshness));

      console.log(` [tool:web_search] "${query}" (count=${count})`);
      const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;
      // Bridge the conversation AbortSignal to the fetch's own controller.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      try {
        const res = await fetch(url, {
          headers: {
            Accept: 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': apiKey,
          },
          signal: ac.signal,
        });
        if (!res.ok) {
          const body = await res.text().catch(() => '');
          return {
            content: `web_search error: ${res.status} ${res.statusText}. ${body.slice(0, 200)}`,
          };
        }
        const data = await res.json();
        const results = data?.web?.results || [];
        if (results.length === 0) {
          return { content: `No web results found for "${query}".` };
        }
        // Render a compact numbered list: title, URL, description, and up to
        // two extra snippets per result.
        const lines = results.slice(0, count).map((r, i) => {
          const title = r.title || '(untitled)';
          const u = r.url || '';
          const desc = (r.description || '').replace(/\s+/g, ' ').trim();
          const extras = Array.isArray(r.extra_snippets)
            ? r.extra_snippets.slice(0, 2).map((s) => s.replace(/\s+/g, ' ').trim())
            : [];
          const tail = extras.length ? `\n • ${extras.join('\n • ')}` : '';
          return `${i + 1}. ${title}\n ${u}\n ${desc}${tail}`;
        });
        return {
          content: `Web results for "${query}":\n\n${lines.join('\n\n')}`,
        };
      } catch (err) {
        // An AbortError here can only come from the conversation signal
        // (there is no local timeout on this fetch).
        if (err.name === 'AbortError') throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
        return { content: `web_search error: ${err.message}` };
      } finally {
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }

    case 'fetch_url': {
      const target = String(args.url || '').trim();
      if (!/^https?:\/\//i.test(target)) {
        return { content: 'fetch_url error: url must be an absolute http(s) URL.' };
      }
      // Clamp truncation budget to a sane 200–20000 char window.
      const maxChars = Math.max(200, Math.min(20000, Number(args.max_chars) || 4000));
      console.log(` [tool:fetch_url] ${target}`);

      // One controller serves both cancellation sources: conversation abort
      // and a 20s hard timeout; the catch block disambiguates via signal.aborted.
      const ac = new AbortController();
      const onAbortHandler = () => ac.abort();
      signal?.addEventListener('abort', onAbortHandler, { once: true });
      const timeoutId = setTimeout(() => ac.abort(), 20000);
      try {
        const res = await fetch(target, {
          headers: {
            // Prefer markdown (Cloudflare Markdown for Agents); accept HTML/text fallback.
            Accept: 'text/markdown, text/plain;q=0.9, text/html;q=0.8, */*;q=0.1',
            'Accept-Encoding': 'gzip',
            'User-Agent': 'jibo-llm/1.0 (+agent)',
          },
          redirect: 'follow',
          signal: ac.signal,
        });
        if (!res.ok) {
          return {
            content: `fetch_url error: ${res.status} ${res.statusText} from ${target}`,
          };
        }
        // Refuse binary payloads (images, PDFs, …) — only text-like types pass.
        const ctype = (res.headers.get('content-type') || '').toLowerCase();
        if (!/^(text\/|application\/(json|xml|xhtml))/.test(ctype) && ctype) {
          return {
            content: `fetch_url: refusing non-text content (${ctype}) from ${target}`,
          };
        }
        let body = await res.text();
        const isMarkdown = ctype.includes('markdown');
        // Sniff the body too: some servers mislabel HTML as text/plain.
        const isHtml = ctype.includes('html') || /<html[\s>]/i.test(body.slice(0, 500));

        if (!isMarkdown && isHtml) {
          // Lightweight HTML→text: strip scripts/styles/tags, decode common
          // entities, collapse whitespace.
          body = body
            .replace(/<script[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
            .replace(/<!--[\s\S]*?-->/g, ' ')
            .replace(/<\/(p|div|li|h[1-6]|br|tr)>/gi, '\n')
            .replace(/<[^>]+>/g, ' ')
            .replace(/&nbsp;/g, ' ')
            .replace(/&amp;/g, '&')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/[ \t]+/g, ' ')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        }

        const truncated = body.length > maxChars;
        const out = truncated ? body.slice(0, maxChars) + '\n…[truncated]' : body;
        const finalUrl = res.url || target;
        const fmt = isMarkdown ? 'markdown' : isHtml ? 'html→text' : 'text';
        return {
          content: `Fetched ${finalUrl} (${fmt}, ${body.length} chars${truncated ? `, truncated to ${maxChars}` : ''}):\n\n${out}`,
        };
      } catch (err) {
        if (err.name === 'AbortError') {
          // Conversation abort propagates; a plain timeout is just tool content.
          if (signal?.aborted) {
            throw Object.assign(new Error('Conversation aborted'), { code: 'CONVERSATION_ABORTED' });
          }
          return { content: `fetch_url error: timeout fetching ${target}` };
        }
        return { content: `fetch_url error: ${err.message}` };
      } finally {
        clearTimeout(timeoutId);
        signal?.removeEventListener('abort', onAbortHandler);
      }
    }

    case 'end_conversation': {
      // Let any queued farewell finish speaking before the loop shuts down.
      console.log(' [tool:end_conversation] awaiting pending speech…');
      await Promise.race([ctx.speechChain, onAbort(signal)]);
      return { content: 'Conversation ended.', endConversation: true };
    }

    default:
      // Unknown tools are reported back to the model rather than thrown.
      return { content: `Unknown tool "${name}".` };
  }
}

module.exports = { TOOL_SCHEMAS, executeTool, wrapForScreen };