'use strict';

// Core dependencies. NOTE: `http` and `httpModule` both resolve to the same
// core module — `httpModule` is used by the media-proxy helpers further down,
// `http` by the server bootstrap; both are kept so neither call site breaks.
const express = require('express');
const http = require('http');
const https = require('https');
const { WebSocketServer, WebSocket } = require('ws');
const httpModule = require('http');
const path = require('path');
const fs = require('fs');
require('dotenv').config();
const { Client } = require('rom-control');

// Robot address — overridable via environment (dotenv has already been loaded
// above), falling back to the previous hard-coded defaults. This matches the
// existing convention used for APP_PORT below.
const JIBO_HOST = process.env.JIBO_HOST || '192.168.1.217';
const JIBO_PORT = parseInt(process.env.JIBO_PORT, 10) || 8160;
const APP_PORT = process.env.PORT || 3000;

// System prompt sent ahead of every LLM conversation (see /api/llm/chat).
const LLM_SYSTEM_PROMPT = `You are Jibo, a small expressive home robot. Every reply MUST be written in ESML (Embodied Speech Markup Language). ESML is an XML dialect that simultaneously drives Jibo's body animations, screen graphics, audio effects, and TTS voice. Respond ONLY with the final spoken output annotated with ESML tags. No reasoning, no blocks, no preamble — only what Jibo will say and do. == ANIMATION TAGS == Use for body/screen animations from Jibo's built-in library (preferred). Use when you also need to blend in SSA or SFX in the same tag. Blocking (Jibo freezes speech while it plays, resumes after): following text here following text here Bounded non-blocking (animation duration stretches to match the enclosed text): text spoken during animation Unbounded non-blocking (animation plays at native length alongside text that follows): text spoken at the same time Common attributes: cat='CATEGORY' select animation by emotional category (preferred) name='AnimName' select exact animation by its library name nonBlocking='true' play alongside TTS instead of blocking it endNeutral='true' snap back to neutral pose when done (use this by default) loop='0' repeat to fill bounded duration (bounded mode only) loop='N' repeat N times (unbounded mode only) filter='!ssa-only' exclude audio-only animations from the category pick layers='!screen' use only body layer (drop screen graphics) Animation categories (cat= values): affection confused dance embarrassed excited frustrated happy laughing no proud relieved sad scared surprised worried yes == 
EMOJIS (Screen Graphics) == Use with the emoji category and specific filters to display a graphic on Jibo's screen. Always use nonBlocking='true' for emojis. Syntax: Available EMOJIS (EMOJI_NAME): airplane basketball beach car disco-spin football soccer trophy music question-mark star beer cake cheese drumstick coffee fork fish groceries burger hotdog icecream pizza wine christmas-tree fireworks halloween hanukkah thanksgiving clover valentines chocolate bicycle cat laptop dog gift house laundry lightbulb money popcorn party phone robot sunglasses toilet-paper trash umbrella video-game bird cow earth flower lightning-bolt moon mountain mouse penguin pig bunny rainbow baby heart == DANCES == Use with the dance category to make Jibo dance. You can choose to include music or not. Syntax (with music): Syntax (without music): Available DANCES (DANCE_NAME): rom-upbeat rom-ballroom rom-silly rom-slowdance rom-eletronic rom-twerk == SSA (Semi-Speech Audio - emotional vocal sounds) == Always self-closing. Play before, after, or between sentences; never inside . == SFX (Sound effects) == Always self-closing. Good for punctuating facts, transitions, or reactions. == VOICE / SPEECH TAGS == Pause: (length in seconds) Style: Styles: neutral enthusiastic sheepish confused confident Pitch: text (semitones from baseline) text (pitch multiplier) text (Hz offset) text (vibrance/bandwidth) Duration: text (>1 = slower, <1 = faster) text (exact duration in seconds) Spell: (spells each letter) Phoneme: Bono == RULES == 1. ALWAYS use ESML. Plain text is valid ESML - but add tags whenever they make Jibo more expressive and natural. 2. Keep total response length SHORT: one or two sentences maximum. 3. Opening animations set the emotional tone before speech: Oh, cool! 4. Bounded animations sync motion to the most important words: I really love that idea! 5. Use for non-verbal emotional sounds (gasps, laughs, hums). 6. Use User: "What's 2 plus 2?" That's 4! Easy one. 
User: "Wow, that's surprising!" I know, right?! User: "Do you like cats?" I love them! User: "Show me a dance." Watch these moves!`;

// Strip LLM chain-of-thought that leaks before the first real ESML tag.
// 1. Remove complete <think>…</think> blocks. (The old pattern lacked the
//    opening <think> tag, so any reply containing a stray closing tag lost
//    everything before it, even when no think block was ever opened.)
// 2. If the first recognizable ESML tag appears well past the start of the
//    reply (> 80 chars in), assume the prefix is leaked reasoning and drop it.
function stripThinking(text) {
  let s = text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
  const m = s.match(/<(anim|ssa|sfx|break|style|pitch|duration|say-as|phoneme|es)\b/i);
  if (m && m.index > 80) s = s.slice(m.index).trim();
  return s;
}

// ── Active operation handles ──────────────────────────────────────────────────
// One slot each for the current say / LLM request / listen so the cancel and
// interrupt endpoints can pre-empt them.
let activeSayAbort = null;
let activeLlmAbort = null;
let activeListenTxId = null;

// ── Browser WebSocket subscriber set ─────────────────────────────────────────
const subscribers = new Set();

// Fan a message out to every connected browser; non-strings are JSON-encoded.
function broadcast(data) {
  const str = typeof data === 'string' ? data : JSON.stringify(data);
  for (const ws of subscribers) {
    if (ws.readyState === WebSocket.OPEN) ws.send(str);
  }
}

// Push the current connection/session/angle snapshot to all browsers.
function broadcastStatus() {
  broadcast({
    type: 'status',
    connected: jibo.connected,
    sessionID: jibo.sessionID,
    angles: jibo.currentAngles,
  });
}

// ── Client instance ───────────────────────────────────────────────────────────
const jibo = new Client({
  host: JIBO_HOST,
  port: JIBO_PORT,
  autoReconnect: true,
  reconnectDelay: 3000,
  autoHeartbeat: true,
  heartbeatInterval: 9000,
  autoSubscribe: true,
});

// Lifecycle
jibo.on('ready', () => {
  console.log('[jibo] session started:', jibo.sessionID);
  broadcastStatus();
});
jibo.on('disconnect', () => {
  console.log('[jibo] disconnected — reconnecting in 3s');
  broadcastStatus();
});
jibo.on('error', (err) => {
  console.error('[jibo] error:', err.message);
});

// Raw event firehose → browser clients.
// Tap _conn for the complete unfiltered event stream; the Client layer only
// surfaces structured high-level events and doesn't have a generic passthrough.
// Forward every raw robot event to the browsers, except onTakePhoto which is
// intercepted so the image can be saved server-side first.
jibo._conn.on('event', (txId, body) => {
  if (body && body.Event === 'onTakePhoto' && body.URI) {
    savePhoto(body.URI);
    return; // suppress raw onTakePhoto; browser gets onPhotoSaved instead
  }
  broadcast({ type: 'jiboEvent', txId, body });
});

// Track the active listen txId so cancel / status works correctly. All three
// terminal events release the handle; they share one implementation (these
// were previously three identical inline callbacks).
function releaseListenTx(txId) {
  if (txId === activeListenTxId) activeListenTxId = null;
}
jibo._conn.on('onListenResult', releaseListenTx);
jibo._conn.on('onStop', releaseListenTx);
jibo._conn.on('onError', releaseListenTx);

// Hotword — HotwordEvent object from the Client; rebroadcast in the shape
// the browser expects so app.js needs no changes.
jibo.on('hotword', (hwEvent) => {
  broadcast({
    type: 'jiboEvent',
    txId: null,
    body: {
      Event: 'onHotWordHeard',
      utterance: hwEvent.utterance,
      score: hwEvent.score,
      timestamp: hwEvent.timestamp,
    },
  });
});

// ── Photo saving ──────────────────────────────────────────────────────────────
const PHOTOS_DIR = path.join(__dirname, 'photos');
fs.mkdirSync(PHOTOS_DIR, { recursive: true });

/**
 * Download a photo from the robot into PHOTOS_DIR, then notify browsers with
 * a synthetic onPhotoSaved event carrying the served URL. On failure the
 * partial file is removed best-effort and the error is logged.
 */
function savePhoto(jiboUri) {
  const filename = 'photo_' + Date.now() + '.jpg';
  const filepath = path.join(PHOTOS_DIR, filename);
  const file = fs.createWriteStream(filepath);
  jibo._conn.fetchMediaStream(jiboUri, file)
    .then(() => {
      console.log('[photo] saved:', filename);
      broadcast({
        type: 'jiboEvent',
        txId: null,
        body: { Event: 'onPhotoSaved', url: '/photos/' + filename, filename },
      });
    })
    .catch((err) => {
      fs.unlink(filepath, () => {}); // best-effort cleanup of the partial file
      console.error('[photo] save failed:', err.message);
    });
}

// ── Video / photo proxy ───────────────────────────────────────────────────────
// Shared implementation: pipe a media URI on the robot straight through to an
// Express response, tearing down the upstream request if the browser goes
// away. proxyJiboStream and proxyJiboFetch were previously two byte-identical
// copies of this; only the log line differed conceptually.
function pipeJiboMedia(uri, res, log) {
  const url = 'http://' + JIBO_HOST + ':' + JIBO_PORT + uri;
  if (log) console.log('[proxy] streaming:', url);
  const req = httpModule.get(url, (jiboRes) => {
    res.writeHead(jiboRes.statusCode, jiboRes.headers);
    jiboRes.pipe(res);
    res.on('close', () => req.destroy()); // stop pulling if the browser disconnects
  });
  req.on('error', (err) => {
    if (!res.headersSent) res.status(502).json({ error: err.message });
  });
}

// Long-lived stream (e.g. live video) — logged for visibility.
function proxyJiboStream(uri, res) {
  pipeJiboMedia(uri, res, true);
}

// One-shot media fetch (e.g. a photo) — same pipe, no log noise.
function proxyJiboFetch(uri, res) {
  pipeJiboMedia(uri, res, false);
}

// ── LLM proxy helper ──────────────────────────────────────────────────────────
/**
 * POST a JSON body to `urlStr` with the given extra headers and parse the
 * JSON response.
 *
 * Returns { promise, abort }: `promise` resolves with the parsed response
 * body (or rejects on network error / non-JSON payload); `abort()` destroys
 * the in-flight request, which rejects the promise with 'LLM cancelled'.
 */
function httpPost(urlStr, reqHeaders, body) {
  let abort = function() {};
  const promise = new Promise(function(resolve, reject) {
    const u = new URL(urlStr);
    const mod = u.protocol === 'https:' ? https : httpModule;
    const payload = JSON.stringify(body);
    const req = mod.request({
      hostname: u.hostname,
      port: u.port || (u.protocol === 'https:' ? 443 : 80),
      path: u.pathname + u.search,
      method: 'POST',
      headers: Object.assign({
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(payload)
      }, reqHeaders),
    }, function(res) {
      let data = '';
      res.on('data', function(d) { data += d; });
      res.on('end', function() {
        try {
          resolve(JSON.parse(data));
        } catch (e) {
          reject(new Error('LLM non-JSON response: ' + data.slice(0, 300)));
        }
      });
    });
    // The executor runs synchronously, so `abort` is rebound before
    // httpPost returns.
    abort = function() { req.destroy(new Error('LLM cancelled')); };
    req.on('error', reject);
    req.write(payload);
    req.end();
  });
  return { promise, abort };
}

// ── Express app ───────────────────────────────────────────────────────────────
const app = express();
app.use(express.json());
app.use(express.static(path.join(__dirname, 'public')));
app.use('/photos', express.static(PHOTOS_DIR));

// ── REST API ──────────────────────────────────────────────────────────────────
// Head motion — fire-and-forget; browser doesn't use returned txIds for look ops.
// Absolute head angle; optional tracking. Responds immediately.
app.post('/api/look/angle', function(req, res) {
  const { theta = 0, psi = 0, track = false } = req.body;
  jibo.behavior.lookAtAngle(parseFloat(theta), parseFloat(psi), { track: !!track });
  broadcastStatus();
  res.json({ ok: true });
});

app.post('/api/look/screen', function(req, res) {
  const { x, y, track = false } = req.body;
  // For non-tracking: use manager method. For tracking: pass through to raw LookAt.
  if (track) {
    jibo._conn.lookAt({ ScreenCoords: [parseFloat(x), parseFloat(y)] }, true);
  } else {
    jibo.behavior.lookAtScreen(parseFloat(x), parseFloat(y));
  }
  res.json({ ok: true });
});

// Blocking step: awaits onLookAtAchieved so the arrow-key loop stays single-flight.
app.post('/api/look/step', async function(req, res) {
  const { x, y } = req.body;
  try {
    await jibo.behavior.lookAtScreen(parseFloat(x), parseFloat(y));
    res.json({ ok: true });
  } catch (err) {
    // Express 4 does not route async handler rejections to its error
    // middleware — without this catch a failed look would leave the request
    // hanging and surface as an unhandled rejection.
    res.status(502).json({ error: err.message });
  }
});

// Look at a 3D point in robot space (z defaults to 500 mm ahead).
app.post('/api/look/position', function(req, res) {
  const { x = 0, y = 0, z = 500 } = req.body;
  jibo.behavior.lookAtPosition(parseFloat(x), parseFloat(y), parseFloat(z));
  res.json({ ok: true });
});

// Look at (and by default track) a perceived entity.
app.post('/api/look/entity', function(req, res) {
  const { entityId, track = true } = req.body;
  jibo.behavior.lookAtEntity(entityId, !!track);
  res.json({ ok: true });
});

// Relative head nudge; returns the resulting angles for the UI.
app.post('/api/look/nudge', function(req, res) {
  const { dTheta = 0, dPsi = 0 } = req.body;
  jibo.behavior.nudge(parseFloat(dTheta), parseFloat(dPsi));
  broadcastStatus();
  res.json({ ok: true, angles: jibo.currentAngles });
});

// Say — awaits full speech completion; AbortController enables mid-speech cancel.
// Speak `text` (after stripping leaked chain-of-thought); responds only when
// speech finishes or is aborted. A new request pre-empts the previous one.
app.post('/api/say', async function(req, res) {
  const { text } = req.body;
  if (!text) return res.status(400).json({ error: 'text required' });
  if (activeSayAbort) { activeSayAbort(); activeSayAbort = null; }
  const controller = new AbortController();
  const myAbort = () => controller.abort();
  activeSayAbort = myAbort;
  try {
    await jibo.behavior.say(stripThinking(text), { signal: controller.signal });
  } catch (err) {
    if (err.code !== 'SAY_TIMEOUT') console.error('[say]', err.message);
  }
  // Only release our own handle: a newer /api/say may have replaced it while
  // we were awaiting, and unconditionally nulling the slot here would make
  // that newer speech uncancellable.
  if (activeSayAbort === myAbort) activeSayAbort = null;
  res.json({ aborted: controller.signal.aborted });
});

// Listen — fires locally, returns txId immediately so the browser can correlate
// the onListenResult / onStop events it receives over WebSocket.
app.post('/api/listen', function(req, res) {
  const { maxSpeech = 10000, maxNoSpeech = 5000 } = req.body;
  const txId = jibo._conn.listenLocalASR(maxNoSpeech, maxSpeech);
  activeListenTxId = txId;
  res.json({ txId });
});

// Camera
app.post('/api/photo', function(req, res) {
  const { camera = 'Right', resolution = 'HighRes' } = req.body;
  // Fire-and-forget; onTakePhoto event is forwarded to browser via _conn event listener.
  const txId = jibo._conn.takePhoto(camera, resolution);
  res.json({ txId });
});

app.post('/api/video/start', function(req, res) {
  // Fire-and-forget; browser receives onVideoReady via WebSocket event broadcast.
  const txId = jibo._conn.startVideo();
  res.json({ txId });
});

app.post('/api/video/stop', function(req, res) {
  jibo.camera.stopVideo();
  res.json({ ok: true });
});

// Display
app.post('/api/display/eye', function(req, res) {
  jibo.display.showEye();
  res.json({ ok: true });
});

app.post('/api/display/anim', function(req, res) {
  const { name } = req.body;
  if (!name) return res.status(400).json({ error: 'name required' });
  jibo._conn.playAnim(name); // fire-and-forget; awaiting would hold the response open
  res.json({ ok: true });
});

app.post('/api/display/text', function(req, res) {
  const { text } = req.body;
  if (!text) return res.status(400).json({ error: 'text required' });
  jibo.display.showText(text);
  res.json({ ok: true });
});

app.post('/api/display/image', function(req, res) {
  const { src } = req.body;
  if (!src) return res.status(400).json({ error: 'src required' });
  jibo.display.showImage(src);
  res.json({ ok: true });
});

// Attention & volume — fire-and-forget is fine for these control ops.
// Switch the robot's attention mode (mode string passed straight through).
app.post('/api/attention', (req, res) => {
  const mode = req.body.mode;
  if (!mode) {
    return res.status(400).json({ error: 'mode required' });
  }
  jibo.behavior.setAttention(mode);
  res.json({ ok: true });
});

// Set output volume. Loose null check on purpose: 0 is a legal level.
app.post('/api/volume', (req, res) => {
  const level = req.body.level;
  if (level == null) {
    return res.status(400).json({ error: 'level required' });
  }
  jibo.audio.setVolume(parseFloat(level));
  res.json({ ok: true });
});

// Cancel an arbitrary transaction by id.
app.post('/api/cancel', (req, res) => {
  const txId = req.body.txId;
  if (!txId) {
    return res.status(400).json({ error: 'txId required' });
  }
  jibo._conn.cancel(txId);
  res.json({ ok: true });
});

// Abort the current speech, if any.
app.post('/api/say/cancel', (req, res) => {
  const abortSay = activeSayAbort;
  if (abortSay) {
    activeSayAbort = null;
    abortSay();
  }
  res.json({ ok: true });
});

// Cancel the active listen transaction, if any.
app.post('/api/listen/cancel', (req, res) => {
  const txId = activeListenTxId;
  if (txId) {
    activeListenTxId = null;
    jibo._conn.cancel(txId);
  }
  res.json({ ok: true });
});

// Abort the in-flight LLM request, if any.
app.post('/api/llm/cancel', (req, res) => {
  const abortLlm = activeLlmAbort;
  if (abortLlm) {
    activeLlmAbort = null;
    abortLlm();
  }
  res.json({ ok: true });
});

// Interrupt all active operations (used by the hotword override).
// Stop everything at once: speech, LLM request, and listen.
app.post('/api/interrupt', function(req, res) {
  if (activeSayAbort) { activeSayAbort(); activeSayAbort = null; }
  if (activeLlmAbort) { activeLlmAbort(); activeLlmAbort = null; }
  if (activeListenTxId) { jibo._conn.cancel(activeListenTxId); activeListenTxId = null; }
  res.json({ ok: true });
});

// Config / status
app.get('/api/config', function(req, res) {
  res.json({
    llmEndpoint: process.env.LLM_ENDPOINT || '',
    llmModel: process.env.LLM_MODEL || '',
    llmSystemPrompt: LLM_SYSTEM_PROMPT || '',
    sessionMode: !!process.env.LLM_SESSION_KEY,
  });
});

app.get('/api/status', function(req, res) {
  res.json({
    connected: jibo.connected,
    sessionID: jibo.sessionID,
    angles: jibo.currentAngles,
    videoStreamActive: jibo.videoStreamActive,
  });
});

// LLM chat proxy — prepends the system prompt, forwards to an OpenAI-style
// chat-completions endpoint, and returns the reply text.
app.post('/api/llm/chat', async function(req, res) {
  const { messages = [], endpoint, model, systemPrompt } = req.body;
  const url = endpoint || process.env.LLM_ENDPOINT || 'http://localhost:11434/v1/chat/completions';
  const mdl = model || process.env.LLM_MODEL || 'llama3';
  const sysProm = systemPrompt || LLM_SYSTEM_PROMPT || '';
  const apiKey = process.env.LLM_API_KEY || '';
  const sessionKey = process.env.LLM_SESSION_KEY || '';
  const allMessages = sysProm
    ? [{ role: 'system', content: sysProm }].concat(messages)
    : messages;
  const headers = {};
  if (apiKey) headers['Authorization'] = 'Bearer ' + apiKey;
  try {
    const extra = process.env.LLM_HEADERS ? JSON.parse(process.env.LLM_HEADERS) : {};
    Object.assign(headers, extra);
  } catch (e) {
    console.warn('[llm] LLM_HEADERS is not valid JSON — ignored');
  }
  const body = { model: mdl, messages: allMessages, stream: false };
  if (sessionKey) body.user = sessionKey;
  // Pre-empt any request already in flight.
  if (activeLlmAbort) activeLlmAbort();
  const { promise, abort } = httpPost(url, headers, body);
  activeLlmAbort = abort;
  try {
    const result = await promise;
    // Only release our own handle: if a newer chat replaced it while we were
    // awaiting (or the pre-empted request settles late), unconditionally
    // nulling the slot here would make the newer request uncancellable.
    if (activeLlmAbort === abort) activeLlmAbort = null;
    const reply = (result.choices && result.choices[0] &&
      result.choices[0].message && result.choices[0].message.content || '').trim();
    res.json({ reply, sessionMode: !!sessionKey });
  } catch (err) {
    if (activeLlmAbort === abort) activeLlmAbort = null;
    if (err.message === 'LLM cancelled') return res.json({ error: 'cancelled' });
    console.error('[llm] error:', err.message);
    res.status(502).json({ error: err.message });
  }
});

// Proxy routes for browser → Jibo media. The typeof guard matters: Express
// yields an array for repeated query params (?uri=a&uri=b), and calling
// .startsWith on an array would throw.
app.get('/proxy/stream', function(req, res) {
  const { uri } = req.query;
  if (typeof uri !== 'string' || !uri.startsWith('/')) return res.status(400).json({ error: 'invalid uri' });
  proxyJiboStream(uri, res);
});

app.get('/proxy/photo', function(req, res) {
  const { uri } = req.query;
  if (typeof uri !== 'string' || !uri.startsWith('/')) return res.status(400).json({ error: 'invalid uri' });
  proxyJiboFetch(uri, res);
});

// ── HTTP + WebSocket server ───────────────────────────────────────────────────
const server = http.createServer(app);
const wss = new WebSocketServer({ server, path: '/ws' });

wss.on('connection', function(ws) {
  subscribers.add(ws);
  // Greet each browser with the current status snapshot.
  ws.send(JSON.stringify({
    type: 'status',
    connected: jibo.connected,
    sessionID: jibo.sessionID,
    angles: jibo.currentAngles,
  }));
  ws.on('close', function() { subscribers.delete(ws); });
  ws.on('error', function() { subscribers.delete(ws); });
});

server.listen(APP_PORT, function() {
  console.log('Re-Commander-3 running at http://localhost:' + APP_PORT);
  jibo.audio.watchWakeword();
  jibo.connect().catch(function(err) {
    console.error('[jibo] connect error:', err.message);
  });
});