Files
Re-Commander/server.js

585 lines
23 KiB
JavaScript
Raw Permalink Normal View History

'use strict';
const express = require('express');
const http = require('http');
const https = require('https');
const { WebSocketServer, WebSocket } = require('ws');
const httpModule = require('http');
const path = require('path');
const fs = require('fs');
require('dotenv').config();
const { Client } = require('rom-control');
// Robot address. The defaults are the values this file always used; set
// JIBO_HOST / JIBO_PORT in the environment (or in .env, which is loaded above)
// to point the server at a different robot without editing the source.
const JIBO_HOST = process.env.JIBO_HOST || '192.168.1.217';
// Falls back to 8160 when JIBO_PORT is unset, non-numeric, or 0.
const JIBO_PORT = Number.parseInt(process.env.JIBO_PORT, 10) || 8160;
// Port this web application listens on.
const APP_PORT = process.env.PORT || 3000;
// Default system prompt for the LLM. /api/llm/chat prepends it as the `system`
// message unless the request supplies its own systemPrompt, and /api/config
// returns it to the browser. The text is sent verbatim, so every character
// inside the template literal is runtime data, not documentation.
const LLM_SYSTEM_PROMPT = `You are Jibo, a small expressive home robot. Every reply MUST be written in ESML
(Embodied Speech Markup Language). ESML is an XML dialect that simultaneously
drives Jibo's body animations, screen graphics, audio effects, and TTS voice.
Respond ONLY with the final spoken output annotated with ESML tags.
No reasoning, no <think> blocks, no preamble only what Jibo will say and do.
== ANIMATION TAGS ==
Use <anim> for body/screen animations from Jibo's built-in library (preferred).
Use <es> when you also need to blend in SSA or SFX in the same tag.
Blocking (Jibo freezes speech while it plays, resumes after):
<anim cat='CATEGORY'/> following text here
<anim name='AnimName'/> following text here
Bounded non-blocking (animation duration stretches to match the enclosed text):
<anim cat='CATEGORY'>text spoken during animation</anim>
Unbounded non-blocking (animation plays at native length alongside text that follows):
<anim cat='CATEGORY' nonBlocking='true'/> text spoken at the same time
Common attributes:
cat='CATEGORY' select animation by emotional category (preferred)
name='AnimName' select exact animation by its library name
nonBlocking='true' play alongside TTS instead of blocking it
endNeutral='true' snap back to neutral pose when done (use this by default)
loop='0' repeat to fill bounded duration (bounded mode only)
loop='N' repeat N times (unbounded mode only)
filter='!ssa-only' exclude audio-only animations from the category pick
layers='!screen' use only body layer (drop screen graphics)
Animation categories (cat= values):
affection confused dance embarrassed excited frustrated
happy laughing no proud relieved sad scared surprised worried yes
== EMOJIS (Screen Graphics) ==
Use <anim> with the emoji category and specific filters to display a graphic on Jibo's screen.
Always use nonBlocking='true' for emojis.
Syntax: <anim cat='emoji' filter='!(hf), &(EMOJI_NAME)' nonBlocking='true' />
Available EMOJIS (EMOJI_NAME):
airplane basketball beach car disco-spin football soccer trophy
music question-mark star beer cake cheese drumstick coffee fork
fish groceries burger hotdog icecream pizza wine christmas-tree
fireworks halloween hanukkah thanksgiving clover valentines chocolate
bicycle cat laptop dog gift house laundry lightbulb money popcorn
party phone robot sunglasses toilet-paper trash umbrella video-game
bird cow earth flower lightning-bolt moon mountain mouse penguin
pig bunny rainbow baby heart
== DANCES ==
Use <anim> with the dance category to make Jibo dance. You can choose to include music or not.
Syntax (with music): <anim cat='dance' filter='music, DANCE_NAME'/>
Syntax (without music): <anim cat='dance' filter='!(music), &(DANCE_NAME)'/>
Available DANCES (DANCE_NAME):
rom-upbeat rom-ballroom rom-silly rom-slowdance rom-eletronic rom-twerk
== SSA (Semi-Speech Audio - emotional vocal sounds) ==
Always self-closing. Play before, after, or between sentences; never inside <anim>.
<ssa cat='happy'/> <ssa cat='laughing'/> <ssa cat='surprised'/>
<ssa cat='confused'/> <ssa cat='sad'/> <ssa cat='scared'/>
<ssa cat='affection'/> <ssa cat='proud'/> <ssa cat='embarrassed'/>
<ssa cat='frustrated'/> <ssa cat='worried'/> <ssa cat='thinking'/>
<ssa cat='dontknow'/> <ssa cat='oops'/> <ssa cat='question'/>
<ssa cat='yawn'/> <ssa cat='hello'/> <ssa cat='goodbye'/>
<ssa cat='disgusted'/> <ssa cat='no'/> <ssa cat='confirm'/>
== SFX (Sound effects) ==
Always self-closing. Good for punctuating facts, transitions, or reactions.
<sfx cat='blip'/> <sfx cat='sparkles'/> <sfx cat='whoosh'/>
<sfx cat='heart'/> <sfx cat='party'/> <sfx cat='lightbulb'/>
<sfx cat='bird'/> <sfx cat='dog'/> <sfx cat='drumroll'/>
<sfx cat='sunshine'/> <sfx cat='scanner'/> <sfx cat='egg'/>
<sfx cat='frying'/>
== VOICE / SPEECH TAGS ==
Pause: <break size='0.5'/> (length in seconds)
Style: <style set='enthusiastic'>text</style>
Styles: neutral enthusiastic sheepish confused confident
Pitch: <pitch halftone='-5'>text</pitch> (semitones from baseline)
<pitch mult='1.2'>text</pitch> (pitch multiplier)
<pitch add='200'>text</pitch> (Hz offset)
<pitch band='1.2'>text</pitch> (vibrance/bandwidth)
Duration: <duration stretch='1.5'>text</duration> (>1 = slower, <1 = faster)
<duration set='1.0'>text</duration> (exact duration in seconds)
Spell: <say-as spell='NASA'/> (spells each letter)
Phoneme: <phoneme ph='b aa1 n ou0'>Bono</phoneme>
== RULES ==
1. ALWAYS use ESML. Plain text is valid ESML - but add tags whenever they make
Jibo more expressive and natural.
2. Keep total response length SHORT: one or two sentences maximum.
3. Opening animations set the emotional tone before speech:
<anim cat='excited' nonBlocking='true' endNeutral='true'/> Oh, cool!
4. Bounded animations sync motion to the most important words:
I <anim cat='affection'>really love that idea!</anim>
5. Use <ssa> for non-verbal emotional sounds (gasps, laughs, hums).
6. Use <style> to match register to emotion without changing the words.
7. Self-closing tags MUST end with /> Paired tags MUST have a matching </tag>.
8. Do NOT nest anim/ssa/sfx inside each other.
9. Do NOT emit <think> blocks, chain-of-thought, or any non-spoken content.
10. Your final response should be no longer than **500** characters. Any more and it will cause the application to throw an error.
11. No ASCII/Unicode emojis - must be valid ESML.
== EXAMPLES ==
User: "Tell me a joke."
<anim cat='excited' nonBlocking='true' endNeutral='true'/> Why don't scientists trust atoms? <break size='0.6'/> <ssa cat='laughing'/> Because they make up everything!
User: "I'm feeling sad today."
<anim cat='affection'>I'm really sorry to hear that.</anim> <break size='0.3'/> <style set='sheepish'>Do you want to talk about it?</style>
User: "What's 2 plus 2?"
<sfx cat='blip'/> That's 4! <anim cat='proud' nonBlocking='true' endNeutral='true'/> Easy one.
User: "Wow, that's surprising!"
<ssa cat='surprised'/> <anim cat='surprised'>I know, right?!</anim>
User: "Do you like cats?"
<anim cat='emoji' filter='!(hf), &(cat)' nonBlocking='true' /> <anim cat='excited' nonBlocking='true' endNeutral='true'/> I love them!
User: "Show me a dance."
<anim cat='dance' filter='music, rom-upbeat'/> Watch these moves!`;
/**
 * Remove leaked LLM reasoning from a reply before it is spoken.
 *
 * Every <think>…</think> block is deleted (case-insensitively) and the result
 * is trimmed. If the first ESML tag then starts more than 80 characters in,
 * everything before that tag is treated as leaked preamble and dropped;
 * a shorter lead-in is kept as ordinary spoken text.
 *
 * @param {string} text - raw model output
 * @returns {string} the cleaned, trimmed text
 */
function stripThinking(text) {
  const withoutThinkBlocks = text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
  const firstTagAt = withoutThinkBlocks.search(
    /<(anim|ssa|sfx|break|style|pitch|duration|say-as|phoneme|es)\b/i
  );
  return firstTagAt > 80
    ? withoutThinkBlocks.slice(firstTagAt).trim()
    : withoutThinkBlocks;
}
// ── Active operation handles ──────────────────────────────────────────────────
// Function that aborts the in-flight /api/say request, or null when none is active.
let activeSayAbort = null;
// Function that aborts the in-flight /api/llm/chat upstream request, or null.
let activeLlmAbort = null;
// Transaction id returned by the most recent listen request, or null once it
// has finished, errored, stopped, or been cancelled.
let activeListenTxId = null;
// Browser WebSocket connections that receive broadcast() messages; sockets are
// added on connection and removed on close/error.
const subscribers = new Set();
/**
 * Send a message to every subscribed browser socket that is currently open.
 *
 * @param {string|object} data - a ready-made string is sent as is; anything
 *   else is serialized with JSON.stringify first.
 */
function broadcast(data) {
  const payload = typeof data === 'string' ? data : JSON.stringify(data);
  subscribers.forEach((socket) => {
    if (socket.readyState !== WebSocket.OPEN) return;
    socket.send(payload);
  });
}
/**
 * Push the robot's current connection state, session id and head angles to
 * all browser subscribers as a `status` message.
 */
function broadcastStatus() {
  const { connected, sessionID, currentAngles } = jibo;
  const statusMessage = {
    type: 'status',
    connected,
    sessionID,
    angles: currentAngles,
  };
  broadcast(statusMessage);
}
// ── Client instance ───────────────────────────────────────────────────────────
// Single shared rom-control Client for the robot at JIBO_HOST:JIBO_PORT.
// The connection itself is opened later, from the server.listen callback.
// NOTE(review): the option meanings below are inferred from their names —
// confirm against the rom-control documentation.
const jibo = new Client({
host: JIBO_HOST,
port: JIBO_PORT,
autoReconnect: true,
reconnectDelay: 3000, // presumably milliseconds; the disconnect log says "3s"
autoHeartbeat: true,
heartbeatInterval: 9000, // presumably milliseconds
autoSubscribe: true,
});
// Lifecycle: log connection changes and push the new status to browsers.
function handleJiboReady() {
  console.log('[jibo] session started:', jibo.sessionID);
  broadcastStatus();
}

function handleJiboDisconnect() {
  console.log('[jibo] disconnected — reconnecting in 3s');
  broadcastStatus();
}

// Errors are only logged; no status broadcast is triggered here.
function handleJiboError(err) {
  console.error('[jibo] error:', err.message);
}

jibo.on('ready', handleJiboReady);
jibo.on('disconnect', handleJiboDisconnect);
jibo.on('error', handleJiboError);
// Forward every raw robot event to the browser clients. The listener is
// attached to the underlying connection (_conn) because the Client layer only
// exposes structured high-level events. The one exception is onTakePhoto:
// the photo is saved locally instead, and the browser is told about it later
// through an onPhotoSaved event.
jibo._conn.on('event', (txId, body) => {
  const photoUri = body && body.Event === 'onTakePhoto' ? body.URI : null;
  if (photoUri) {
    savePhoto(photoUri);
    return;
  }
  broadcast({ type: 'jiboEvent', txId, body });
});
// Forget the active listen transaction as soon as the robot reports a result,
// a stop, or an error for it, so later cancel calls do not target a finished
// transaction.
for (const eventName of ['onListenResult', 'onStop', 'onError']) {
  jibo._conn.on(eventName, (txId) => {
    if (txId !== activeListenTxId) return;
    activeListenTxId = null;
  });
}
// Hotword: the Client delivers a hotword event object; re-wrap it as an
// onHotWordHeard jiboEvent, which is the shape the browser code expects.
jibo.on('hotword', (hwEvent) => {
  const { utterance, score, timestamp } = hwEvent;
  const body = { Event: 'onHotWordHeard', utterance, score, timestamp };
  broadcast({ type: 'jiboEvent', txId: null, body });
});
// ── Photo saving ──────────────────────────────────────────────────────────────
// Directory (next to this file) where captured photos are written; it is also
// served statically under /photos.
const PHOTOS_DIR = path.join(__dirname, 'photos');
// Create it at startup; `recursive: true` makes this a no-op when it already exists.
fs.mkdirSync(PHOTOS_DIR, { recursive: true });
/**
 * Download a photo from the robot into PHOTOS_DIR and announce it to browsers.
 *
 * On success an `onPhotoSaved` jiboEvent carrying the public /photos URL is
 * broadcast. On any failure (download error or local write error) the write
 * stream is destroyed, the partial file is removed and the error is logged;
 * nothing is broadcast.
 *
 * @param {string} jiboUri - media URI reported by the robot's onTakePhoto event
 */
function savePhoto(jiboUri) {
  const filename = 'photo_' + Date.now() + '.jpg';
  const filepath = path.join(PHOTOS_DIR, filename);
  const file = fs.createWriteStream(filepath);
  let failed = false;

  // Single failure path so stream errors and download errors clean up the same
  // way, and only once.
  const fail = (err) => {
    if (failed) return;
    failed = true;
    file.destroy(); // release the file descriptor before removing the partial file
    fs.unlink(filepath, () => {});
    console.error('[photo] save failed:', err.message);
  };

  // Without a listener, an 'error' event on the stream (disk full, permissions)
  // would be thrown as an uncaught exception.
  file.on('error', fail);

  jibo._conn.fetchMediaStream(jiboUri, file)
    .then(() => {
      if (failed) return;
      console.log('[photo] saved:', filename);
      broadcast({
        type: 'jiboEvent',
        txId: null,
        body: { Event: 'onPhotoSaved', url: '/photos/' + filename, filename },
      });
    })
    .catch(fail);
}
// ── Video / photo proxy ───────────────────────────────────────────────────────
/**
 * Stream a robot media resource (e.g. the video feed) through to the browser.
 *
 * The robot's status code and headers are copied to the response and the body
 * is piped through. The upstream request is torn down as soon as the browser
 * connection closes, including before the robot has answered.
 *
 * @param {string} uri - path (starting with '/') on the robot's HTTP server
 * @param {object} res - Express response to write to
 */
function proxyJiboStream(uri, res) {
  const url = 'http://' + JIBO_HOST + ':' + JIBO_PORT + uri;
  console.log('[proxy] streaming:', url);
  let clientClosed = false;
  const req = httpModule.get(url, (jiboRes) => {
    res.writeHead(jiboRes.statusCode, jiboRes.headers);
    // pipe() does not end the destination when the source fails, so close the
    // browser response explicitly if the robot drops mid-stream.
    jiboRes.on('error', () => res.destroy());
    jiboRes.pipe(res);
  });
  // Registered up front so a disconnect before the robot responds still
  // releases the upstream request.
  res.on('close', () => {
    clientClosed = true;
    req.destroy();
  });
  req.on('error', (err) => {
    // Nothing can be reported once the client is gone or headers are out.
    if (clientClosed || res.headersSent) return;
    res.status(502).json({ error: err.message });
  });
}
/**
 * Fetch a single robot media resource (e.g. a photo) and relay it to the
 * browser, copying the robot's status code and headers.
 *
 * The upstream request is torn down as soon as the browser connection closes,
 * including before the robot has answered.
 *
 * @param {string} uri - path (starting with '/') on the robot's HTTP server
 * @param {object} res - Express response to write to
 */
function proxyJiboFetch(uri, res) {
  const url = 'http://' + JIBO_HOST + ':' + JIBO_PORT + uri;
  let clientClosed = false;
  const req = httpModule.get(url, (jiboRes) => {
    res.writeHead(jiboRes.statusCode, jiboRes.headers);
    // pipe() does not end the destination when the source fails, so close the
    // browser response explicitly if the robot drops mid-transfer.
    jiboRes.on('error', () => res.destroy());
    jiboRes.pipe(res);
  });
  // Registered up front so a disconnect before the robot responds still
  // releases the upstream request.
  res.on('close', () => {
    clientClosed = true;
    req.destroy();
  });
  req.on('error', (err) => {
    // Nothing can be reported once the client is gone or headers are out.
    if (clientClosed || res.headersSent) return;
    res.status(502).json({ error: err.message });
  });
}
// ── LLM proxy helper ──────────────────────────────────────────────────────────
/**
 * POST a JSON body to an HTTP(S) endpoint and parse the JSON reply.
 *
 * @param {string} urlStr - absolute http: or https: URL
 * @param {object} reqHeaders - extra request headers; they override the
 *   default Content-Type / Content-Length when the same key is given
 * @param {*} body - value serialized with JSON.stringify as the request body
 * @returns {{promise: Promise<*>, abort: function(): void}} `promise` resolves
 *   with the parsed response body. It rejects on a network error, a non-2xx
 *   status, a non-JSON body, or with Error('LLM cancelled') after `abort()`
 *   is called.
 */
function httpPost(urlStr, reqHeaders, body) {
  // Stays a no-op if the request could not even be created (e.g. invalid URL).
  let abort = () => {};
  const promise = new Promise((resolve, reject) => {
    const u = new URL(urlStr);
    const isHttps = u.protocol === 'https:';
    const mod = isHttps ? https : httpModule;
    const payload = JSON.stringify(body);
    const req = mod.request({
      hostname: u.hostname,
      port: u.port || (isHttps ? 443 : 80),
      path: u.pathname + u.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(payload),
        ...reqHeaders,
      },
    }, (res) => {
      // Collect raw chunks and decode once at the end: decoding chunk by chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const chunks = [];
      res.on('data', (d) => { chunks.push(Buffer.isBuffer(d) ? d : Buffer.from(d)); });
      res.on('error', reject);
      res.on('end', () => {
        const data = Buffer.concat(chunks).toString('utf8');
        const status = res.statusCode;
        // Report upstream HTTP failures instead of passing an error payload
        // off as a successful reply.
        if (status < 200 || status >= 300) {
          reject(new Error('LLM HTTP ' + status + ': ' + data.slice(0, 300)));
          return;
        }
        try {
          resolve(JSON.parse(data));
        } catch (e) {
          reject(new Error('LLM non-JSON response: ' + data.slice(0, 300)));
        }
      });
    });
    abort = () => { req.destroy(new Error('LLM cancelled')); };
    req.on('error', reject);
    req.write(payload);
    req.end();
  });
  return { promise, abort };
}
// ── Express app ───────────────────────────────────────────────────────────────
// Express application: JSON body parsing for the REST API, the browser UI
// served from ./public, and saved photos served under /photos.
const app = express();
// Parse JSON request bodies so the route handlers can read req.body.
app.use(express.json());
app.use(express.static(path.join(__dirname, 'public')));
app.use('/photos', express.static(PHOTOS_DIR));
// ── REST API ──────────────────────────────────────────────────────────────────
// Head motion. The command is issued without waiting for completion; the
// browser does not use returned txIds for look operations.
app.post('/api/look/angle', (req, res) => {
  const { theta = 0, psi = 0, track = false } = req.body;
  const thetaValue = Number.parseFloat(theta);
  const psiValue = Number.parseFloat(psi);
  jibo.behavior.lookAtAngle(thetaValue, psiValue, { track: Boolean(track) });
  broadcastStatus();
  res.json({ ok: true });
});
// Look at a screen coordinate. Responds immediately; the motion completes in
// the background. Requests without numeric x and y are rejected with 400
// rather than forwarding NaN to the robot.
app.post('/api/look/screen', (req, res) => {
  const { x, y, track = false } = req.body;
  const screenX = Number.parseFloat(x);
  const screenY = Number.parseFloat(y);
  if (!Number.isFinite(screenX) || !Number.isFinite(screenY)) {
    return res.status(400).json({ error: 'x and y required' });
  }
  if (track) {
    // Tracking goes through the raw LookAt call on the connection.
    jibo._conn.lookAt({ ScreenCoords: [screenX, screenY] }, true);
  } else {
    // lookAtScreen is awaited in /api/look/step, so treat its result as a
    // promise here and make sure a failed look cannot surface as an
    // unhandled rejection.
    Promise.resolve(jibo.behavior.lookAtScreen(screenX, screenY)).catch((err) => {
      console.error('[look/screen]', err.message);
    });
  }
  res.json({ ok: true });
});
// Blocking step: the response is held until the look completes so the
// browser's arrow-key loop stays single-flight. A failed look is reported as
// 502 instead of leaving the request hanging with an unhandled rejection, and
// requests without numeric x and y are rejected with 400.
app.post('/api/look/step', async (req, res) => {
  const { x, y } = req.body;
  const screenX = Number.parseFloat(x);
  const screenY = Number.parseFloat(y);
  if (!Number.isFinite(screenX) || !Number.isFinite(screenY)) {
    return res.status(400).json({ error: 'x and y required' });
  }
  try {
    await jibo.behavior.lookAtScreen(screenX, screenY);
    res.json({ ok: true });
  } catch (err) {
    console.error('[look/step]', err.message);
    res.status(502).json({ ok: false, error: err.message });
  }
});
// Look at a 3D position; missing coordinates default to x=0, y=0, z=500.
app.post('/api/look/position', (req, res) => {
  const { x = 0, y = 0, z = 500 } = req.body;
  const [px, py, pz] = [x, y, z].map((coord) => Number.parseFloat(coord));
  jibo.behavior.lookAtPosition(px, py, pz);
  res.json({ ok: true });
});
// Look at an entity by id; tracking is on unless the request turns it off.
app.post('/api/look/entity', (req, res) => {
  const { entityId, track = true } = req.body;
  const shouldTrack = Boolean(track);
  jibo.behavior.lookAtEntity(entityId, shouldTrack);
  res.json({ ok: true });
});
// Nudge the head by angle deltas, then report the angles the client now holds
// both over WebSocket and in the response.
app.post('/api/look/nudge', (req, res) => {
  const { dTheta = 0, dPsi = 0 } = req.body;
  const deltaTheta = Number.parseFloat(dTheta);
  const deltaPsi = Number.parseFloat(dPsi);
  jibo.behavior.nudge(deltaTheta, deltaPsi);
  broadcastStatus();
  res.json({ ok: true, angles: jibo.currentAngles });
});
// Say: holds the response until speech has finished (or failed / been
// aborted) and answers { aborted }. Starting a new say aborts the previous
// one; the AbortController lets /api/say/cancel and /api/interrupt stop
// speech mid-utterance.
app.post('/api/say', async (req, res) => {
  const { text } = req.body;
  // stripThinking() needs a string, so reject anything else up front.
  if (typeof text !== 'string' || !text) {
    return res.status(400).json({ error: 'text required' });
  }
  if (activeSayAbort) { activeSayAbort(); activeSayAbort = null; }
  const controller = new AbortController();
  const abortThisSay = () => controller.abort();
  activeSayAbort = abortThisSay;
  try {
    await jibo.behavior.say(stripThinking(text), { signal: controller.signal });
  } catch (err) {
    if (err.code !== 'SAY_TIMEOUT') console.error('[say]', err.message);
  } finally {
    // Only clear the handle if it is still ours. A newer /api/say may have
    // replaced it while this one was being aborted, and wiping it would make
    // the newer utterance impossible to cancel.
    if (activeSayAbort === abortThisSay) activeSayAbort = null;
  }
  res.json({ aborted: controller.signal.aborted });
});
// Listen: starts local speech recognition and answers with the txId straight
// away, so the browser can match it against the onListenResult / onStop events
// that arrive over WebSocket. Defaults: maxSpeech 10000, maxNoSpeech 5000.
app.post('/api/listen', (req, res) => {
  const { maxSpeech = 10000, maxNoSpeech = 5000 } = req.body;
  const listenTxId = jibo._conn.listenLocalASR(maxNoSpeech, maxSpeech);
  activeListenTxId = listenTxId;
  res.json({ txId: listenTxId });
});
// Camera: request a photo and return its txId without waiting. The resulting
// onTakePhoto event is picked up by the raw event listener on _conn.
app.post('/api/photo', (req, res) => {
  const { camera = 'Right', resolution = 'HighRes' } = req.body;
  const photoTxId = jibo._conn.takePhoto(camera, resolution);
  res.json({ txId: photoTxId });
});
// Start the video stream and return its txId without waiting; the browser
// learns the stream is ready from the onVideoReady event broadcast over
// WebSocket.
app.post('/api/video/start', (req, res) => {
  const videoTxId = jibo._conn.startVideo();
  res.json({ txId: videoTxId });
});
// Stop the video stream; the response does not wait for confirmation.
app.post('/api/video/stop', (req, res) => {
  jibo.camera.stopVideo();
  res.json({ ok: true });
});
// Display: switch the robot's screen to the eye.
app.post('/api/display/eye', (req, res) => {
  jibo.display.showEye();
  res.json({ ok: true });
});
// Play a named animation. The call is not awaited, because waiting for the
// animation would hold the HTTP response open.
app.post('/api/display/anim', (req, res) => {
  const { name } = req.body;
  if (!name) {
    res.status(400).json({ error: 'name required' });
    return;
  }
  jibo._conn.playAnim(name);
  res.json({ ok: true });
});
// Show text on the robot's screen; `text` is mandatory.
app.post('/api/display/text', (req, res) => {
  const { text } = req.body;
  if (!text) {
    res.status(400).json({ error: 'text required' });
    return;
  }
  jibo.display.showText(text);
  res.json({ ok: true });
});
// Show an image on the robot's screen; `src` is mandatory.
app.post('/api/display/image', (req, res) => {
  const { src } = req.body;
  if (!src) {
    res.status(400).json({ error: 'src required' });
    return;
  }
  jibo.display.showImage(src);
  res.json({ ok: true });
});
// Attention mode: a control operation that is sent without awaiting a result.
// `mode` is mandatory.
app.post('/api/attention', (req, res) => {
  const { mode } = req.body;
  if (!mode) {
    res.status(400).json({ error: 'mode required' });
    return;
  }
  jibo.behavior.setAttention(mode);
  res.json({ ok: true });
});
// Volume: `level` is mandatory, but 0 is a valid value, so only a missing
// (null / undefined) level is rejected.
app.post('/api/volume', (req, res) => {
  const { level } = req.body;
  if (level === null || level === undefined) {
    res.status(400).json({ error: 'level required' });
    return;
  }
  const volume = Number.parseFloat(level);
  jibo.audio.setVolume(volume);
  res.json({ ok: true });
});
// Cancel an arbitrary robot transaction by its txId.
app.post('/api/cancel', (req, res) => {
  const { txId } = req.body;
  if (!txId) {
    res.status(400).json({ error: 'txId required' });
    return;
  }
  jibo._conn.cancel(txId);
  res.json({ ok: true });
});
// Abort the in-flight say, if any. Always answers ok.
app.post('/api/say/cancel', (req, res) => {
  if (activeSayAbort) {
    activeSayAbort();
    activeSayAbort = null;
  }
  res.json({ ok: true });
});
// Cancel the active listen transaction, if any. Always answers ok.
app.post('/api/listen/cancel', (req, res) => {
  if (activeListenTxId) {
    jibo._conn.cancel(activeListenTxId);
    activeListenTxId = null;
  }
  res.json({ ok: true });
});
// Abort the in-flight LLM request, if any. Always answers ok.
app.post('/api/llm/cancel', (req, res) => {
  if (activeLlmAbort) {
    activeLlmAbort();
    activeLlmAbort = null;
  }
  res.json({ ok: true });
});
// Interrupt everything that is currently active (speech, LLM request, listen),
// in that order. Used by the hotword override.
app.post('/api/interrupt', (req, res) => {
  if (activeSayAbort) {
    activeSayAbort();
    activeSayAbort = null;
  }
  if (activeLlmAbort) {
    activeLlmAbort();
    activeLlmAbort = null;
  }
  if (activeListenTxId) {
    jibo._conn.cancel(activeListenTxId);
    activeListenTxId = null;
  }
  res.json({ ok: true });
});
// Config: the LLM settings the browser UI starts from. Unset environment
// values are reported as empty strings; sessionMode is true when
// LLM_SESSION_KEY is set.
app.get('/api/config', (req, res) => {
  const { LLM_ENDPOINT, LLM_MODEL, LLM_SESSION_KEY } = process.env;
  const config = {
    llmEndpoint: LLM_ENDPOINT || '',
    llmModel: LLM_MODEL || '',
    llmSystemPrompt: LLM_SYSTEM_PROMPT || '',
    sessionMode: Boolean(LLM_SESSION_KEY),
  };
  res.json(config);
});
// Status snapshot for polling clients: the same fields as the WebSocket
// status message, plus whether the video stream is active.
app.get('/api/status', (req, res) => {
  const { connected, sessionID, currentAngles, videoStreamActive } = jibo;
  res.json({
    connected,
    sessionID,
    angles: currentAngles,
    videoStreamActive,
  });
});
// LLM chat proxy: forwards the conversation to a chat-completions endpoint and
// answers { reply, sessionMode }, where reply is choices[0].message.content of
// the upstream response ('' when absent). Endpoint, model and system prompt
// come from the request, then the environment, then built-in defaults.
// Starting a new chat aborts the previous in-flight one.
// NOTE(review): `endpoint` is client-supplied, and LLM_API_KEY / LLM_HEADERS
// are attached to whatever URL it names — confirm this server is only
// reachable by trusted clients.
app.post('/api/llm/chat', async (req, res) => {
  const { messages = [], endpoint, model, systemPrompt } = req.body;
  if (!Array.isArray(messages)) {
    return res.status(400).json({ error: 'messages must be an array' });
  }
  const url = endpoint || process.env.LLM_ENDPOINT || 'http://localhost:11434/v1/chat/completions';
  const mdl = model || process.env.LLM_MODEL || 'llama3';
  const sysProm = systemPrompt || LLM_SYSTEM_PROMPT || '';
  const apiKey = process.env.LLM_API_KEY || '';
  const sessionKey = process.env.LLM_SESSION_KEY || '';
  const allMessages = sysProm
    ? [{ role: 'system', content: sysProm }, ...messages]
    : messages;

  const headers = {};
  if (apiKey) headers['Authorization'] = 'Bearer ' + apiKey;
  try {
    // Optional extra headers, supplied as a JSON object in LLM_HEADERS.
    const extra = process.env.LLM_HEADERS ? JSON.parse(process.env.LLM_HEADERS) : {};
    Object.assign(headers, extra);
  } catch (e) {
    console.warn('[llm] LLM_HEADERS is not valid JSON — ignored');
  }

  const body = { model: mdl, messages: allMessages, stream: false };
  if (sessionKey) body.user = sessionKey;

  if (activeLlmAbort) activeLlmAbort();
  const { promise, abort } = httpPost(url, headers, body);
  activeLlmAbort = abort;
  try {
    const result = await promise;
    const reply = (result?.choices?.[0]?.message?.content || '').trim();
    res.json({ reply, sessionMode: !!sessionKey });
  } catch (err) {
    if (err.message === 'LLM cancelled') return res.json({ error: 'cancelled' });
    console.error('[llm] error:', err.message);
    res.status(502).json({ error: err.message });
  } finally {
    // Only clear the handle if it is still ours. When a newer chat request
    // aborted this one it has already stored its own handle, and wiping that
    // would make the newer request impossible to cancel.
    if (activeLlmAbort === abort) activeLlmAbort = null;
  }
});
// Proxy routes for browser → Jibo media. The `uri` query parameter must be a
// path on the robot, i.e. start with '/'.
app.get('/proxy/stream', (req, res) => {
  const { uri } = req.query;
  if (!(uri && uri.startsWith('/'))) {
    res.status(400).json({ error: 'invalid uri' });
    return;
  }
  proxyJiboStream(uri, res);
});
// Relay a single photo from the robot; `uri` must be a path starting with '/'.
app.get('/proxy/photo', (req, res) => {
  const { uri } = req.query;
  if (!(uri && uri.startsWith('/'))) {
    res.status(400).json({ error: 'invalid uri' });
    return;
  }
  proxyJiboFetch(uri, res);
});
// ── HTTP + WebSocket server ───────────────────────────────────────────────────
// One HTTP server carries both the Express app and the WebSocket endpoint at /ws.
const server = http.createServer(app);
const wss = new WebSocketServer({ server, path: '/ws' });

// Each new browser socket is subscribed to broadcasts and immediately sent the
// current status; it is unsubscribed again when it closes or errors.
wss.on('connection', (ws) => {
  subscribers.add(ws);
  const { connected, sessionID, currentAngles } = jibo;
  const greeting = { type: 'status', connected, sessionID, angles: currentAngles };
  ws.send(JSON.stringify(greeting));
  const unsubscribe = () => {
    subscribers.delete(ws);
  };
  ws.on('close', unsubscribe);
  ws.on('error', unsubscribe);
});
// Start accepting browser connections. Once listening, register the wake-word
// watcher and open the robot connection; a failed connect is only logged.
server.listen(APP_PORT, () => {
  console.log(`Re-Commander-3 running at http://localhost:${APP_PORT}`);
  jibo.audio.watchWakeword();
  jibo.connect().catch((err) => {
    console.error('[jibo] connect error:', err.message);
  });
});